From 913873c13a2dffb7c4188c39b4eb188f912f523e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hl=C3=B6=C3=B0ver=20Sigur=C3=B0sson?= Date: Thu, 16 Nov 2023 12:20:24 +0100 Subject: [PATCH] feat: arweave specific randomx patch --- doc/design.md | 273 ++++++------------ doc/specs.md | 594 ++++++++++++++++++++------------------ src/aes_hash.cpp | 40 +-- src/asm/configuration.asm | 17 +- src/configuration.h | 24 +- src/tests/benchmark.cpp | 2 +- src/tests/tests.cpp | 6 +- 7 files changed, 433 insertions(+), 523 deletions(-) diff --git a/doc/design.md b/doc/design.md index 7a1b8ef2..676f117e 100644 --- a/doc/design.md +++ b/doc/design.md @@ -1,26 +1,27 @@ # RandomX design -To minimize the performance advantage of specialized hardware, a proof of work (PoW) algorithm must achieve *device binding* by targeting specific features of existing general-purpose hardware. This is a complex task because we have to target a large class of devices with different architectures from different manufacturers. + +To minimize the performance advantage of specialized hardware, a proof of work (PoW) algorithm must achieve _device binding_ by targeting specific features of existing general-purpose hardware. This is a complex task because we have to target a large class of devices with different architectures from different manufacturers. There are two distinct classes of general processing devices: central processing units (CPUs) and graphics processing units (GPUs). RandomX targets CPUs for the following reasons: -* CPUs, being less specialized devices, are more prevalent and widely accessible. A CPU-bound algorithm is more egalitarian and allows more participants to join the network. This is one of the goals stated in the original CryptoNote whitepaper [[1](https://cryptonote.org/whitepaper.pdf)]. -* A large common subset of native hardware instructions exists among different CPU architectures. The same cannot be said about GPUs. 
For example, there is no common integer multiplication instruction for NVIDIA and AMD GPUs [[2](https://github.com/ifdefelse/ProgPOW/issues/16)]. -* All major CPU instruction sets are well documented with multiple open source compilers available. In comparison, GPU instruction sets are usually proprietary and may require vendor specific closed-source drivers for maximum performance. +- CPUs, being less specialized devices, are more prevalent and widely accessible. A CPU-bound algorithm is more egalitarian and allows more participants to join the network. This is one of the goals stated in the original CryptoNote whitepaper [[1](https://cryptonote.org/whitepaper.pdf)]. +- A large common subset of native hardware instructions exists among different CPU architectures. The same cannot be said about GPUs. For example, there is no common integer multiplication instruction for NVIDIA and AMD GPUs [[2](https://github.com/ifdefelse/ProgPOW/issues/16)]. +- All major CPU instruction sets are well documented with multiple open source compilers available. In comparison, GPU instruction sets are usually proprietary and may require vendor specific closed-source drivers for maximum performance. ## 1. Design considerations -The most basic idea of a CPU-bound proof of work is that the "work" must be dynamic. This takes advantage of the fact that CPUs accept two kinds of inputs: *data* (the main input) and *code* (which specifies what to perform with the data). +The most basic idea of a CPU-bound proof of work is that the "work" must be dynamic. This takes advantage of the fact that CPUs accept two kinds of inputs: _data_ (the main input) and _code_ (which specifies what to perform with the data). 
-Conversely, typical cryptographic hashing functions [[3](https://en.wikipedia.org/wiki/Cryptographic_hash_function)] do not represent suitable work for the CPU because their only input is *data*, while the sequence of operations is fixed and can be performed more efficiently by a specialized integrated circuit. +Conversely, typical cryptographic hashing functions [[3](https://en.wikipedia.org/wiki/Cryptographic_hash_function)] do not represent suitable work for the CPU because their only input is _data_, while the sequence of operations is fixed and can be performed more efficiently by a specialized integrated circuit. ### 1.1 Dynamic proof of work A dynamic proof of work algorithm can generally consist of the following 4 steps: -1) Generate a random program. -2) Translate it into the native machine code of the CPU. -3) Execute the program. -4) Transform the output of the program into a cryptographically secure value. +1. Generate a random program. +2. Translate it into the native machine code of the CPU. +3. Execute the program. +4. Transform the output of the program into a cryptographically secure value. The actual 'useful' CPU-bound work is performed in step 3, so the algorithm must be tuned to minimize the overhead of the remaining steps. @@ -28,10 +29,10 @@ The actual 'useful' CPU-bound work is performed in step 3, so the algorithm must Early attempts at a dynamic proof of work design were based on generating a program in a high-level language, such as C or Javascript [[4](https://github.com/hyc/randprog), [5](https://github.com/tevador/RandomJS)]. However, this is very inefficient for two main reasons: -* High level languages have a complex syntax, so generating a valid program is relatively slow since it requires the creation of an abstract syntax tree (ASL). -* Once the source code of the program is generated, the compiler will generally parse the textual representation back into the ASL, which makes the whole process of generating source code redundant. 
+- High level languages have a complex syntax, so generating a valid program is relatively slow since it requires the creation of an abstract syntax tree (ASL). +- Once the source code of the program is generated, the compiler will generally parse the textual representation back into the ASL, which makes the whole process of generating source code redundant. -The fastest way to generate a random program is to use a *logic-less* generator - simply filling a buffer with random data. This of course requires designing a syntaxless programming language (or instruction set) in which all random bit strings represent valid programs. +The fastest way to generate a random program is to use a _logic-less_ generator - simply filling a buffer with random data. This of course requires designing a syntaxless programming language (or instruction set) in which all random bit strings represent valid programs. #### 1.1.2 Translating the program into machine code @@ -41,16 +42,16 @@ This step is inevitable because we don't want to limit the algorithm to a specif The actual program execution should utilize as many CPU components as possible. 
Some of the features that should be utilized in the program are: -* multi-level caches (L1, L2, L3) -* μop cache [[6](https://en.wikipedia.org/wiki/CPU_cache#Micro-operation_(%CE%BCop_or_uop)_cache)] -* arithmetic logic unit (ALU) -* floating point unit (FPU) -* memory controller -* instruction level parallelism [[7](https://en.wikipedia.org/wiki/Instruction-level_parallelism)] -  * superscalar execution [[8](https://en.wikipedia.org/wiki/Superscalar_processor)] -  * out-of-order execution [[9](https://en.wikipedia.org/wiki/Out-of-order_execution)] -  * speculative execution [[10](https://en.wikipedia.org/wiki/Speculative_execution)] -  * register renaming [[11](https://en.wikipedia.org/wiki/Register_renaming)] +- multi-level caches (L1, L2, L3) +- μop cache [[6](https://en.wikipedia.org/wiki/CPU_cache#Micro-operation_(%CE%BCop_or_uop)_cache)] +- arithmetic logic unit (ALU) +- floating point unit (FPU) +- memory controller +- instruction level parallelism [[7](https://en.wikipedia.org/wiki/Instruction-level_parallelism)] +  - superscalar execution [[8](https://en.wikipedia.org/wiki/Superscalar_processor)] +  - out-of-order execution [[9](https://en.wikipedia.org/wiki/Out-of-order_execution)] +  - speculative execution [[10](https://en.wikipedia.org/wiki/Speculative_execution)] +  - register renaming [[11](https://en.wikipedia.org/wiki/Register_renaming)] Chapter 2 describes how the RandomX VM takes advantages of these features. @@ -67,7 +68,7 @@ When a random program is generated, one may choose to execute it only when it's 1. The runtime of randomly generated programs typically follows a log-normal distribution [[14](https://en.wikipedia.org/wiki/Log-normal_distribution)] (also see Appendix C). A generated program may be quickly analyzed and if it's likely to have above-average runtime, program execution may be skipped and a new program may be generated instead. This can significantly boost performance especially in case the runtime distribution has a heavy tail (many long-running outliers) and if program generation is cheap. 2. 
An implementation may choose to optimize for a subset of the features required for program execution. For example, the support for some operations (such as division) may be dropped or some instruction sequences may be implemented more efficiently. Generated programs would then be analyzed and be executed only if they match the specific requirements of the optimized implementation. -These strategies of searching for programs of particular properties deviate from the objectives of this proof of work, so they must be eliminated. This can be achieved by requiring a sequence of *N* random programs to be executed such that each program is generated from the output of the previous one. The output of the final program is then used as the result. +These strategies of searching for programs of particular properties deviate from the objectives of this proof of work, so they must be eliminated. This can be achieved by requiring a sequence of _N_ random programs to be executed such that each program is generated from the output of the previous one. The output of the final program is then used as the result. ``` +---------------+ +---------------+ +---------------+ +---------------+ @@ -89,10 +90,10 @@ Since the purpose of the proof of work is to be used in a trustless peer-to-peer Besides pure computational resources, such as ALUs and FPUs, CPUs usually have access to a large amount of memory in the form of DRAM [[16](https://en.wikipedia.org/wiki/Dynamic_random-access_memory)]. 
The performance of the memory subsystem is typically tuned to match the compute capabilities, for example [[17](https://en.wikipedia.org/wiki/Multi-channel_memory_architecture)]: -* single channel memory for embedded and low power CPUs -* dual channel memory for desktop CPUs -* triple or quad channel memory for workstation CPUs -* six or eight channel memory for high-end server CPUs +- single channel memory for embedded and low power CPUs +- dual channel memory for desktop CPUs +- triple or quad channel memory for workstation CPUs +- six or eight channel memory for high-end server CPUs In order to utilize the external memory as well as the on-chip memory controllers, the proof of work algorithm should access a large memory buffer (called the "Dataset"). The Dataset must be: @@ -115,14 +116,17 @@ Given the constraints described in the previous chapters, the maximum possible p Additionally, 256 MiB was selected as the maximum amount of memory that can be required in the light-client mode. This amount is acceptable even for small single-board computers such as the Raspberry Pi. To keep a constant memory-time product, the maximum fast-mode memory requirement is: + ``` 8 * 256 MiB = 2048 MiB ``` + This can be further increased since the light mode requires additional chip area for the SuperscalarHash function (see chapter 3.4 and chapter 6 of the Specification). Assuming a conservative estimate of 0.2 mm2 per SuperscalarHash core and DRAM density of 0.149 Gb/mm2 [[20](http://en.thelec.kr/news/articleView.html?idxno=20)], the additional memory is: ``` 8 * 0.2 * 0.149 * 1024 / 8 = 30.5 MiB ``` + or 32 MiB when rounded to the nearest power of 2. The total memory requirement of the fast mode can be 2080 MiB with a roughly constant AT product. ## 2. Virtual machine architecture @@ -141,9 +145,9 @@ The VM is a complex instruction set machine that allows both register and memory The program executed by the VM has the form of a loop consisting of 256 random instructions. 
-* 256 instructions is long enough to provide a large number of possible programs and enough space for branches. The number of different programs that can be generated is limited to 2512 = 1.3e+154, which is the number of possible seed values of the random generator. -* 256 instructions is short enough so that high-performance CPUs can execute one iteration in similar time it takes to fetch data from DRAM. This is advantageous because it allows Dataset accesses to be synchronized and fully prefetchable (see chapter 2.9). -* Since the program is a loop, it can take advantage of the μop cache [[6](https://en.wikipedia.org/wiki/CPU_cache#Micro-operation_(%CE%BCop_or_uop)_cache)] that is present in some x86 CPUs. Running a loop from the μop cache allows the CPU to power down the x86 instruction decoders, which should help to equalize the power efficiency between x86 and architectures with simple instruction decoding. +- 256 instructions is long enough to provide a large number of possible programs and enough space for branches. The number of different programs that can be generated is limited to 2512 = 1.3e+154, which is the number of possible seed values of the random generator. +- 256 instructions is short enough so that high-performance CPUs can execute one iteration in similar time it takes to fetch data from DRAM. This is advantageous because it allows Dataset accesses to be synchronized and fully prefetchable (see chapter 2.9). +- Since the program is a loop, it can take advantage of the μop cache [[6](https://en.wikipedia.org/wiki/CPU_cache#Micro-operation_(%CE%BCop_or_uop)_cache)] that is present in some x86 CPUs. Running a loop from the μop cache allows the CPU to power down the x86 instruction decoders, which should help to equalize the power efficiency between x86 and architectures with simple instruction decoding. 
### 2.3 Registers @@ -187,7 +191,7 @@ Approximate distribution of floating point register values at the end of each pr ![Imgur](https://i.imgur.com/64G4qE8.png) -*(Note: bins are marked by the left-side value of the interval, e.g. bin marked `1e-40` contains values from `1e-40` to `1e-20`.)* +_(Note: bins are marked by the left-side value of the interval, e.g. bin marked `1e-40` contains values from `1e-40` to `1e-20`.)_ The small number of F register values at `1e+14` is caused by the FSCAL instruction, which significantly increases the range of the register values. @@ -199,12 +203,13 @@ To maximize entropy and also to fit into one 64-byte cache line, floating point Modern CPUs invest a lot of die area and energy to handle branches. This includes: -* Branch predictor unit [[21](https://en.wikipedia.org/wiki/Branch_predictor)] -* Checkpoint/rollback states that allow the CPU to recover in case of a branch misprediction. +- Branch predictor unit [[21](https://en.wikipedia.org/wiki/Branch_predictor)] +- Checkpoint/rollback states that allow the CPU to recover in case of a branch misprediction. To take advantage of speculative designs, the random programs should contain branches. However, if branch prediction fails, the speculatively executed instructions are thrown away, which results in a certain amount of wasted energy with each misprediction. Therefore we should aim to minimize the number of mispredictions. Additionally, branches in the code are essential because they significantly reduce the amount of static optimizations that can be made. For example, consider the following x86 instruction sequence: + ```asm ... branch_target_00: @@ -215,6 +220,7 @@ branch_target_00: xor r8, r9 ... ``` + The XOR operations would normally cancel out, but cannot be optimized away due to the branch because the result will be different if the branch is taken. Similarly, the ISWAP_R instruction could be always statically optimized out if it wasn't for branches. 
In general, random branches must be designed in such way that: @@ -231,15 +237,15 @@ Unfortunately, we haven't found a way how to utilize branch prediction in Random RandomX therefore uses random branches with a jump probability of 1/256 and branch condition that depends on an integer register value. These branches will be predicted as "not taken" by the CPU. Such branches are "free" in most CPU designs unless they are taken. While this doesn't take advantage of the branch predictors, speculative designs will see a significant performance boost compared to non-speculative branch handling - see Appendix B for more information. -The branching conditions and jump targets are chosen in such way that infinite loops in RandomX code are impossible because the register controlling the branch will never be modified in the repeated code block. Each CBRANCH instruction can jump up to twice in a row. Handling CBRANCH using predicated execution [[22](https://en.wikipedia.org/wiki/Predication_(computer_architecture))] is impractical because the branch is not taken most of the time. +The branching conditions and jump targets are chosen in such way that infinite loops in RandomX code are impossible because the register controlling the branch will never be modified in the repeated code block. Each CBRANCH instruction can jump up to twice in a row. Handling CBRANCH using predicated execution [[22](https://en.wikipedia.org/wiki/Predication_(computer_architecture))] is impractical because the branch is not taken most of the time. ### 2.7 Instruction-level parallelism CPUs improve their performance using several techniques that utilize instruction-level parallelism of the executed code. These techniques include: -* Having multiple execution units that can execute operations in parallel (*superscalar execution*). -* Executing instruction not in program order, but in the order of operand availability (*out-of-order execution*). -* Predicting which way branches will go to enhance the benefits of both superscalar and out-of-order execution. 
+- Having multiple execution units that can execute operations in parallel (_superscalar execution_). +- Executing instruction not in program order, but in the order of operand availability (_out-of-order execution_). +- Predicting which way branches will go to enhance the benefits of both superscalar and out-of-order execution. RandomX benefits from all these optimizations. See Appendix B for a detailed analysis. @@ -251,11 +257,11 @@ The Scratchpad is used as read-write memory. Its size was selected to fit entire The Scratchpad is split into 3 levels to mimic the typical CPU cache hierarchy [[23](https://en.wikipedia.org/wiki/CPU_cache)]. Most VM instructions access "L1" and "L2" Scratchpad because L1 and L2 CPU caches are located close to the CPU execution units and provide the best random access latency. The ratio of reads from L1 and L2 is 3:1, which matches the inverse ratio of typical latencies (see table below). -|CPU μ-architecture|L1 latency|L2 latency|L3 latency|source| -|----------------|----------|----------|----------|------| -ARM Cortex A55|2|6|-|[[24](https://www.anandtech.com/show/11441/dynamiq-and-arms-new-cpus-cortex-a75-a55/4)] -|AMD Zen+|4|12|40|[[25](https://en.wikichip.org/wiki/amd/microarchitectures/zen%2B#Memory_Hierarchy)]| -|Intel Skylake|4|12|42|[[26](https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(client)#Memory_Hierarchy)] +| CPU μ-architecture | L1 latency | L2 latency | L3 latency | source | +| ------------------ | ---------- | ---------- | ---------- | ------------------------------------------------------------------------------------------------- | +| ARM Cortex A55 | 2 | 6 | - | [[24](https://www.anandtech.com/show/11441/dynamiq-and-arms-new-cpus-cortex-a75-a55/4)] | +| AMD Zen+ | 4 | 12 | 40 | [[25](https://en.wikichip.org/wiki/amd/microarchitectures/zen%2B#Memory_Hierarchy)] | +| Intel Skylake | 4 | 12 | 42 | [[26](https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(client)#Memory_Hierarchy)] | The L3 cache is much larger and located further from the CPU core. 
As a result, its access latencies are much higher and can cause stalls in program execution. @@ -280,8 +286,8 @@ See Appendix D for the analysis of Scratchpad entropy. Programs make, on average, 39 reads (instructions IADD_M, ISUB_M, IMUL_M, IMULH_M, ISMULH_M, IXOR_M, FADD_M, FSUB_M, FDIV_M) and 16 writes (instruction ISTORE) to the Scratchpad per program iteration. Additional 128 bytes are read and written implicitly to initialize and store register values. 64 bytes of data is read from the Dataset per iteration. In total: -* The average amount of data read from memory per program iteration is: 39 * 8 + 128 + 64 = **504 bytes**. -* The average mount of data written to memory per program iteration is: 16 * 8 + 128 = **256 bytes**. +- The average amount of data read from memory per program iteration is: 39 \* 8 + 128 + 64 = **504 bytes**. +- The average mount of data written to memory per program iteration is: 16 \* 8 + 128 = **256 bytes**. This is close to a 2:1 read/write ratio, which CPUs are optimized for. @@ -305,30 +311,23 @@ Using less than 256 MiB of memory is not possible due to the use of tradeoff-res ### 3.1 AesGenerator1R -AesGenerator1R was designed for the fastest possible generation of pseudorandom data to fill the Scratchpad. It takes advantage of hardware accelerated AES in modern CPUs. Only one AES round is performed per 16 bytes of output, which results in throughput exceeding 20 GB/s in most modern CPUs. - -AesGenerator1R gives a good output distribution provided that it's initialized with a sufficiently 'random' initial state (see Appendix F). +AesGenerator1R was designed for the fastest possible generation of pseudorandom data to fill the Scratchpad. It takes advantage of hardware accelerated AES in modern CPUs. Only one AES round is performed per 16 bytes of output, which results in throughput exceeding 20 GB/s in most modern CPUs. 
While 1 AES round is not sufficient for a good distribution of random values, this is not an issue because the purpose is just to initialize the Scratchpad with random non-zero data. ### 3.2 AesGenerator4R -AesGenerator4R uses 4 AES rounds to generate pseudorandom data for Program Buffer initialization. Since 2 AES rounds are sufficient for full avalanche of all input bits [[28](https://csrc.nist.gov/csrc/media/projects/cryptographic-standards-and-guidelines/documents/aes-development/rijndael-ammended.pdf)], AesGenerator4R has excellent statistical properties (see Appendix F) while maintaining very good performance. +AesGenerator4R uses 4 AES rounds to generate pseudorandom data for Program Buffer initialization. Since 2 AES rounds are sufficient for full avalanche of all input bits [[28](https://csrc.nist.gov/csrc/media/projects/cryptographic-standards-and-guidelines/documents/aes-development/rijndael-ammended.pdf)], AesGenerator4R provides an excellent output distribution while maintaining very good performance. The reversible nature of this generator is not an issue since the generator state is always initialized using the output of a non-reversible hashing function (Blake2b). ### 3.3 AesHash1R -AesHash was designed for the fastest possible calculation of the Scratchpad fingerprint. It interprets the Scratchpad as a set of AES round keys, so it's equivalent to AES encryption with 32768 rounds. Two extra rounds are performed at the end to ensure avalanche of all Scratchpad bits in each lane. - -The reversible nature of AesHash1R is not a problem for two main reasons: - -* It is not possible to directly control the input of AesHash1R. -* The output of AesHash1R is passed into the Blake2b hashing function, which is not reversible. +AesHash was designed for the fastest possible calculation of the Scratchpad fingerprint. It interprets the Scratchpad as a set of AES round keys, so it's equivalent to AES encryption with 32768 rounds. 
Two extra rounds are performed at the end to ensure avalanche of all Scratchpad bits in each lane. The output of the AesHash is fed into the Blake2b hashing function to calculate the final PoW hash. ### 3.4 SuperscalarHash SuperscalarHash was designed to burn as much power as possible while the CPU is waiting for data to be loaded from DRAM. The target latency of 170 cycles corresponds to the usual DRAM latency of 40-80 ns and clock frequency of 2-4 GHz. ASIC devices designed for light-mode mining with low-latency memory will be bottlenecked by SuperscalarHash when calculating Dataset items and their efficiency will be destroyed by the high power usage of SuperscalarHash. -The average SuperscalarHash function contains a total of 450 instructions, out of which 155 are 64-bit multiplications. On average, the longest dependency chain is 95 instructions long. An ASIC design for light-mode mining, with 256 MiB of on-die memory and 1-cycle latency for all operations, will need on average 95 * 8 = 760 cycles to construct a Dataset item, assuming unlimited parallelization. It will have to execute 155 * 8 = 1240 64-bit multiplications per item, which will consume energy comparable to loading 64 bytes from DRAM. +The average SuperscalarHash function contains a total of 450 instructions, out of which 155 are 64-bit multiplications. On average, the longest dependency chain is 95 instructions long. An ASIC design for light-mode mining, with 256 MiB of on-die memory and 1-cycle latency for all operations, will need on average 95 \* 8 = 760 cycles to construct a Dataset item, assuming unlimited parallelization. It will have to execute 155 \* 8 = 1240 64-bit multiplications per item, which will consume energy comparable to loading 64 bytes from DRAM. ## Appendix ### A. The effect of chaining VM executions Chapter 1.2 describes why `N` random programs are chained to prevent mining strategies that search for 'easy' programs. 
RandomX uses a value of `N = 8`. -Let's define `Q` as the ratio of acceptable programs in a strategy that uses filtering. For example `Q = 0.75` means that 25% of programs are rejected. +Let's define `Q` as the ratio of acceptable programs in a strategy that uses filtering. For example `Q = 0.75` means that 25% of programs are rejected. For `N = 1`, there are no wasted program executions and the only cost is program generation and the filtering itself. The calculations below assume that these costs are zero and the only real cost is program execution. However, this is a simplification because program generation in RandomX is not free (the first program generation requires full Scratchpad initialization), but it describes a best-case scenario for an attacker. +For `N > 1`, the first program can be filtered as usual, but after the program is executed, there is a chance of `1-Q` that the next program should be rejected and we have wasted one program execution. - For `N > 1`, the first program can be filtered as usual, but after the program is executed, there is a chance of `1-Q` that the next program should be rejected and we have wasted one program execution. - -For `N` chained executions, the chance is only QN that all programs in the chain are acceptable. However, during each attempt to find such chain, we will waste the execution of some programs. For `N = 8`, the number of wasted programs per attempt is equal to (1-Q)*(1+2\*Q+3\*Q2+4\*Q3+5\*Q4+6\*Q5+7\*Q6) (approximately 2.5 for `Q = 0.75`). +For `N` chained executions, the chance is only QN that all programs in the chain are acceptable. However, during each attempt to find such chain, we will waste the execution of some programs. For `N = 8`, the number of wasted programs per attempt is equal to (1-Q)\*(1+2\*Q+3\*Q2+4\*Q3+5\*Q4+6\*Q5+7\*Q6) (approximately 2.5 for `Q = 0.75`). 
Let's consider 3 mining strategies: @@ -363,12 +361,12 @@ Miner that can execute all programs, but rejects 25% of the slowest programs for The table below lists the results for the above 3 strategies and different values of `N`. The columns **N(I)**, **N(II)** and **N(III)** list the number of programs that each strategy has to execute on average to get one valid hash result (this includes programs wasted in rejected chains). Columns **Speed(I)**, **Speed(II)** and **Speed(III)** list the average mining performance relative to strategy I. -|N|N(I)|N(II)|N(III)|Speed(I)|Speed(II)|Speed(III)| -|---|----|----|----|---------|---------|---------| -|1|1|1|1|1.00|1.50|1.05| -|2|2|2.3|2|1.00|1.28|1.02| -|4|4|6.5|4|1.00|0.92|1.01| -|8|8|27.0|8|1.00|0.44|1.00| +| N | N(I) | N(II) | N(III) | Speed(I) | Speed(II) | Speed(III) | +| --- | ---- | ----- | ------ | -------- | --------- | ---------- | +| 1 | 1 | 1 | 1 | 1.00 | 1.50 | 1.05 | +| 2 | 2 | 2.3 | 2 | 1.00 | 1.28 | 1.02 | +| 4 | 4 | 6.5 | 4 | 1.00 | 0.92 | 1.01 | +| 8 | 8 | 27.0 | 8 | 1.00 | 0.44 | 1.00 | For `N = 8`, strategy II will perform at less than half the speed of the honest miner despite having a 50% performance advantage for selected programs. The small statistical advantage of strategy III is negligible with `N = 8`. @@ -379,6 +377,7 @@ As discussed in chapter 2.7, RandomX aims to take advantage of the complex desig #### CPU model The model CPU uses a 3-stage pipeline to achieve an ideal throughput of 1 instruction per cycle: + ``` (1) (2) (3) +------------------+ +----------------+ +----------------+ @@ -387,6 +386,7 @@ The model CPU uses a 3-stage pipeline to achieve an ideal throughput of 1 instru | + decode | | | | | +------------------+ +----------------+ +----------------+ ``` + The 3 stages are: 1. Instruction fetch and decode. This stage loads the instruction from the Program Buffer and decodes the instruction operation and operands. 
@@ -399,8 +399,8 @@ Note that this is an optimistically short pipeline that would not allow very hig Our model CPU contains two kinds of components: -* Execution unit (EXU) - it is used to perform the actual integer or floating point operation. All RandomX instructions except ISTORE must use an execution unit in the 3rd pipeline stage. All operations are considered to take only 1 clock cycle. -* Memory unit (MEM) - it is used for loads and stores into Scratchpad. All memory instructions (including ISTORE) use a memory unit in the 2nd pipeline stage. +- Execution unit (EXU) - it is used to perform the actual integer or floating point operation. All RandomX instructions except ISTORE must use an execution unit in the 3rd pipeline stage. All operations are considered to take only 1 clock cycle. +- Memory unit (MEM) - it is used for loads and stores into Scratchpad. All memory instructions (including ISTORE) use a memory unit in the 2nd pipeline stage. A superscalar design will contain multiple execution or memory units to improve performance. @@ -422,18 +422,18 @@ The simulation model supports two types of branch handling: The following 10 designs were simulated and the average number of clock cycles to execute a RandomX program (256 instructions) was measured. -|design|superscalar config.|reordering|branch handling|execution time [cycles]|IPC| -|-------|-----------|----------|---------------|-----------------------|---| -|#1|1 EXU + 1 MEM|in-order|non-speculative|293|0.87| -|#2|1 EXU + 1 MEM|in-order|speculative|262|0.98| -|#3|2 EXU + 1 MEM|in-order|non-speculative|197|1.3| -|#4|2 EXU + 1 MEM|in-order|speculative|161|1.6| -|#5|2 EXU + 1 MEM|out-of-order|non-speculative|144|1.8| -|#6|2 EXU + 1 MEM|out-of-order|speculative|122|2.1| -|#7|4 EXU + 2 MEM|in-order|non-speculative|135|1.9| -|#8|4 EXU + 2 MEM|in-order|speculative|99|2.6| -|#9|4 EXU + 2 MEM|out-of-order|non-speculative|89|2.9| -|#10|4 EXU + 2 MEM|out-of-order|speculative|64|4.0| +| design | superscalar config. 
| reordering | branch handling | execution time [cycles] | IPC | +| ------ | ------------------- | ------------ | --------------- | ----------------------- | ---- | +| #1 | 1 EXU + 1 MEM | in-order | non-speculative | 293 | 0.87 | +| #2 | 1 EXU + 1 MEM | in-order | speculative | 262 | 0.98 | +| #3 | 2 EXU + 1 MEM | in-order | non-speculative | 197 | 1.3 | +| #4 | 2 EXU + 1 MEM | in-order | speculative | 161 | 1.6 | +| #5 | 2 EXU + 1 MEM | out-of-order | non-speculative | 144 | 1.8 | +| #6 | 2 EXU + 1 MEM | out-of-order | speculative | 122 | 2.1 | +| #7 | 4 EXU + 2 MEM | in-order | non-speculative | 135 | 1.9 | +| #8 | 4 EXU + 2 MEM | in-order | speculative | 99 | 2.6 | +| #9 | 4 EXU + 2 MEM | out-of-order | non-speculative | 89 | 2.9 | +| #10 | 4 EXU + 2 MEM | out-of-order | speculative | 64 | 4.0 | The benefits of superscalar, out-of-order and speculative designs are clearly demonstrated. @@ -449,10 +449,10 @@ The following figure shows the distribution of the runtimes of a single VM progr AMD Ryzen 7 1700 can calculate 625 hashes per second in fast mode (using 1 thread), which means a single hash result takes 1600 μs (1.6 ms). This consists of (approximately): -* 1480 μs for VM execution (8 programs) -* 45 μs for initial Scratchpad fill (AesGenerator1R). -* 45 μs for final Scratchpad hash (AesHash1R). -* 30 μs for program generation and JIT compilation (8 programs) +- 1480 μs for VM execution (8 programs) +- 45 μs for initial Scratchpad fill (AesGenerator1R). +- 45 μs for final Scratchpad hash (AesHash1R). +- 30 μs for program generation and JIT compilation (8 programs) This gives a total overhead of 7.5% (time per hash spent not executing VM). @@ -481,110 +481,7 @@ The following figure shows the sensitivity of SuperscalarHash to changing a sing This shows that SuperscalaHash has quite low sensitivity to high-order bits and somewhat decreased sensitivity to the lowest-order bits. Sensitivity is highest for bits 3-53 (inclusive). 
-When calculating a Dataset item, the input of the first SuperscalarHash depends only on the item number. To ensure a good distribution of results, the constants described in section 7.3 of the Specification were chosen to provide unique values of bits 3-53 for *all* item numbers in the range 0-34078718 (the Dataset contains 34078719 items). All initial register values for all Dataset item numbers were checked to make sure bits 3-53 of each register are unique and there are no collisions (source code: [superscalar-init.cpp](../src/tests/superscalar-init.cpp)). While this is not strictly necessary to get unique output from SuperscalarHash, it's a security precaution that mitigates the non-perfect avalanche properties of the randomly generated SuperscalarHash instances. - -### F. Statistical tests of RNG - -Both AesGenerator1R and AesGenerator4R were tested using the TestU01 library [[30](http://simul.iro.umontreal.ca/testu01/tu01.html)] intended for empirical testing of random number generators. The source code is available in [rng-tests.cpp](../src/tests/rng-tests.cpp). - -The tests sample about 200 MB ("SmallCrush" test), 500 GB ("Crush" test) or 4 TB ("BigCrush" test) of output from each generator. This is considerably more than the amounts generated in RandomX (2176 bytes for AesGenerator4R and 2 MiB for AesGenerator1R), so failures in the tests don't necessarily imply that the generators are not suitable for their use case. - - -#### AesGenerator4R -The generator passes all tests in the "BigCrush" suite when initialized using the Blake2b hash function: - -``` -$ bin/rng-tests 1 -state0 = 67e8bbe567a1c18c91a316faf19fab73 -state1 = 39f7c0e0a8d96512c525852124fdc9fe -state2 = 7abb07b2c90e04f098261e323eee8159 -state3 = 3df534c34cdfbb4e70f8c0e1826f4cf7 - -... 
- -========= Summary results of BigCrush ========= - - Version: TestU01 1.2.3 - Generator: AesGenerator4R - Number of statistics: 160 - Total CPU time: 02:50:18.34 - - All tests were passed -``` - - -The generator passes all tests in the "Crush" suite even with an initial state set to all zeroes. -``` -$ bin/rng-tests 0 -state0 = 00000000000000000000000000000000 -state1 = 00000000000000000000000000000000 -state2 = 00000000000000000000000000000000 -state3 = 00000000000000000000000000000000 - -... - -========= Summary results of Crush ========= - - Version: TestU01 1.2.3 - Generator: AesGenerator4R - Number of statistics: 144 - Total CPU time: 00:25:17.95 - - All tests were passed -``` - -#### AesGenerator1R - -The generator passes all tests in the "Crush" suite when initialized using the Blake2b hash function. - -``` -$ bin/rng-tests 1 -state0 = 67e8bbe567a1c18c91a316faf19fab73 -state1 = 39f7c0e0a8d96512c525852124fdc9fe -state2 = 7abb07b2c90e04f098261e323eee8159 -state3 = 3df534c34cdfbb4e70f8c0e1826f4cf7 - -... - -========= Summary results of Crush ========= - - Version: TestU01 1.2.3 - Generator: AesGenerator1R - Number of statistics: 144 - Total CPU time: 00:25:06.07 - - All tests were passed - -``` - -When the initial state is initialized to all zeroes, the generator fails 1 test out of 144 tests in the "Crush" suite: - -``` -$ bin/rng-tests 0 -state0 = 00000000000000000000000000000000 -state1 = 00000000000000000000000000000000 -state2 = 00000000000000000000000000000000 -state3 = 00000000000000000000000000000000 - -... 
- -========= Summary results of Crush ========= - - Version: TestU01 1.2.3 - Generator: AesGenerator1R - Number of statistics: 144 - Total CPU time: 00:26:12.75 - The following tests gave p-values outside [0.001, 0.9990]: - (eps means a value < 1.0e-300): - (eps1 means a value < 1.0e-15): - - Test p-value - ---------------------------------------------- - 12 BirthdaySpacings, t = 3 1 - 4.4e-5 - ---------------------------------------------- - All other tests were passed - -``` +When calculating a Dataset item, the input of the first SuperscalarHash depends only on the item number. To ensure a good distribution of results, the constants described in section 7.3 of the Specification were chosen to provide unique values of bits 3-53 for _all_ item numbers in the range 0-34078718 (the Dataset contains 34078719 items). All initial register values for all Dataset item numbers were checked to make sure bits 3-53 of each register are unique and there are no collisions (source code: [superscalar-init.cpp](../src/tests/superscalar-init.cpp)). While this is not strictly necessary to get unique output from SuperscalarHash, it's a security precaution that mitigates the non-perfect avalanche properties of the randomly generated SuperscalarHash instances. ## References diff --git a/doc/specs.md b/doc/specs.md index f2ab8b24..6adfd3c6 100644 --- a/doc/specs.md +++ b/doc/specs.md @@ -12,7 +12,6 @@ RandomX is a proof of work (PoW) algorithm which was designed to close the gap b 1. [SuperscalarHash](#6-superscalarhash) 1. [Dataset](#7-dataset) - ## 1. Definitions ### 1.1 General definitions @@ -50,38 +49,38 @@ RandomX is a proof of work (PoW) algorithm which was designed to close the gap b **Dataset** refers to a large read-only buffer described in chapter 7. It is constructed from the Cache using the SuperscalarHash function. ### 1.2 Configurable parameters + RandomX has several configurable parameters that are listed in Table 1.2.1 with their default values. 
-*Table 1.2.1 - Configurable parameters* - -|parameter|description|default value| -|---------|-----|-------| -|`RANDOMX_ARGON_MEMORY`|The number of 1 KiB Argon2 blocks in the Cache| `262144`| -|`RANDOMX_ARGON_ITERATIONS`|The number of Argon2d iterations for Cache initialization|`3`| -|`RANDOMX_ARGON_LANES`|The number of parallel lanes for Cache initialization|`1`| -|`RANDOMX_ARGON_SALT`|Argon2 salt|`"RandomX\x03"`| -|`RANDOMX_CACHE_ACCESSES`|The number of random Cache accesses per Dataset item|`8`| -|`RANDOMX_SUPERSCALAR_LATENCY`|Target latency for SuperscalarHash (in cycles of the reference CPU)|`170`| -|`RANDOMX_DATASET_BASE_SIZE`|Dataset base size in bytes|`2147483648`| -|`RANDOMX_DATASET_EXTRA_SIZE`|Dataset extra size in bytes|`33554368`| -|`RANDOMX_PROGRAM_SIZE`|The number of instructions in a RandomX program|`256`| -|`RANDOMX_PROGRAM_ITERATIONS`|The number of iterations per program|`2048`| -|`RANDOMX_PROGRAM_COUNT`|The number of programs per hash|`8`| -|`RANDOMX_JUMP_BITS`|Jump condition mask size in bits|`8`| -|`RANDOMX_JUMP_OFFSET`|Jump condition mask offset in bits|`8`| -|`RANDOMX_SCRATCHPAD_L3`|Scratchpad L3 size in bytes|`2097152`| -|`RANDOMX_SCRATCHPAD_L2`|Scratchpad L2 size in bytes|`262144`| -|`RANDOMX_SCRATCHPAD_L1`|Scratchpad L1 size in bytes|`16384`| +_Table 1.2.1 - Configurable parameters_ + +| parameter | description | default value | +| ----------------------------- | ------------------------------------------------------------------- | --------------- | +| `RANDOMX_ARGON_MEMORY` | The number of 1 KiB Argon2 blocks in the Cache | `262144` | +| `RANDOMX_ARGON_ITERATIONS` | The number of Argon2d iterations for Cache initialization | `3` | +| `RANDOMX_ARGON_LANES` | The number of parallel lanes for Cache initialization | `1` | +| `RANDOMX_ARGON_SALT` | Argon2 salt | `"RandomX\x03"` | +| `RANDOMX_CACHE_ACCESSES` | The number of random Cache accesses per Dataset item | `8` | +| `RANDOMX_SUPERSCALAR_LATENCY` | Target latency for SuperscalarHash (in 
cycles of the reference CPU) | `170` | +| `RANDOMX_DATASET_BASE_SIZE` | Dataset base size in bytes | `2147483648` | +| `RANDOMX_DATASET_EXTRA_SIZE` | Dataset extra size in bytes | `33554368` | +| `RANDOMX_PROGRAM_SIZE` | The number of instructions in a RandomX program | `256` | +| `RANDOMX_PROGRAM_ITERATIONS` | The number of iterations per program | `2048` | +| `RANDOMX_PROGRAM_COUNT` | The number of programs per hash | `8` | +| `RANDOMX_JUMP_BITS` | Jump condition mask size in bits | `8` | +| `RANDOMX_JUMP_OFFSET` | Jump condition mask offset in bits | `8` | +| `RANDOMX_SCRATCHPAD_L3` | Scratchpad L3 size in bytes | `2097152` | +| `RANDOMX_SCRATCHPAD_L2` | Scratchpad L2 size in bytes | `262144` | +| `RANDOMX_SCRATCHPAD_L1` | Scratchpad L1 size in bytes | `16384` | Instruction frequencies listed in Tables 5.2.1, 5.3.1, 5.4.1 and 5.5.1 are also configurable. - ## 2. Algorithm description The RandomX algorithm accepts two input values: -* String `K` with a size of 0-60 bytes (key) -* String `H` of arbitrary length (the value to be hashed) +- String `K` with a size of 0-60 bytes (key) +- String `H` of arbitrary length (the value to be hashed) and outputs a 256-bit result `R`. @@ -103,6 +102,7 @@ The algorithm consists of the following steps: 1. Result is calculated as `R = Hash256(RegisterFile)`. The input of the `Hash512` function in step 9 is the following 256 bytes: + ``` +---------------------------------+ | registers r0-r7 | (64 bytes) @@ -116,6 +116,7 @@ The input of the `Hash512` function in step 9 is the following 256 bytes: ``` The input of the `Hash256` function in step 14 is the following 256 bytes: + ``` +---------------------------------+ | registers r0-r7 | (64 bytes) @@ -132,9 +133,9 @@ The input of the `Hash256` function in step 14 is the following 256 bytes: ### 3.1 Definitions -Two of the custom functions are based on the [Advanced Encryption Standard](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard) (AES). 
+Two of the custom functions are based on the [Advanced Encryption Standard](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard) (AES). -**AES encryption round** refers to the application of the ShiftRows, SubBytes and MixColumns transformations followed by a XOR with the round key. +**AES encryption round** refers to the application of the ShiftRows, SubBytes and MixColumns transformations followed by a XOR with the round key. **AES decryption round** refers to the application of inverse ShiftRows, inverse SubBytes and inverse MixColumns transformations followed by a XOR with the round key. @@ -150,13 +151,15 @@ key1 = 07 af 7c 6d 0d 71 6a 84 78 d3 25 17 4e dc a1 0d key2 = f1 62 12 3f c6 7e 94 9f 4f 79 c0 f4 45 e3 20 3e key3 = 35 81 ef 6a 7c 31 ba b1 88 4c 31 16 54 91 16 49 ``` + These keys were generated as: + ``` key0, key1, key2, key3 = Hash512("RandomX AesGenerator1R keys") ``` - Single iteration produces 64 bytes of output which also become the new generator state. + ``` state0 (16 B) state1 (16 B) state2 (16 B) state3 (16 B) | | | | @@ -169,46 +172,43 @@ state0 (16 B) state1 (16 B) state2 (16 B) state3 (16 B) ### 3.3 AesGenerator4R -AesGenerator4R works similar way as AesGenerator1R, except it uses 4 rounds per column. Columns 0 and 1 use a different set of keys than columns 2 and 3. 
+AesGenerator4R works the same way as AesGenerator1R, except it uses 4 rounds per column: ``` state0 (16 B) state1 (16 B) state2 (16 B) state3 (16 B) | | | | AES decrypt AES encrypt AES decrypt AES encrypt - (key0) (key0) (key4) (key4) + (key0) (key0) (key0) (key0) | | | | v v v v AES decrypt AES encrypt AES decrypt AES encrypt - (key1) (key1) (key5) (key5) + (key1) (key1) (key1) (key1) | | | | v v v v AES decrypt AES encrypt AES decrypt AES encrypt - (key2) (key2) (key6) (key6) + (key2) (key2) (key2) (key2) | | | | v v v v AES decrypt AES encrypt AES decrypt AES encrypt - (key3) (key3) (key7) (key7) + (key3) (key3) (key3) (key3) | | | | v v v v state0' state1' state2' state3' ``` -AesGenerator4R uses the following 8 round keys: +AesGenerator4R uses the following 4 round keys: ``` -key0 = dd aa 21 64 db 3d 83 d1 2b 6d 54 2f 3f d2 e5 99 -key1 = 50 34 0e b2 55 3f 91 b6 53 9d f7 06 e5 cd df a5 -key2 = 04 d9 3e 5c af 7b 5e 51 9f 67 a4 0a bf 02 1c 17 -key3 = 63 37 62 85 08 5d 8f e7 85 37 67 cd 91 d2 de d8 -key4 = 73 6f 82 b5 a6 a7 d6 e3 6d 8b 51 3d b4 ff 9e 22 -key5 = f3 6b 56 c7 d9 b3 10 9c 4e 4d 02 e9 d2 b7 72 b2 -key6 = e7 c9 73 f2 8b a3 65 f7 0a 66 a9 2b a7 ef 3b f6 -key7 = 09 d6 7c 7a de 39 58 91 fd d1 06 0c 2d 76 b0 c0 +key0 = 5d 46 90 f8 a6 e4 fb 7f b7 82 1f 14 95 9e 35 cf +key1 = 50 c4 55 6a 8a 27 e8 fe c3 5a 5c bd dc ff 41 67 +key2 = a4 47 4c 11 e4 fd 24 d5 d2 9a 27 a7 ac 4a 32 3d +key3 = 2a 3a 0c 81 ff ae a9 99 d9 db d3 42 08 db f6 76 ``` + These keys were generated as: + ``` -key0, key1, key2, key3 = Hash512("RandomX AesGenerator4R keys 0-3") -key4, key5, key6, key7 = Hash512("RandomX AesGenerator4R keys 4-7") +key0, key1, key2, key3 = Hash512("RandomX AesGenerator4R keys") ``` ### 3.4 AesHash1R @@ -225,6 +225,7 @@ state3 = 0c 24 0a 63 8d 82 ad 07 05 00 a1 79 48 49 99 7e ``` The initial state vectors were generated as: + ``` state0, state1, state2, state3 = Hash512("RandomX AesHash1R state") ``` @@ -249,6 +250,7 @@ xkey1 = d1 63 b2 61 3c e0 f4 51 c6 43 10 ee 
9b f9 18 ed ``` The extra keys were generated as: + ``` xkey0, xkey1 = Hash256("RandomX AesHash1R xkeys") ``` @@ -264,7 +266,7 @@ state0 (16 B) state1 (16 B) state2 (16 B) state3 (16 B) (xkey1) (xkey1) (xkey1) (xkey1) | | | | v v v v -finalState0 finalState1 finalState2 finalState3 +finalState0 finalState1 finalState2 finalState3 ``` The final state is the output of the function. @@ -285,7 +287,7 @@ The generator can generate 1 byte or 4 bytes at a time by supplying data from it The components of the RandomX virtual machine are summarized in Fig. 4.1. -*Figure 4.1 - Virtual Machine* +_Figure 4.1 - Virtual Machine_ ![Imgur](https://i.imgur.com/Enk42b8.png) @@ -299,21 +301,21 @@ Dataset is described in detail in chapter 7. It's a large read-only buffer. Its Scratchpad represents the workspace memory of the VM. Its size is `RANDOMX_SCRATCHPAD_L3` bytes and it's divided into 3 "levels": -* The whole scratchpad is the third level "L3". -* The first `RANDOMX_SCRATCHPAD_L2` bytes of the scratchpad is the second level "L2". -* The first `RANDOMX_SCRATCHPAD_L1` bytes of the scratchpad is the first level "L1". +- The whole scratchpad is the third level "L3". +- The first `RANDOMX_SCRATCHPAD_L2` bytes of the scratchpad is the second level "L2". +- The first `RANDOMX_SCRATCHPAD_L1` bytes of the scratchpad is the first level "L1". The scratchpad levels are inclusive, i.e. L3 contains both L2 and L1 and L2 contains L1. To access a particular scratchpad level, bitwise AND with a mask according to table 4.2.1 is applied to the memory address. 
-*Table 4.2.1: Scratchpad access masks* +_Table 4.2.1: Scratchpad access masks_ -|Level|8-byte aligned mask|64-byte aligned mask| -|---------|-|-| -|L1|`(RANDOMX_SCRATCHPAD_L1 - 1) & ~7`|-| -|L2|`(RANDOMX_SCRATCHPAD_L2 - 1) & ~7`|-| -|L3|`(RANDOMX_SCRATCHPAD_L3 - 1) & ~7`|`(RANDOMX_SCRATCHPAD_L3 - 1) & ~63`| +| Level | 8-byte aligned mask | 64-byte aligned mask | +| ----- | ---------------------------------- | ----------------------------------- | +| L1 | `(RANDOMX_SCRATCHPAD_L1 - 1) & ~7` | - | +| L2 | `(RANDOMX_SCRATCHPAD_L2 - 1) & ~7` | - | +| L3 | `(RANDOMX_SCRATCHPAD_L3 - 1) & ~7` | `(RANDOMX_SCRATCHPAD_L3 - 1) & ~63` | ### 4.3 Registers @@ -333,14 +335,14 @@ Floating point registers `e0`-`e3` are the "multiplicative" registers, which can The 2-bit `fprc` register determines the rounding mode of all floating point operations according to Table 4.3.1. The four rounding modes are defined by the IEEE 754 standard. -*Table 4.3.1: Rounding modes* +_Table 4.3.1: Rounding modes_ -|`fprc`|rounding mode| -|-------|------------| -|0|roundTiesToEven| -|1|roundTowardNegative| -|2|roundTowardPositive| -|3|roundTowardZero| +| `fprc` | rounding mode | +| ------ | ------------------- | +| 0 | roundTiesToEven | +| 1 | roundTowardNegative | +| 2 | roundTowardPositive | +| 3 | roundTowardZero | #### 4.3.1 Group F register conversion @@ -363,29 +365,29 @@ The Program buffer stores the program to be executed by the VM. The program cons The VM requires `128 + 8 * RANDOMX_PROGRAM_SIZE` bytes to be programmed. 
This is split into two parts: -* `128` bytes of configuration data = 16 quadwords (16×8 bytes), used according to Table 4.5.1 -* `8 * RANDOMX_PROGRAM_SIZE` bytes of program data, copied directly into the Program Buffer - -*Table 4.5.1 - Configuration data* - -|quadword|description| -|-----|-----------| -|0|initialize low half of register `a0`| -|1|initialize high half of register `a0`| -|2|initialize low half of register `a1`| -|3|initialize high half of register `a1`| -|4|initialize low half of register `a2`| -|5|initialize high half of register `a2`| -|6|initialize low half of register `a3`| -|7|initialize high half of register `a3`| -|8|initialize register `ma`| -|9|(reserved)| -|10|initialize register `mx`| -|11|(reserved)| -|12|select address registers| -|13|select Dataset offset| -|14|initialize register masks for low half of group E registers| -|15|initialize register masks for high half of group E registers| +- `128` bytes of configuration data = 16 quadwords (16×8 bytes), used according to Table 4.5.1 +- `8 * RANDOMX_PROGRAM_SIZE` bytes of program data, copied directly into the Program Buffer + +_Table 4.5.1 - Configuration data_ + +| quadword | description | +| -------- | ------------------------------------------------------------ | +| 0 | initialize low half of register `a0` | +| 1 | initialize high half of register `a0` | +| 2 | initialize low half of register `a1` | +| 3 | initialize high half of register `a1` | +| 4 | initialize low half of register `a2` | +| 5 | initialize high half of register `a2` | +| 6 | initialize low half of register `a3` | +| 7 | initialize high half of register `a3` | +| 8 | initialize register `ma` | +| 9 | (reserved) | +| 10 | initialize register `mx` | +| 11 | (reserved) | +| 12 | select address registers | +| 13 | select Dataset offset | +| 14 | initialize register masks for low half of group E registers | +| 15 | initialize register masks for high half of group E registers | #### 4.5.2 Group A register initialization @@ 
-395,13 +397,13 @@ The values of the floating point registers `a0`-`a3` are initialized using confi The fraction has full 52 bits of precision and the exponent value ranges from 0 to 31. These values are obtained from the initialization quadword (in little endian format) according to Table 4.5.2. -*Table 4.5.2 - Group A register initialization* +_Table 4.5.2 - Group A register initialization_ -|bits|description| -|----|-----------| -|0-51|fraction| -|52-58|(reserved)| -|59-63|exponent| +| bits | description | +| ----- | ----------- | +| 0-51 | fraction | +| 52-58 | (reserved) | +| 59-63 | exponent | #### 4.5.3 Memory registers @@ -411,14 +413,14 @@ Registers `ma` and `mx` are initialized using the low 32 bits of quadwords 8 and Bits 0-3 of quadword 12 are used to select 4 address registers for program execution. Each bit chooses one register from a pair of integer registers according to Table 4.5.3. -*Table 4.5.3 - Address registers* +_Table 4.5.3 - Address registers_ -|address register (bit)|value = 0|value = 1| -|----------------------|-|-| -|`readReg0` (0)|`r0`|`r1`| -|`readReg1` (1)|`r2`|`r3`| -|`readReg2` (2)|`r4`|`r5`| -|`readReg3` (3)|`r6`|`r7`| +| address register (bit) | value = 0 | value = 1 | +| ---------------------- | --------- | --------- | +| `readReg0` (0) | `r0` | `r1` | +| `readReg1` (1) | `r2` | `r3` | +| `readReg2` (2) | `r4` | `r5` | +| `readReg3` (3) | `r6` | `r7` | #### 4.5.5 Dataset offset @@ -426,7 +428,7 @@ The `datasetOffset` is calculated as the remainder of dividing quadword 13 by `R #### 4.5.6 Group E register masks -These masks are used for the conversion of group E registers (see 4.3.2). The low and high halves each have their own masks initialized from quadwords 14 and 15. The fraction mask is given by bits 0-21 and the exponent mask by bits 60-63 of the initialization quadword. +These masks are used for the conversion of group E registers (see 4.3.2). 
The low and high halves each have their own masks initialized from quadwords 14 and 15. The fraction mask is given by bits 0-21 and the exponent mask by bits 60-63 of the initialization quadword. ### 4.6 VM execution @@ -457,7 +459,6 @@ The loop described below is repeated until the value of the `ic` register reache 12. `spAddr0` and `spAddr1` are both set to zero. 13. `ic` is decreased by 1. - ## 5. Instruction set The VM executes programs in a special instruction set, which was designed in such way that any random 8-byte word is a valid instruction and any sequence of valid instructions is a valid program. Because there are no "syntax" rules, generating a random program is as easy as filling the program buffer with random data. @@ -466,40 +467,42 @@ The VM executes programs in a special instruction set, which was designed in suc Each instruction word is 64 bits long. Instruction fields are encoded as shown in Fig. 5.1. -*Figure 5.1 - Instruction encoding* +_Figure 5.1 - Instruction encoding_ ![Imgur](https://i.imgur.com/FtkWRwe.png) #### 5.1.1 opcode + There are 256 opcodes, which are distributed between 29 distinct instructions. Each instruction can be encoded using multiple opcodes (the number of opcodes specifies the frequency of the instruction in a random program). -*Table 5.1.1: Instruction groups* +_Table 5.1.1: Instruction groups_ -|group|# instructions|# opcodes|| -|---------|-----------------|----|-| -|integer |17|120|46.9%| -|floating point |9|94|36.7%| -|control |2|26|10.2%| -|store |1|16|6.2%| -||**29**|**256**|**100%** +| group | # instructions | # opcodes | | +| -------------- | -------------- | --------- | -------- | +| integer | 17 | 120 | 46.9% | +| floating point | 9 | 94 | 36.7% | +| control | 2 | 26 | 10.2% | +| store | 1 | 16 | 6.2% | +| | **29** | **256** | **100%** | All instructions are described below in chapters 5.2 - 5.5. #### 5.1.2 dst + Destination register. 
Only bits 0-1 (register groups A, F, E) or 0-2 (groups R, F+E) are used to encode a register according to Table 5.1.2. -*Table 5.1.2: Addressable register groups* +_Table 5.1.2: Addressable register groups_ -|index|R|A|F|E|F+E| -|--|--|--|--|--|--| -|0|`r0`|`a0`|`f0`|`e0`|`f0`| -|1|`r1`|`a1`|`f1`|`e1`|`f1`| -|2|`r2`|`a2`|`f2`|`e2`|`f2`| -|3|`r3`|`a3`|`f3`|`e3`|`f3`| -|4|`r4`||||`e0`| -|5|`r5`||||`e1`| -|6|`r6`||||`e2`| -|7|`r7`||||`e3`| +| index | R | A | F | E | F+E | +| ----- | ---- | ---- | ---- | ---- | ---- | +| 0 | `r0` | `a0` | `f0` | `e0` | `f0` | +| 1 | `r1` | `a1` | `f1` | `e1` | `f1` | +| 2 | `r2` | `a2` | `f2` | `e2` | `f2` | +| 3 | `r3` | `a3` | `f3` | `e3` | `f3` | +| 4 | `r4` | | | | `e0` | +| 5 | `r5` | | | | `e1` | +| 6 | `r6` | | | | `e2` | +| 7 | `r7` | | | | `e3` | #### 5.1.3 src @@ -513,63 +516,65 @@ For register-memory instructions, the source operand is used to calculate the me The `mod` flag is encoded as: -*Table 5.1.3: mod flag encoding* +_Table 5.1.3: mod flag encoding_ -|`mod` bits|description|range of values| -|----|--------|----| -|0-1|`mod.mem` flag|0-3| -|2-3|`mod.shift` flag|0-3| -|4-7|`mod.cond` flag|0-15| +| `mod` bits | description | range of values | +| ---------- | ---------------- | --------------- | +| 0-1 | `mod.mem` flag | 0-3 | +| 2-3 | `mod.shift` flag | 0-3 | +| 4-7 | `mod.cond` flag | 0-15 | The `mod.mem` flag selects between Scratchpad levels L1 and L2 when reading from or writing to memory except for two cases: -* it's a memory read and `dst` and `src` encode the same register -* it's a memory write `mod.cond` is 14 or 15 +- it's a memory read and `dst` and `src` encode the same register +- it's a memory write `mod.cond` is 14 or 15 In these two cases, the Scratchpad level is L3 (see Table 5.1.4). 
-*Table 5.1.4: memory access Scratchpad level* +_Table 5.1.4: memory access Scratchpad level_ -|condition|Scratchpad level| -|---------|-| -|`src == dst` (read)|L3| -|`mod.cond >= 14` (write)|L3| -|`mod.mem == 0`|L2| -|`mod.mem != 0`|L1| +| condition | Scratchpad level | +| ------------------------ | ---------------- | +| `src == dst` (read) | L3 | +| `mod.cond >= 14` (write) | L3 | +| `mod.mem == 0` | L2 | +| `mod.mem != 0` | L1 | The address for reading/writing is calculated by applying bitwise AND operation to the address and the 8-byte aligned address mask listed in Table 4.2.1. The `mod.cond` and `mod.shift` flags are used by some instructions (see 5.2, 5.4). #### 5.1.5 imm32 + A 32-bit immediate value that can be used as the source operand and is used to calculate addresses for memory operations. The immediate value is sign-extended to 64 bits unless specified otherwise. ### 5.2 Integer instructions + For integer instructions, the destination is always an integer register (register group R). Source operand (if applicable) can be either an integer register or memory value. If `dst` and `src` refer to the same register, most instructions use `0` or `imm32` instead of the register. This is indicated in the 'src == dst' column in Table 5.2.1. `[mem]` indicates a memory operand loaded as an 8-byte value from the address `src + imm32`. 
-*Table 5.2.1 Integer instructions* - -|frequency|instruction|dst|src|`src == dst ?`|operation| -|-|-|-|-|-|-| -|16/256|IADD_RS|R|R|`src = dst`|`dst = dst + (src << mod.shift) (+ imm32)`| -|7/256|IADD_M|R|R|`src = 0`|`dst = dst + [mem]`| -|16/256|ISUB_R|R|R|`src = imm32`|`dst = dst - src`| -|7/256|ISUB_M|R|R|`src = 0`|`dst = dst - [mem]`| -|16/256|IMUL_R|R|R|`src = imm32`|`dst = dst * src`| -|4/256|IMUL_M|R|R|`src = 0`|`dst = dst * [mem]`| -|4/256|IMULH_R|R|R|`src = dst`|`dst = (dst * src) >> 64`| -|1/256|IMULH_M|R|R|`src = 0`|`dst = (dst * [mem]) >> 64`| -|4/256|ISMULH_R|R|R|`src = dst`|`dst = (dst * src) >> 64` (signed)| -|1/256|ISMULH_M|R|R|`src = 0`|`dst = (dst * [mem]) >> 64` (signed)| -|8/256|IMUL_RCP|R|-|-|dst = 2x / imm32 * dst| -|2/256|INEG_R|R|-|-|`dst = -dst`| -|15/256|IXOR_R|R|R|`src = imm32`|`dst = dst ^ src`| -|5/256|IXOR_M|R|R|`src = 0`|`dst = dst ^ [mem]`| -|8/256|IROR_R|R|R|`src = imm32`|`dst = dst >>> src`| -|2/256|IROL_R|R|R|`src = imm32`|`dst = dst <<< src`| -|4/256|ISWAP_R|R|R|`src = dst`|`temp = src; src = dst; dst = temp`| +_Table 5.2.1 Integer instructions_ + +| frequency | instruction | dst | src | `src == dst ?` | operation | +| --------- | ----------- | --- | --- | -------------- | ----------------------------------------------- | +| 16/256 | IADD_RS | R | R | `src = dst` | `dst = dst + (src << mod.shift) (+ imm32)` | +| 7/256 | IADD_M | R | R | `src = 0` | `dst = dst + [mem]` | +| 16/256 | ISUB_R | R | R | `src = imm32` | `dst = dst - src` | +| 7/256 | ISUB_M | R | R | `src = 0` | `dst = dst - [mem]` | +| 16/256 | IMUL_R | R | R | `src = imm32` | `dst = dst * src` | +| 4/256 | IMUL_M | R | R | `src = 0` | `dst = dst * [mem]` | +| 4/256 | IMULH_R | R | R | `src = dst` | `dst = (dst * src) >> 64` | +| 1/256 | IMULH_M | R | R | `src = 0` | `dst = (dst * [mem]) >> 64` | +| 4/256 | ISMULH_R | R | R | `src = dst` | `dst = (dst * src) >> 64` (signed) | +| 1/256 | ISMULH_M | R | R | `src = 0` | `dst = (dst * [mem]) >> 64` (signed) | +| 8/256 | 
IMUL_RCP | R | - | - | dst = 2x / imm32 \* dst | +| 2/256 | INEG_R | R | - | - | `dst = -dst` | +| 15/256 | IXOR_R | R | R | `src = imm32` | `dst = dst ^ src` | +| 5/256 | IXOR_M | R | R | `src = 0` | `dst = dst ^ [mem]` | +| 8/256 | IROR_R | R | R | `src = imm32` | `dst = dst >>> src` | +| 2/256 | IROL_R | R | R | `src = imm32` | `dst = dst <<< src` | +| 4/256 | ISWAP_R | R | R | `src = dst` | `temp = src; src = dst; dst = temp` | #### 5.2.1 IADD_RS @@ -588,43 +593,50 @@ This instructions adds the values of two registers (modulo 264). The 64-bit integer multiplication (performed modulo 264). IMUL_R uses a register source operand, IMUL_M uses a memory source operand. #### 5.2.5 IMULH_R, IMULH_M, ISMULH_R, ISMULH_M + These instructions output the high 64 bits of the whole 128-bit multiplication result. The result differs for signed and unsigned multiplication (IMULH is unsigned, ISMULH is signed). The variants with a register source operand perform a squaring operation if `dst` equals `src`. #### 5.2.6 IMUL_RCP + If `imm32` equals 0 or is a power of 2, IMUL_RCP is a no-op. In other cases, the instruction multiplies the destination register by a reciprocal of `imm32` (the immediate value is zero-extended and treated as unsigned). The reciprocal is calculated as rcp = 2x / imm32 by choosing the largest integer `x` such that rcp < 264. #### 5.2.7 INEG_R + Performs two's complement negation of the destination register. #### 5.2.8 IXOR_R, IXOR_M + 64-bit exclusive OR operation. IXOR_R uses a register source operand, IXOR_M uses a memory source operand. #### 5.2.9 IROR_R, IROL_R + Performs a cyclic shift (rotation) of the destination register. Source operand (shift count) is implicitly masked to 6 bits. IROR rotates bits right, IROL left. #### 5.2.9 ISWAP_R + This instruction swaps the values of two registers. If source and destination refer to the same register, the result is a no-op. 
### 5.3 Floating point instructions + For floating point instructions, the destination can be a group F or group E register. Source operand is either a group A register or a memory value. `[mem]` indicates a memory operand loaded as an 8-byte value from the address `src + imm32` and converted according to the rules in chapters 4.3.1 (group F) or 4.3.2 (group E). The lower and upper memory operands are denoted as `[mem][0]` and `[mem][1]`. All floating point operations are rounded according to the current value of the `fprc` register (see Table 4.3.1). Due to restrictions on the values of the floating point registers, no operation results in `NaN` or a denormal number. -*Table 5.3.1 Floating point instructions* +_Table 5.3.1 Floating point instructions_ -|frequency|instruction|dst|src|operation| -|-|-|-|-|-| -|4/256|FSWAP_R|F+E|-|`(dst0, dst1) = (dst1, dst0)`| -|16/256|FADD_R|F|A|`(dst0, dst1) = (dst0 + src0, dst1 + src1)`| -|5/256|FADD_M|F|R|`(dst0, dst1) = (dst0 + [mem][0], dst1 + [mem][1])`| -|16/256|FSUB_R|F|A|`(dst0, dst1) = (dst0 - src0, dst1 - src1)`| -|5/256|FSUB_M|F|R|`(dst0, dst1) = (dst0 - [mem][0], dst1 - [mem][1])`| -|6/256|FSCAL_R|F|-|(dst0, dst1) = (-2x0 * dst0, -2x1 * dst1)| -|32/256|FMUL_R|E|A|`(dst0, dst1) = (dst0 * src0, dst1 * src1)`| -|4/256|FDIV_M|E|R|`(dst0, dst1) = (dst0 / [mem][0], dst1 / [mem][1])`| -|6/256|FSQRT_R|E|-|`(dst0, dst1) = (√dst0, √dst1)`| +| frequency | instruction | dst | src | operation | +| --------- | ----------- | --- | --- | ---------------------------------------------------------------------------- | +| 4/256 | FSWAP_R | F+E | - | `(dst0, dst1) = (dst1, dst0)` | +| 16/256 | FADD_R | F | A | `(dst0, dst1) = (dst0 + src0, dst1 + src1)` | +| 5/256 | FADD_M | F | R | `(dst0, dst1) = (dst0 + [mem][0], dst1 + [mem][1])` | +| 16/256 | FSUB_R | F | A | `(dst0, dst1) = (dst0 - src0, dst1 - src1)` | +| 5/256 | FSUB_M | F | R | `(dst0, dst1) = (dst0 - [mem][0], dst1 - [mem][1])` | +| 6/256 | FSCAL_R | F | - | (dst0, dst1) = (-2x0 
\* dst0, -2x1 \* dst1) | +| 32/256 | FMUL_R | E | A | `(dst0, dst1) = (dst0 * src0, dst1 * src1)` | +| 4/256 | FDIV_M | E | R | `(dst0, dst1) = (dst0 / [mem][0], dst1 / [mem][1])` | +| 6/256 | FSQRT_R | E | - | `(dst0, dst1) = (√dst0, √dst1)` |

#### 5.3.1 FSWAP_R

@@ -639,6 +651,7 @@ Double precision floating point addition. FADD_R uses a group A register source

Double precision floating point subtraction. FSUB_R uses a group A register source operand, FSUB_M uses a memory operand.

#### 5.3.4 FSCAL_R
+
This instruction negates the number and multiplies it by 2x. `x` is calculated by taking the 4 least significant digits of the biased exponent and interpreting them as a binary number using the digit set `{+1, -1}` as opposed to the traditional `{0, 1}`. The possible values of `x` are all odd numbers from -15 to +15.

The mathematical operation described above is equivalent to a bitwise XOR of the binary representation with the value of `0x80F0000000000000`.

@@ -659,14 +672,15 @@ Double precision floating point square root of the destination register.

There are 2 control instructions.

-*Table 5.4.1 - Control instructions*
+_Table 5.4.1 - Control instructions_

-|frequency|instruction|dst|src|operation|
-|-|-|-|-|-|
-|1/256|CFROUND|-|R|`fprc = src >>> imm32`
-|25/256|CBRANCH|R|-|`dst = dst + cimm`, conditional jump
+| frequency | instruction | dst | src | operation                            |
+| --------- | ----------- | --- | --- | ------------------------------------ |
+| 1/256     | CFROUND     | -   | R   | `fprc = src >>> imm32`               |
+| 25/256    | CBRANCH     | R   | -   | `dst = dst + cimm`, conditional jump |

#### 5.4.1 CFROUND
+
This instruction calculates a 2-bit value by rotating the source register right by `imm32` bits and taking the 2 least significant bits (the value of the source register is unaffected). The result is stored in the `fprc` register. This changes the rounding mode of all subsequent floating point instructions. 
#### 5.4.2 CBRANCH @@ -675,10 +689,10 @@ This instruction adds an immediate value `cimm` (constructed from `imm32`, see b At the beginning of each program iteration, all registers are considered to be unmodified. A register is considered as modified by an instruction in the following cases: -* It is the destination register of an integer instruction except IMUL_RCP and ISWAP_R. -* It is the destination register of IMUL_RCP and `imm32` is not zero or a power of 2. -* It is the source or the destination register of ISWAP_R and the destination and source registers are distinct. -* The CBRANCH instruction is considered to modify all integer registers. +- It is the destination register of an integer instruction except IMUL_RCP and ISWAP_R. +- It is the destination register of IMUL_RCP and `imm32` is not zero or a power of 2. +- It is the source or the destination register of ISWAP_R and the destination and source registers are distinct. +- The CBRANCH instruction is considered to modify all integer registers. If register `dst` has not been modified yet, the jump target is the first instruction in the Program Buffer. @@ -703,17 +717,19 @@ The second line uses `X` to mark bits of `dst` that will be checked by the condi The construction of the CBRANCH instruction ensures that no inifinite loops are possible in the program. ### 5.5 Store instruction + There is one explicit store instruction for integer values. `[mem]` indicates the destination is an 8-byte value at the address `dst + imm32`. -*Table 5.5.1 - Store instruction* +_Table 5.5.1 - Store instruction_ -|frequency|instruction|dst|src|operation| -|-|-|-|-|-| -|16/256|ISTORE|R|R|`[mem] = src` +| frequency | instruction | dst | src | operation | +| --------- | ----------- | --- | --- | ------------- | +| 16/256 | ISTORE | R | R | `[mem] = src` | #### 5.5.1 ISTORE + This instruction stores the value of the source integer register to the memory at the address calculated from the value of the destination register. 
The `src` and `dst` can be the same register. ## 6. SuperscalarHash @@ -723,84 +739,94 @@ SuperscalarHash is a custom diffusion function that was designed to burn as much The input and output of SuperscalarHash are 8 integer registers `r0`-`r7`, each 64 bits wide. The output of SuperscalarHash is used to construct the Dataset (see chapter 7.3). ### 6.1 Instructions + The body of SuperscalarHash is a random sequence of instructions that can run on the Virtual Machine. SuperscalarHash uses a reduced set of only integer register-register instructions listed in Table 6.1.1. `dst` refers to the destination register, `src` to the source register. -*Table 6.1.1 - SuperscalarHash instructions* - -|freq. †|instruction|Macro-ops|operation|rules| -|-|-|-|-|-| -|0.11|ISUB_R|`sub_rr`|`dst = dst - src`|`dst != src`| -|0.11|IXOR_R|`xor_rr`|`dst = dst ^ src`|`dst != src`| -|0.11|IADD_RS|`lea_sib`|`dst = dst + (src << mod.shift)`|`dst != src`, `dst != r5` -|0.22|IMUL_R|`imul_rr`|`dst = dst * src`|`dst != src`| -|0.11|IROR_C|`ror_ri`|`dst = dst >>> imm32`|`imm32 % 64 != 0` -|0.10|IADD_C|`add_ri`|`dst = dst + imm32`| -|0.10|IXOR_C|`xor_ri`|`dst = dst ^ imm32`| -|0.03|IMULH_R|`mov_rr`,`mul_r`,`mov_rr`|`dst = (dst * src) >> 64`| -|0.03|ISMULH_R|`mov_rr`,`imul_r`,`mov_rr`|`dst = (dst * src) >> 64` (signed)| -|0.06|IMUL_RCP|`mov_ri`,`imul_rr`|dst = 2x / imm32 * dst|`imm32 != 0`, imm32 != 2N| +_Table 6.1.1 - SuperscalarHash instructions_ + +| freq. 
† | instruction | Macro-ops | operation | rules | +| ------- | ----------- | -------------------------- | ----------------------------------------------- | ------------------------------------------------- | +| 0.11 | ISUB_R | `sub_rr` | `dst = dst - src` | `dst != src` | +| 0.11 | IXOR_R | `xor_rr` | `dst = dst ^ src` | `dst != src` | +| 0.11 | IADD_RS | `lea_sib` | `dst = dst + (src << mod.shift)` | `dst != src`, `dst != r5` | +| 0.22 | IMUL_R | `imul_rr` | `dst = dst * src` | `dst != src` | +| 0.11 | IROR_C | `ror_ri` | `dst = dst >>> imm32` | `imm32 % 64 != 0` | +| 0.10 | IADD_C | `add_ri` | `dst = dst + imm32` | +| 0.10 | IXOR_C | `xor_ri` | `dst = dst ^ imm32` | +| 0.03 | IMULH_R | `mov_rr`,`mul_r`,`mov_rr` | `dst = (dst * src) >> 64` | +| 0.03 | ISMULH_R | `mov_rr`,`imul_r`,`mov_rr` | `dst = (dst * src) >> 64` (signed) | +| 0.06 | IMUL_RCP | `mov_ri`,`imul_rr` | dst = 2x / imm32 \* dst | `imm32 != 0`, imm32 != 2N | † Frequencies are approximate. Instructions are generated based on complex rules. #### 6.1.1 ISUB_R + See chapter 5.2.3. Source and destination are always distinct registers. #### 6.1.2 IXOR_R + See chapter 5.2.8. Source and destination are always distinct registers. #### 6.1.3 IADD_RS + See chapter 5.2.1. Source and destination are always distinct registers and register `r5` cannot be the destination. #### 6.1.4 IMUL_R + See chapter 5.2.4. Source and destination are always distinct registers. #### 6.1.5 IROR_C + The destination register is rotated right. The rotation count is given by `imm32` masked to 6 bits and cannot be 0. #### 6.1.6 IADD_C + A sign-extended `imm32` is added to the destination register. #### 6.1.7 IXOR_C + The destination register is XORed with a sign-extended `imm32`. #### 6.1.8 IMULH_R, ISMULH_R + See chapter 5.2.5. #### 6.1.9 IMUL_RCP + See chapter 5.2.6. `imm32` is never 0 or a power of 2. 
### 6.2 The reference CPU

Unlike a standard RandomX program, a SuperscalarHash program is generated using a strict set of rules to achieve the maximum performance on a superscalar CPU. For this purpose, the generator runs a simulation of a reference CPU.

-The reference CPU is loosely based on the [Intel Ivy Bridge microarchitecture](https://en.wikipedia.org/wiki/Ivy_Bridge_(microarchitecture)). It has the following properties:
-
-* The CPU has 3 integer execution ports P0, P1 and P5 that can execute instructions in parallel. Multiplication can run only on port P1.
-* Each of the Superscalar instructions listed in Table 6.1.1 consist of one or more *Macro-ops*. Each Macro-op has certain execution latency (in cycles) and size (in bytes) as shown in Table 6.2.1.
-* Each of the Macro-ops listed in Table 6.2.1 consists of 0-2 *Micro-ops* that can go to a subset of the 3 execution ports. If a Macro-op consists of 2 Micro-ops, both must be executed together.
-* The CPU can decode at most 16 bytes of code per cycle and at most 4 Micro-ops per cycle.
-
-*Table 6.2.1 - Macro-ops*
-
-|Macro-op|latency|size|1st Micro-op|2nd Micro-op|
-|-|-|-|-|-|
-|`sub_rr`|1|3|P015|-|
-|`xor_rr`|1|3|P015|-|
-|`lea_sib`|1|4|P01|-|
-|`imul_rr`|3|4|P1|-|
-|`ror_ri`|1|4|P05|-|
-|`add_ri`|1|7, 8, 9|P015|-|
-|`xor_ri`|1|7, 8, 9|P015|-|
-|`mov_rr`|0|3|-|-|
-|`mul_r`|4|3|P1|P5|
-|`imul_r`|4|3|P1|P5|
-|`mov_ri`|1|10|P015|-|
-
-* P015 - Micro-op can be executed on any port
-* P01 - Micro-op can be executed on ports P0 or P1
-* P05 - Micro-op can be executed on ports P0 or P5
-* P1 - Micro-op can be executed only on port P1
-* P5 - Micro-op can be executed only on port P5
+The reference CPU is loosely based on the [Intel Ivy Bridge microarchitecture](https://en.wikipedia.org/wiki/Ivy_Bridge_(microarchitecture)). It has the following properties:
+
+- The CPU has 3 integer execution ports P0, P1 and P5 that can execute instructions in parallel. Multiplication can run only on port P1. 
+- Each of the Superscalar instructions listed in Table 6.1.1 consist of one or more _Macro-ops_. Each Macro-op has certain execution latency (in cycles) and size (in bytes) as shown in Table 6.2.1. +- Each of the Macro-ops listed in Table 6.2.1 consists of 0-2 _Micro-ops_ that can go to a subset of the 3 execution ports. If a Macro-op consists of 2 Micro-ops, both must be executed together. +- The CPU can decode at most 16 bytes of code per cycle and at most 4 Micro-ops per cycle. + +_Table 6.2.1 - Macro-ops_ + +| Macro-op | latency | size | 1st Micro-op | 2nd Micro-op | +| --------- | ------- | ------- | ------------ | ------------ | +| `sub_rr` | 1 | 3 | P015 | - | +| `xor_rr` | 1 | 3 | P015 | - | +| `lea_sib` | 1 | 4 | P01 | - | +| `imul_rr` | 3 | 4 | P1 | - | +| `ror_ri` | 1 | 4 | P05 | - | +| `add_ri` | 1 | 7, 8, 9 | P015 | - | +| `xor_ri` | 1 | 7, 8, 9 | P015 | - | +| `mov_rr` | 0 | 3 | - | - | +| `mul_r` | 4 | 3 | P1 | P5 | +| `imul_r` | 4 | 3 | P1 | P5 | +| `mov_ri` | 1 | 10 | P015 | - | + +- P015 - Micro-op can be executed on any port +- P01 - Micro-op can be executed on ports P0 or P1 +- P05 - Micro-op can be executed on ports P0 or P5 +- P1 - Micro-op can be executed only on port P1 +- P5 - Micro-op can be executed only on port P5 Macro-ops `add_ri` and `xor_ri` can be optionally padded to a size of 8 or 9 bytes for code alignment purposes. `mov_rr` has 0 execution latency and doesn't use an execution port, but still occupies space during the decoding stage (see chapter 6.3.1). @@ -808,10 +834,10 @@ Macro-ops `add_ri` and `xor_ri` can be optionally padded to a size of 8 or 9 byt SuperscalarHash programs are generated to maximize the usage of all 3 execution ports of the reference CPU. 
The generation consists of 4 stages: -* Decoding stage -* Instruction selection -* Port assignment -* Operand assignment +- Decoding stage +- Instruction selection +- Port assignment +- Operand assignment Program generation is complete when one of two conditions is met: @@ -822,38 +848,38 @@ Program generation is complete when one of two conditions is met: The generator produces instructions in groups of 3 or 4 Macro-op slots such that the size of each group is exactly 16 bytes. -*Table 6.3.1 - Decoder configurations* +_Table 6.3.1 - Decoder configurations_ -|decoder group|configuration| -|-------------|-------------| -|0|4-8-4| -|1|7-3-3-3| -|2|3-7-3-3| -|3|4-9-3| -|4|4-4-4-4| -|5|3-3-10| +| decoder group | configuration | +| ------------- | ------------- | +| 0 | 4-8-4 | +| 1 | 7-3-3-3 | +| 2 | 3-7-3-3 | +| 3 | 4-9-3 | +| 4 | 4-4-4-4 | +| 5 | 3-3-10 | The rules for the selection of the decoder group are following: -* If the currently processed instruction is IMULH_R or ISMULH_R, the next decode group is group 5 (the only group that starts with a 3-byte slot and has only 3 slots). -* If the total number of multiplications that have been generated is less than or equal to the current decoding cycle, the next decode group is group 4. -* If the currently processed instruction is IMUL_RCP, the next decode group is group 0 or 3 (must begin with a 4-byte slot for multiplication). -* Otherwise a random decode group is selected from groups 0-3. +- If the currently processed instruction is IMULH_R or ISMULH_R, the next decode group is group 5 (the only group that starts with a 3-byte slot and has only 3 slots). +- If the total number of multiplications that have been generated is less than or equal to the current decoding cycle, the next decode group is group 4. +- If the currently processed instruction is IMUL_RCP, the next decode group is group 0 or 3 (must begin with a 4-byte slot for multiplication). +- Otherwise a random decode group is selected from groups 0-3. 
#### 6.3.2 Instruction selection Instructions are selected based on the size of the current decode group slot - see Table 6.3.2. -*Table 6.3.2 - Decoder configurations* +_Table 6.3.2 - Decoder configurations_ -|slot size|note|instructions| -|-------------|-------------|-----| -|3|-|ISUB_R, IXOR_R -|3|last slot in the group|ISUB_R, IXOR_R, IMULH_R, ISMULH_R| -|4|decode group 4, not the last slot|IMUL_R| -|4|-|IROR_C, IADD_RS| -|7,8,9|-|IADD_C, IXOR_C| -|10|-|IMUL_RCP| +| slot size | note | instructions | +| --------- | --------------------------------- | --------------------------------- | +| 3 | - | ISUB_R, IXOR_R | +| 3 | last slot in the group | ISUB_R, IXOR_R, IMULH_R, ISMULH_R | +| 4 | decode group 4, not the last slot | IMUL_R | +| 4 | - | IROR_C, IADD_RS | +| 7,8,9 | - | IADD_C, IXOR_C | +| 10 | - | IMUL_RCP | #### 6.3.3 Port assignment @@ -865,16 +891,16 @@ The source operand (if needed) is selected first. is it selected from the group The destination operand is selected with more strict rules (see column 'rules' in Table 6.1.1): -* value must be ready at the required cycle -* cannot be the same as the source register unless the instruction allows it (see column 'rules' in Table 6.1.1) - * this avoids optimizable operations such as `reg ^ reg` or `reg - reg` - * it also increases intermixing of register values -* register cannot be multiplied twice in a row unless `allowChainedMul` is true - * this avoids accumulation of trailing zeroes in registers due to excessive multiplication - * `allowChainedMul` is set to true if an attempt to find source/destination registers failed (this is quite rare, but prevents a catastrophic failure of the generator) -* either the last instruction applied to the register or its source must be different than the current instruction - * this avoids optimizable instruction sequences such as `r1 = r1 ^ r2; r1 = r1 ^ r2` (can be eliminated) or `reg = reg >>> C1; reg = reg >>> C2` (can be reduced to one rotation) or `reg = reg + C1; 
reg = reg + C2` (can be reduced to one addition) -* register `r5` cannot be the destination of the IADD_RS instruction (limitation of the x86 lea instruction) +- value must be ready at the required cycle +- cannot be the same as the source register unless the instruction allows it (see column 'rules' in Table 6.1.1) + - this avoids optimizable operations such as `reg ^ reg` or `reg - reg` + - it also increases intermixing of register values +- register cannot be multiplied twice in a row unless `allowChainedMul` is true + - this avoids accumulation of trailing zeroes in registers due to excessive multiplication + - `allowChainedMul` is set to true if an attempt to find source/destination registers failed (this is quite rare, but prevents a catastrophic failure of the generator) +- either the last instruction applied to the register or its source must be different than the current instruction + - this avoids optimizable instruction sequences such as `r1 = r1 ^ r2; r1 = r1 ^ r2` (can be eliminated) or `reg = reg >>> C1; reg = reg >>> C2` (can be reduced to one rotation) or `reg = reg + C1; reg = reg + C2` (can be reduced to one addition) +- register `r5` cannot be the destination of the IADD_RS instruction (limitation of the x86 lea instruction) ## 7. Dataset @@ -884,7 +910,7 @@ In order to allow PoW verification with a lower amount of memory, the Dataset is The whole Dataset is constructed from the key value `K`, which is an input parameter of RandomX. The whole Dataset needs to be recalculated everytime the key value changes. Fig. 7.1 shows the process of Dataset construction. Note: the maximum supported length of `K` is 60 bytes. Using a longer key results in implementation-defined behavior. 
-*Figure 7.1 - Dataset construction* +_Figure 7.1 - Dataset construction_ ![Imgur](https://i.imgur.com/86h5SbW.png) @@ -892,20 +918,20 @@ The whole Dataset is constructed from the key value `K`, which is an input param The key `K` is expanded into the Cache using the "memory fill" function of Argon2d with parameters according to Table 7.1.1. The key is used as the "password" field. -*Table 7.1.1 - Argon2 parameters* - -|parameter|value| -|------------|--| -|parallelism|`RANDOMX_ARGON_LANES`| -|output size|0| -|memory|`RANDOMX_ARGON_MEMORY`| -|iterations|`RANDOMX_ARGON_ITERATIONS`| -|version|`0x13`| -|hash type|0 (Argon2d)| -|password|key value `K`| -|salt|`RANDOMX_ARGON_SALT` -|secret size|0| -|assoc. data size|0| +_Table 7.1.1 - Argon2 parameters_ + +| parameter | value | +| ---------------- | -------------------------- | +| parallelism | `RANDOMX_ARGON_LANES` | +| output size | 0 | +| memory | `RANDOMX_ARGON_MEMORY` | +| iterations | `RANDOMX_ARGON_ITERATIONS` | +| version | `0x13` | +| hash type | 0 (Argon2d) | +| password | key value `K` | +| salt | `RANDOMX_ARGON_SALT` | +| secret size | 0 | +| assoc. data size | 0 | The finalizer and output calculation steps of Argon2 are omitted. The output is the filled memory array. @@ -914,19 +940,20 @@ The finalizer and output calculation steps of Argon2 are omitted. The output is The key value `K` is used to initialize a BlakeGenerator (see chapter 3.5), which is then used to generate 8 SuperscalarHash instances for Dataset initialization. ### 7.3 Dataset block generation + Dataset items are numbered sequentially with `itemNumber` starting from 0. Each 64-byte Dataset item is generated independently using 8 SuperscalarHash functions (generated according to chapter 7.2) and by XORing randomly selected data from the Cache (constructed according to chapter 7.1). The item data is represented by 8 64-bit integer registers: `r0`-`r7`. 1. 
The register values are initialized as follows (`*` = multiplication, `^` = XOR): - * `r0 = (itemNumber + 1) * 6364136223846793005` - * `r1 = r0 ^ 9298411001130361340` - * `r2 = r0 ^ 12065312585734608966` - * `r3 = r0 ^ 9306329213124626780` - * `r4 = r0 ^ 5281919268842080866` - * `r5 = r0 ^ 10536153434571861004` - * `r6 = r0 ^ 3398623926847679864` - * `r7 = r0 ^ 9549104520008361294` + - `r0 = (itemNumber + 1) * 6364136223846793005` + - `r1 = r0 ^ 9298411001130361340` + - `r2 = r0 ^ 12065312585734608966` + - `r3 = r0 ^ 9306329213124626780` + - `r4 = r0 ^ 5281919268842080866` + - `r5 = r0 ^ 10536153434571861004` + - `r6 = r0 ^ 3398623926847679864` + - `r7 = r0 ^ 9549104520008361294` 1. Let `cacheIndex = itemNumber` 1. Let `i = 0` 1. Load a 64-byte item from the Cache. The item index is given by `cacheIndex` modulo the total number of 64-byte items in Cache. @@ -938,6 +965,5 @@ The item data is represented by 8 64-bit integer registers: `r0`-`r7`. The constants used to initialize register values in step 1 were determined as follows: -* Multiplier `6364136223846793005` was selected because it gives an excellent distribution for linear generators (D. Knuth: The Art of Computer Programming – Vol 2., also listed in [Commonly used LCG parameters](https://en.wikipedia.org/wiki/Linear_congruential_generator#Parameters_in_common_use)) -* XOR constants used to initialize registers `r1`-`r7` were determined by calculating `Hash512` of the ASCII value `"RandomX SuperScalarHash initialize"` and taking bytes 8-63 as 7 little-endian unsigned 64-bit integers. Additionally, the constant for `r1` was increased by 233+700 and the constant for `r3` was increased by 214 (these changes are necessary to ensure that all registers have unique initial values for all values of `itemNumber`). - +- Multiplier `6364136223846793005` was selected because it gives an excellent distribution for linear generators (D. 
Knuth: The Art of Computer Programming – Vol 2., also listed in [Commonly used LCG parameters](https://en.wikipedia.org/wiki/Linear_congruential_generator#Parameters_in_common_use)) +- XOR constants used to initialize registers `r1`-`r7` were determined by calculating `Hash512` of the ASCII value `"RandomX SuperScalarHash initialize"` and taking bytes 8-63 as 7 little-endian unsigned 64-bit integers. Additionally, the constant for `r1` was increased by 233+700 and the constant for `r3` was increased by 214 (these changes are necessary to ensure that all registers have unique initial values for all values of `itemNumber`). diff --git a/src/aes_hash.cpp b/src/aes_hash.cpp index a3b7395b..4f2771a0 100644 --- a/src/aes_hash.cpp +++ b/src/aes_hash.cpp @@ -171,18 +171,10 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer) { template void fillAes1Rx4(void *state, size_t outputSize, void *buffer); template void fillAes1Rx4(void *state, size_t outputSize, void *buffer); -//AesGenerator4R: -//key0, key1, key2, key3 = Blake2b-512("RandomX AesGenerator4R keys 0-3") -//key4, key5, key6, key7 = Blake2b-512("RandomX AesGenerator4R keys 4-7") - -#define AES_GEN_4R_KEY0 0x99e5d23f, 0x2f546d2b, 0xd1833ddb, 0x6421aadd -#define AES_GEN_4R_KEY1 0xa5dfcde5, 0x06f79d53, 0xb6913f55, 0xb20e3450 -#define AES_GEN_4R_KEY2 0x171c02bf, 0x0aa4679f, 0x515e7baf, 0x5c3ed904 -#define AES_GEN_4R_KEY3 0xd8ded291, 0xcd673785, 0xe78f5d08, 0x85623763 -#define AES_GEN_4R_KEY4 0x229effb4, 0x3d518b6d, 0xe3d6a7a6, 0xb5826f73 -#define AES_GEN_4R_KEY5 0xb272b7d2, 0xe9024d4e, 0x9c10b3d9, 0xc7566bf3 -#define AES_GEN_4R_KEY6 0xf63befa7, 0x2ba9660a, 0xf765a38b, 0xf273c9e7 -#define AES_GEN_4R_KEY7 0xc0b0762d, 0x0c06d1fd, 0x915839de, 0x7a7cd609 +#define AES_GEN_4R_KEY0 0xcf359e95, 0x141f82b7, 0x7ffbe4a6, 0xf890465d +#define AES_GEN_4R_KEY1 0x6741ffdc, 0xbd5c5ac3, 0xfee8278a, 0x6a55c450 +#define AES_GEN_4R_KEY2 0x3d324aac, 0xa7279ad2, 0xd524fde4, 0x114c47a4 +#define AES_GEN_4R_KEY3 0x76f6db08, 
0x42d3dbd9, 0x99a9aeff, 0x810c3a2a template void fillAes4Rx4(void *state, size_t outputSize, void *buffer) { @@ -191,16 +183,12 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) { const uint8_t* outputEnd = outptr + outputSize; rx_vec_i128 state0, state1, state2, state3; - rx_vec_i128 key0, key1, key2, key3, key4, key5, key6, key7; + rx_vec_i128 key0, key1, key2, key3; key0 = rx_set_int_vec_i128(AES_GEN_4R_KEY0); key1 = rx_set_int_vec_i128(AES_GEN_4R_KEY1); key2 = rx_set_int_vec_i128(AES_GEN_4R_KEY2); key3 = rx_set_int_vec_i128(AES_GEN_4R_KEY3); - key4 = rx_set_int_vec_i128(AES_GEN_4R_KEY4); - key5 = rx_set_int_vec_i128(AES_GEN_4R_KEY5); - key6 = rx_set_int_vec_i128(AES_GEN_4R_KEY6); - key7 = rx_set_int_vec_i128(AES_GEN_4R_KEY7); state0 = rx_load_vec_i128((rx_vec_i128*)state + 0); state1 = rx_load_vec_i128((rx_vec_i128*)state + 1); @@ -210,23 +198,23 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) { while (outptr < outputEnd) { state0 = aesdec(state0, key0); state1 = aesenc(state1, key0); - state2 = aesdec(state2, key4); - state3 = aesenc(state3, key4); + state2 = aesdec(state2, key0); + state3 = aesenc(state3, key0); state0 = aesdec(state0, key1); state1 = aesenc(state1, key1); - state2 = aesdec(state2, key5); - state3 = aesenc(state3, key5); + state2 = aesdec(state2, key1); + state3 = aesenc(state3, key1); state0 = aesdec(state0, key2); state1 = aesenc(state1, key2); - state2 = aesdec(state2, key6); - state3 = aesenc(state3, key6); + state2 = aesdec(state2, key2); + state3 = aesenc(state3, key2); state0 = aesdec(state0, key3); state1 = aesenc(state1, key3); - state2 = aesdec(state2, key7); - state3 = aesenc(state3, key7); + state2 = aesdec(state2, key3); + state3 = aesenc(state3, key3); rx_store_vec_i128((rx_vec_i128*)outptr + 0, state0); rx_store_vec_i128((rx_vec_i128*)outptr + 1, state1); @@ -319,4 +307,4 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi } template void hashAndFillAes1Rx4(void 
*scratchpad, size_t scratchpadSize, void *hash, void* fill_state); -template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); +template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); \ No newline at end of file diff --git a/src/asm/configuration.asm b/src/asm/configuration.asm index 794d7ad0..cde4f3e4 100644 --- a/src/asm/configuration.asm +++ b/src/asm/configuration.asm @@ -2,7 +2,6 @@ RANDOMX_ARGON_MEMORY EQU 262144t RANDOMX_ARGON_ITERATIONS EQU 3t RANDOMX_ARGON_LANES EQU 1t -RANDOMX_ARGON_SALT TEXTEQU <"RandomX\x03"> RANDOMX_CACHE_ACCESSES EQU 8t RANDOMX_SUPERSCALAR_LATENCY EQU 170t RANDOMX_DATASET_BASE_SIZE EQU 2147483648t @@ -15,7 +14,7 @@ RANDOMX_SCRATCHPAD_L2 EQU 262144t RANDOMX_SCRATCHPAD_L1 EQU 16384t RANDOMX_JUMP_BITS EQU 8t RANDOMX_JUMP_OFFSET EQU 8t -RANDOMX_FREQ_IADD_RS EQU 16t +RANDOMX_FREQ_IADD_RS EQU 25t RANDOMX_FREQ_IADD_M EQU 7t RANDOMX_FREQ_ISUB_R EQU 16t RANDOMX_FREQ_ISUB_M EQU 7t @@ -29,19 +28,19 @@ RANDOMX_FREQ_IMUL_RCP EQU 8t RANDOMX_FREQ_INEG_R EQU 2t RANDOMX_FREQ_IXOR_R EQU 15t RANDOMX_FREQ_IXOR_M EQU 5t -RANDOMX_FREQ_IROR_R EQU 8t -RANDOMX_FREQ_IROL_R EQU 2t +RANDOMX_FREQ_IROR_R EQU 10t +RANDOMX_FREQ_IROL_R EQU 0t RANDOMX_FREQ_ISWAP_R EQU 4t -RANDOMX_FREQ_FSWAP_R EQU 4t -RANDOMX_FREQ_FADD_R EQU 16t +RANDOMX_FREQ_FSWAP_R EQU 8t +RANDOMX_FREQ_FADD_R EQU 20t RANDOMX_FREQ_FADD_M EQU 5t -RANDOMX_FREQ_FSUB_R EQU 16t +RANDOMX_FREQ_FSUB_R EQU 20t RANDOMX_FREQ_FSUB_M EQU 5t RANDOMX_FREQ_FSCAL_R EQU 6t -RANDOMX_FREQ_FMUL_R EQU 32t +RANDOMX_FREQ_FMUL_R EQU 20t RANDOMX_FREQ_FDIV_M EQU 4t RANDOMX_FREQ_FSQRT_R EQU 6t -RANDOMX_FREQ_CBRANCH EQU 25t +RANDOMX_FREQ_CBRANCH EQU 16t RANDOMX_FREQ_CFROUND EQU 1t RANDOMX_FREQ_ISTORE EQU 16t RANDOMX_FREQ_NOP EQU 0t diff --git a/src/configuration.h b/src/configuration.h index 84400ddc..35846e48 100644 --- a/src/configuration.h +++ b/src/configuration.h @@ -38,7 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. #define RANDOMX_ARGON_LANES 1 //Argon2d salt -#define RANDOMX_ARGON_SALT "RandomX\x03" +#define RANDOMX_ARGON_SALT "RandomX-Arweave\x01" //Number of random Cache accesses per Dataset item. Minimum is 2. #define RANDOMX_CACHE_ACCESSES 8 @@ -47,10 +47,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define RANDOMX_SUPERSCALAR_LATENCY 170 //Dataset base size in bytes. Must be a power of 2. -#define RANDOMX_DATASET_BASE_SIZE 2147483648 +#define RANDOMX_DATASET_BASE_SIZE 536870912 // 2^29 = 512 Mi (bytes), tweaked for Arweave //Dataset extra size. Must be divisible by 64. -#define RANDOMX_DATASET_EXTRA_SIZE 33554368 +#define RANDOMX_DATASET_EXTRA_SIZE 31563008 // 493172 (just an arbitrary number) * 64, tweaked for Arweave //Number of instructions in a RandomX program. Must be divisible by 8. #define RANDOMX_PROGRAM_SIZE 256 @@ -82,7 +82,7 @@ Total sum of frequencies must be 256 */ //Integer instructions -#define RANDOMX_FREQ_IADD_RS 16 +#define RANDOMX_FREQ_IADD_RS 25 #define RANDOMX_FREQ_IADD_M 7 #define RANDOMX_FREQ_ISUB_R 16 #define RANDOMX_FREQ_ISUB_M 7 @@ -96,23 +96,23 @@ Total sum of frequencies must be 256 #define RANDOMX_FREQ_INEG_R 2 #define RANDOMX_FREQ_IXOR_R 15 #define RANDOMX_FREQ_IXOR_M 5 -#define RANDOMX_FREQ_IROR_R 8 -#define RANDOMX_FREQ_IROL_R 2 +#define RANDOMX_FREQ_IROR_R 10 +#define RANDOMX_FREQ_IROL_R 0 #define RANDOMX_FREQ_ISWAP_R 4 //Floating point instructions -#define RANDOMX_FREQ_FSWAP_R 4 -#define RANDOMX_FREQ_FADD_R 16 +#define RANDOMX_FREQ_FSWAP_R 8 +#define RANDOMX_FREQ_FADD_R 20 #define RANDOMX_FREQ_FADD_M 5 -#define RANDOMX_FREQ_FSUB_R 16 +#define RANDOMX_FREQ_FSUB_R 20 #define RANDOMX_FREQ_FSUB_M 5 #define RANDOMX_FREQ_FSCAL_R 6 -#define RANDOMX_FREQ_FMUL_R 32 +#define RANDOMX_FREQ_FMUL_R 20 #define RANDOMX_FREQ_FDIV_M 4 #define RANDOMX_FREQ_FSQRT_R 6 //Control instructions -#define RANDOMX_FREQ_CBRANCH 25 +#define RANDOMX_FREQ_CBRANCH 16 #define RANDOMX_FREQ_CFROUND 1 //Store 
instruction @@ -122,4 +122,4 @@ Total sum of frequencies must be 256 #define RANDOMX_FREQ_NOP 0 /* ------ 256 -*/ +*/ \ No newline at end of file diff --git a/src/tests/benchmark.cpp b/src/tests/benchmark.cpp index 148521a5..16c3e9dd 100644 --- a/src/tests/benchmark.cpp +++ b/src/tests/benchmark.cpp @@ -395,7 +395,7 @@ int main(int argc, char** argv) { std::cout << "Calculated result: "; result.print(std::cout); if (noncesCount == 1000 && seedValue == 0 && !commit) - std::cout << "Reference result: 10b649a3f15c7c7f88277812f2e74b337a0f20ce909af09199cccb960771cfa1" << std::endl; + std::cout << "Reference result: 669ae4f2e5e2c0d9cc232ff2c37d41ae113fa302bbf983d9f3342879831b4edf" << std::endl; if (!miningMode) { std::cout << "Performance: " << 1000 * elapsed / noncesCount << " ms per hash" << std::endl; } diff --git a/src/tests/tests.cpp b/src/tests/tests.cpp index 5e1b41a3..cdae0007 100644 --- a/src/tests/tests.cpp +++ b/src/tests/tests.cpp @@ -477,7 +477,7 @@ int main() { decoder.executeInstruction(ibc, pc, nullptr, config); assert(reg.r[registerDst] == 1); }); - + runTest("IXOR_R (decode)", RANDOMX_FREQ_IXOR_R > 0, [&] { randomx::Instruction instr; instr.opcode = randomx::ceil_IXOR_R - 1; @@ -1078,7 +1078,7 @@ int main() { randomx_calculate_hash_last(vm, &hash3); assert(equalsHex(hash1, "639183aae1bf4c9a35884cb46b09cad9175f04efd7684e7262a0ac1c2f0b4e3f")); - assert(equalsHex(hash2, "300a0adb47603dedb42228ccb2b211104f4da45af709cd7547cd049e9489c969")); + assert(equalsHex(hash2, "de506caf4c69cb93f70a6aab078ce450a2a942e8ca79ca4e0d49e899b2bcbe8e")); assert(equalsHex(hash3, "c36d4ed4191e617309867ed66a443be4075014e2b061bcdaf9ce7b721d2b77a8")); }); @@ -1086,7 +1086,7 @@ int main() { rx_set_rounding_mode(RoundToNearest); char hash[RANDOMX_HASH_SIZE]; calcStringHash("test key 000", "Lorem ipsum dolor sit amet", &hash); - assert(equalsHex(hash, "300a0adb47603dedb42228ccb2b211104f4da45af709cd7547cd049e9489c969")); + assert(equalsHex(hash, 
"de506caf4c69cb93f70a6aab078ce450a2a942e8ca79ca4e0d49e899b2bcbe8e")); assert(rx_get_rounding_mode() == RoundToNearest); });