Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

evmone speedups #11

Closed
wants to merge 25 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
1bbf605
added support for computed gotos
zac-williamson Apr 14, 2019
39747f4
removed unused file
zac-williamson Apr 14, 2019
70d2574
removed comments, added exception handling to log and call methods
zac-williamson Apr 14, 2019
a7c3842
made opcode jump tables singletons
zac-williamson Apr 17, 2019
09c2b33
added global memory paging, that reserves memory ahead of txn execution
zac-williamson Apr 19, 2019
bf44b86
changed analysis objects from std::deque to std::vector
zac-williamson Apr 20, 2019
ab92fc5
removed branch to check for basic blocks
zac-williamson Apr 20, 2019
75db85c
CHECK_BLOCK happy path has 1 if statement, from 3
zac-williamson Apr 20, 2019
192d666
removed superfluous files
zac-williamson Apr 20, 2019
ccd0c77
comments and formatting
zac-williamson Apr 20, 2019
65110ff
removed state.max_potential_memory
zac-williamson Apr 20, 2019
c972866
reactivated analysis tests
zac-williamson Apr 21, 2019
a1caac6
more comments and formatting
zac-williamson Apr 22, 2019
9f1b919
typo
zac-williamson Apr 22, 2019
6177bb1
removed comments
zac-williamson Apr 22, 2019
165a93b
all opcode methods only have 1 argument
zac-williamson Apr 22, 2019
5cc423f
reactivated commented out tests
zac-williamson Apr 22, 2019
8264699
updated .gitignore
zac-williamson Apr 26, 2019
a66374d
synced with upstream
zac-williamson Apr 26, 2019
79a9219
evmc enum fix
zac-williamson Apr 26, 2019
e932da6
removed std::max from jump and jumpi opcodes
zac-williamson Apr 26, 2019
507be9a
removed redundant code
zac-williamson Apr 26, 2019
32f6929
added blake2b_huff benchmark
zac-williamson Apr 29, 2019
ea0ec48
updated blake2b_huff contract
zac-williamson May 5, 2019
9689317
Fix compilation warnings
chfast Jul 24, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/build
/cmake-build-*
/.idea
/.vscode/*
3 changes: 3 additions & 0 deletions lib/evmone/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

add_library(evmone
${include_dir}/evmone/evmone.h
constants.hpp
memory.cpp
memory.hpp
analysis.cpp
analysis.hpp
evmone.cpp
Expand Down
326 changes: 226 additions & 100 deletions lib/evmone/analysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,129 +3,255 @@
// Licensed under the Apache License, Version 2.0.

#include "analysis.hpp"
#include "constants.hpp"

#include <evmc/instructions.h>

namespace evmone
{
namespace
{
// Returns true if the opcode unconditionally ends a basic block:
// either a jump (JUMP/JUMPI) or an instruction that halts execution
// (STOP/RETURN/REVERT/SELFDESTRUCT).
bool is_terminator(uint8_t c) noexcept
{
    switch (c)
    {
    case OP_JUMP:
    case OP_JUMPI:
    case OP_STOP:
    case OP_RETURN:
    case OP_REVERT:
    case OP_SELFDESTRUCT:
        return true;
    default:
        return false;
    }
}
} // namespace

int code_analysis::find_jumpdest(int offset) noexcept
void analyze(instruction* instructions, instruction** jumpdest_map, const void** jump_table, evmc_revision rev,
const uint8_t* code, const size_t code_size) noexcept
{
// TODO: Replace with lower_bound().
for (const auto& d : jumpdest_map)
{
if (d.first == offset)
return d.second;
}
return -1;
}
auto* instr_table = evmc_get_instruction_metrics_table(rev);

// Maps a call-family opcode to the matching evmc_call_kind enumerator.
// Any opcode outside the call family yields the sentinel evmc_call_kind(-1).
evmc_call_kind op2call_kind(uint8_t opcode) noexcept
{
    if (opcode == OP_CREATE)
        return EVMC_CREATE;
    if (opcode == OP_CALL)
        return EVMC_CALL;
    if (opcode == OP_CALLCODE)
        return EVMC_CALLCODE;
    if (opcode == OP_DELEGATECALL)
        return EVMC_DELEGATECALL;
    if (opcode == OP_CREATE2)
        return EVMC_CREATE2;
    return evmc_call_kind(-1);
}
// temp variable to track the stack difference within the current basic block
int stack_diff = 0;

code_analysis analyze(
const exec_fn_table& fns, evmc_revision rev, const uint8_t* code, size_t code_size) noexcept
{
code_analysis analysis;
analysis.instrs.reserve(code_size + 1);
// when we map program opcode -> jump label, we apply an offset if
// the opcode is NOT an entry point into a basic block.
// i.e. first 256 entries in jump table = jump destinations that will perform
// validation logic on the given basic block.
// the subsequent 256 entries in jump table = jump destinations that skip this
// (this removes a conditional branch that is normally required for every opcode)
// (even better, this was a branch that was non trivial to predict)
// The variable 'delta' represents this offset.
// The first entry will always be entry into a basic block, so initialize to 0
int delta = 0;

auto* instr_table = evmc_get_instruction_metrics_table(rev);
// temporary variable to cache whether the NEXT entry will be a basic block. Default to 'no'
int next_delta = JUMP_TABLE_CHECK_BOUNDARY;

block_info* block = nullptr;
int instr_index = 0;
// pointer to the current basic block we're working on
block_info* block = &instructions[0].block_data;
// initialize our new block (we don't initialize instructions to default values to save some time)
block->gas_cost = 0;
block->stack_req = 0;
block->stack_max = 0;

// instr_index indexes entries to instructions.
// i is the program counter index.
// we don't use i to index instructions, because some program opcodes (i.e. PUSH) use more
// than 1 byte of bytecode data. Therefore, using i would create a sparse array, where there
// would be groups of 'empty' entries in 'instructions'. Which would (probably?) increase the number of cache misses
size_t instr_index = 0;
for (size_t i = 0; i < code_size; ++i, ++instr_index)
{
// TODO: Loop in reverse order for easier GAS analysis.
const auto c = code[i];
auto& instr = analysis.instrs.emplace_back(fns[c]);

const bool jumpdest = c == OP_JUMPDEST;
if (!block || jumpdest)
{
// Create new block.
block = &analysis.blocks.emplace_back();
instr.block_index = static_cast<int>(analysis.blocks.size() - 1);

if (jumpdest)
analysis.jumpdest_map.emplace_back(static_cast<int>(i), instr_index);
}
// get the current opcode
uint8_t c = code[i];

// get the metrics for the current
auto metrics = instr_table[c];

// update the current block's stack and gas metrics
block->gas_cost += metrics.gas_cost;
auto stack_req = metrics.num_stack_arguments - block->stack_diff;
block->stack_diff += (metrics.num_stack_returned_items - metrics.num_stack_arguments);
auto stack_req = metrics.num_stack_arguments - stack_diff;
stack_diff += (metrics.num_stack_returned_items - metrics.num_stack_arguments);
block->stack_req = std::max(block->stack_req, stack_req);
block->stack_max = std::max(block->stack_max, block->stack_diff);
block->stack_max = std::max(block->stack_max, stack_diff);

// Skip PUSH data.
if (c >= OP_PUSH1 && c <= OP_PUSH32)
// Maybe the compiler can do something clever with this, if we frame it as a giant switch statement...
switch (c)
{
// OPT: bswap data here.
++i;
auto push_size = size_t(c - OP_PUSH1 + 1);
analysis.args_storage.emplace_back();
auto& data = analysis.args_storage.back();

auto leading_zeros = 32 - push_size;
for (auto& b : data)
b = 0;
for (size_t j = 0; j < push_size && (i + j) < code_size; ++j)
data[leading_zeros + j] = code[i + j];
instr.arg.data = &data[0];
i += push_size - 1;
case OP_GAS:
case OP_CREATE:
case OP_CALL:
case OP_CALLCODE:
case OP_DELEGATECALL:
case OP_CREATE2:
case OP_STATICCALL:
{
instructions[instr_index].instruction_data.number = block->gas_cost;
next_delta = JUMP_TABLE_CHECK_BOUNDARY;
break;
}
else if (c >= OP_DUP1 && c <= OP_DUP16)
instr.arg.number = c - OP_DUP1;
else if (c >= OP_SWAP1 && c <= OP_SWAP16)
instr.arg.number = c - OP_SWAP1 + 1;
else if (c == OP_GAS || c == OP_DELEGATECALL || c == OP_CALL || c == OP_CALLCODE ||
c == OP_STATICCALL || c == OP_CREATE || c == OP_CREATE2)
case OP_PUSH1:
case OP_PUSH2:
case OP_PUSH3:
case OP_PUSH4:
case OP_PUSH5:
case OP_PUSH6:
case OP_PUSH7:
case OP_PUSH8:
case OP_PUSH9:
case OP_PUSH10:
case OP_PUSH11:
case OP_PUSH12:
case OP_PUSH13:
case OP_PUSH14:
case OP_PUSH15:
case OP_PUSH16:
case OP_PUSH17:
case OP_PUSH18:
case OP_PUSH19:
case OP_PUSH20:
case OP_PUSH21:
case OP_PUSH22:
case OP_PUSH23:
case OP_PUSH24:
case OP_PUSH25:
case OP_PUSH26:
case OP_PUSH27:
case OP_PUSH28:
case OP_PUSH29:
case OP_PUSH30:
case OP_PUSH31:
case OP_PUSH32:
{
instr.arg.number = static_cast<int>(block->gas_cost);
// TODO: Does not make sense for OP_GAS.
instr.arg.call_kind = op2call_kind(c == OP_STATICCALL ? uint8_t{OP_CALL} : c);
size_t push_size = static_cast<size_t>(c - OP_PUSH1 + 1);
size_t leading_zeroes = static_cast<size_t>(32 - push_size);
size_t copy_size = std::min(push_size, code_size - i - 1);
uint64_t swap_buffer[4] = { 0, 0, 0, 0 };
memcpy((uint8_t*)(&swap_buffer) + leading_zeroes, code + i + 1, copy_size);
uint64_t* push_data = (uint64_t*)(instructions[instr_index].instruction_data.push_data.begin());
push_data[3] = __builtin_bswap64(swap_buffer[0]);
push_data[2] = __builtin_bswap64(swap_buffer[1]);
push_data[1] = __builtin_bswap64(swap_buffer[2]);
push_data[0] = __builtin_bswap64(swap_buffer[3]);
i += push_size;
next_delta = JUMP_TABLE_CHECK_BOUNDARY;
break;
}
else if (c == OP_PC)
instr.arg.number = static_cast<int>(i);
else if (c == OP_EXP)
instr.arg.number = rev >= EVMC_SPURIOUS_DRAGON ? 50 : 10;
else if (c == OP_SSTORE)
instr.arg.number = rev;
else if (c >= OP_LOG0 && c <= OP_LOG4)
instr.arg.number = c - OP_LOG0;
else if (is_terminator(c))
block = nullptr;
}
/**
* TODO: figure out which is faster
* Option 1: have singleton 'dup' and 'swap' opcodes, and use instruction_data to
* identify the stack indices to dup/swap
* Option 2: have explicit 'dup' and 'swap' opcodes for each variant (e.g. dup1, ..., dup16)
*
* Option 1 requires an additional lookup into instruction_data per opcode execution,
* Option 2 requires fetching more code, so there's a reduced chance that the required code
* is in the CPU cache (I think?).
* In addition, the CPU has more branches to predict when jumping to each opcode,
* but the rationale is that the program flow is simple enough
* for the CPU to predict ~100% of the time
* (N.B. how in the blazes can this be tested?)
* I honestly have no idea which is faster, both benchmarks overlap each other.
**/
/*
case OP_DUP1:
case OP_DUP2:
case OP_DUP3:
case OP_DUP4:
case OP_DUP5:
case OP_DUP6:
case OP_DUP7:
case OP_DUP8:
case OP_DUP9:
case OP_DUP10:
case OP_DUP11:
case OP_DUP12:
case OP_DUP13:
case OP_DUP14:
case OP_DUP15:
case OP_DUP16:
{
instr.instruction_data.number = c - OP_DUP1;
break;
}
case OP_SWAP1:
case OP_SWAP2:
case OP_SWAP3:
case OP_SWAP4:
case OP_SWAP5:
case OP_SWAP6:
case OP_SWAP7:
case OP_SWAP8:
case OP_SWAP9:
case OP_SWAP10:
case OP_SWAP11:
case OP_SWAP12:
case OP_SWAP13:
case OP_SWAP14:
case OP_SWAP15:
case OP_SWAP16:
{
instr.instruction_data.number = c - OP_SWAP1 + 1;
break;
}
*/
case OP_PC:
{
instructions[instr_index].instruction_data.number = static_cast<int64_t>(i);
next_delta = JUMP_TABLE_CHECK_BOUNDARY;
break;
}
case OP_STOP:
case OP_JUMP:
case OP_JUMPI:
case OP_RETURN:
case OP_REVERT:
case OP_SELFDESTRUCT:
{
block = &instructions[instr_index + 1].block_data;
block->gas_cost = 0;
block->stack_max = 0;
block->stack_req = 0;
stack_diff = 0;
next_delta = 0;
break;
}
case OP_JUMPDEST:
{
/**
* If this is a jump destination, we want to log it inside jumpdest_map.
* This gives us an O(1) mapping from program counter -> relevant instruction.
* This comes at the expense of using a sparse array,
* so we use a lot of memory for this map (~200kb for a 24kb program),
* and entries are less likely to be cached.
* N.B. we actually map the program counter to the instruction that PRECEDES
* the actual instruction we want to jump to. This is because our
* DISPATCH macro will increase state.next_instruction before jumping
* We could write a special case for jump opcodes (so DISPATCH doesn't increase the ptr),
* but I figured that if the access pattern into state.next_instruction was uniform,
* the CPU would have an easier time of predicting the branch we're jumping to.
* This is, however, 100% superstition, I have no idea how to measure pipeline stalls during execution
**/
jumpdest_map[i] = &instructions[instr_index - 1];

// Not terminated block.
if (block || (code_size > 0 && code[code_size - 1] == OP_JUMPI))
analysis.instrs.emplace_back(fns[OP_STOP]);
// we added this opcodes gas cost into the current basic block, undo that
// TODO: cache this? Current code is a bit of a kludge to remove a conditional branch
block->gas_cost -= metrics.gas_cost;

return analysis;
}
// and point to a new basic block
block = &instructions[instr_index].block_data;

// update the new basic block's gas cost with the cost of OP_JUMPDEST
block->gas_cost = metrics.gas_cost;
block->stack_max = 0;
block->stack_req = 0;
stack_diff = 0;
delta = 0;
next_delta = JUMP_TABLE_CHECK_BOUNDARY;
break;
}
default:
{
next_delta = JUMP_TABLE_CHECK_BOUNDARY;
break;
}
}

instructions[instr_index].opcode_dest = jump_table[c + delta];
delta = next_delta;
next_delta = 0;
}

// We want to add an OP_STOP opcode to the end of our program, so that we always terminate
instructions[instr_index].opcode_dest = jump_table[0];

// For good measure, put an OP_STOP opcode in the two penultimate entries.
// We will set state.next_instruction to code_size when we enter an error state
instructions[code_size].opcode_dest = jump_table[0];
instructions[code_size + 1].opcode_dest = jump_table[0];
}
} // namespace evmone
Loading