This repository has been archived by the owner on Nov 6, 2020. It is now read-only.

Ethstore optimizations #6827

Merged 1 commit on Oct 20, 2017
2 changes: 1 addition & 1 deletion ethcore/src/account_provider/mod.rs
@@ -269,7 +269,7 @@ impl AccountProvider {

/// Checks whether an account with a given address is present.
pub fn has_account(&self, address: Address) -> Result<bool, Error> {
- Ok(self.accounts()?.iter().any(|&a| a == address))
+ Ok(self.sstore.account_ref(&address).is_ok() && !self.blacklisted_accounts.contains(&address))
}

/// Returns addresses of all accounts.
8 changes: 2 additions & 6 deletions ethcore/src/miner/miner.rs
@@ -647,10 +647,6 @@ impl Miner {
condition: Option<TransactionCondition>,
transaction_queue: &mut BanningTransactionQueue,
) -> Vec<Result<TransactionImportResult, Error>> {
- let accounts = self.accounts.as_ref()
- .and_then(|provider| provider.accounts().ok())
- .map(|accounts| accounts.into_iter().collect::<HashSet<_>>());

let best_block_header = client.best_block_header().decode();
let insertion_time = client.chain_info().best_block_number;

@@ -669,8 +665,8 @@
Err(e)
},
Ok(transaction) => {
- let origin = accounts.as_ref().and_then(|accounts| {
- match accounts.contains(&transaction.sender()) {
+ let origin = self.accounts.as_ref().and_then(|accounts| {
+ match accounts.has_account(transaction.sender()).unwrap_or(false) {
true => Some(TransactionOrigin::Local),
false => None,
}
9 changes: 8 additions & 1 deletion ethstore/src/dir/disk.rs
@@ -122,6 +122,13 @@ impl<T> DiskDirectory<T> where T: KeyFileManager {
Ok(hasher.finish())
}

+ fn last_modification_date(&self) -> Result<u64, Error> {
+ use std::time::{Duration, UNIX_EPOCH};
+ let duration = fs::metadata(&self.path)?.modified()?.duration_since(UNIX_EPOCH).unwrap_or(Duration::default());
+ let timestamp = duration.as_secs() ^ (duration.subsec_nanos() as u64);
+ Ok(timestamp)
+ }

/// all accounts found in keys directory
fn files_content(&self) -> Result<HashMap<PathBuf, SafeAccount>, Error> {
// it's not done using one iterator cause
@@ -226,7 +233,7 @@ impl<T> KeyDirectory for DiskDirectory<T> where T: KeyFileManager {
}

fn unique_repr(&self) -> Result<u64, Error> {
- self.files_hash()
+ self.last_modification_date()
}
}

39 changes: 27 additions & 12 deletions ethstore/src/ethstore.rs
@@ -18,6 +18,7 @@ use std::collections::{BTreeMap, HashMap};
use std::mem;
use std::path::PathBuf;
use parking_lot::{Mutex, RwLock};
+ use std::time::{Instant, Duration};

use crypto::KEY_ITERATIONS;
use random::Random;
@@ -28,6 +29,8 @@ use presale::PresaleWallet;
use json::{self, Uuid, OpaqueKeyFile};
use {import, Error, SimpleSecretStore, SecretStore, SecretVaultRef, StoreAccountRef, Derivation, OpaqueSecret};

+ const REFRESH_TIME_SEC: u64 = 5;

/// Accounts store.
pub struct EthStore {
store: EthMultiStore,
@@ -245,7 +248,12 @@ pub struct EthMultiStore {
// order lock: cache, then vaults
cache: RwLock<BTreeMap<StoreAccountRef, Vec<SafeAccount>>>,
vaults: Mutex<HashMap<String, Box<VaultKeyDirectory>>>,
- dir_hash: Mutex<Option<u64>>,
+ timestamp: Mutex<Timestamp>,
}

+ struct Timestamp {
+ dir_hash: Option<u64>,
+ last_checked: Instant,
+ }

impl EthMultiStore {
@@ -261,20 +269,27 @@ impl EthMultiStore {
vaults: Mutex::new(HashMap::new()),
iterations: iterations,
cache: Default::default(),
- dir_hash: Default::default(),
+ timestamp: Mutex::new(Timestamp {
+ dir_hash: None,
+ last_checked: Instant::now(),
+ }),
};
store.reload_accounts()?;
Ok(store)
}

fn reload_if_changed(&self) -> Result<(), Error> {
- let mut last_dir_hash = self.dir_hash.lock();
- let dir_hash = Some(self.dir.unique_repr()?);
- if *last_dir_hash == dir_hash {
- return Ok(())
+ let mut last_timestamp = self.timestamp.lock();
+ let now = Instant::now();
+ if (now - last_timestamp.last_checked) > Duration::from_secs(REFRESH_TIME_SEC) {
Collaborator:
I would rather store last_timestamp.valid_till to avoid subtraction (and possible overflow in case the time is adjusted by the system):

if last_timestamp.valid_till < now {
}

Collaborator (Author):
Instant is monotonic. It is not affected by clock changes and is guaranteed never to decrease.

Collaborator:
Fair enough.
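As an aside, here is a minimal sketch of the two throttling variants discussed in this thread. It is not code from the PR: the helper names needs_refresh_elapsed and needs_refresh_deadline are made up, and valid_till is just the name from the suggestion above. The merged code keeps the time of the last check and relies on Instant being monotonic, so the subtraction can never go backwards even if the wall clock is adjusted; the suggested alternative stores a deadline and compares it directly.

```rust
use std::time::{Duration, Instant};

const REFRESH_TIME_SEC: u64 = 5;

// Variant used in the PR: keep the time of the last check and compare the
// elapsed interval. Instant is monotonic, so `now - last_checked` is safe.
fn needs_refresh_elapsed(last_checked: Instant) -> bool {
    Instant::now() - last_checked > Duration::from_secs(REFRESH_TIME_SEC)
}

// Variant suggested in review: keep a deadline and compare it directly,
// avoiding the subtraction altogether.
fn needs_refresh_deadline(valid_till: Instant) -> bool {
    valid_till < Instant::now()
}

fn main() {
    let last_checked = Instant::now();
    let valid_till = last_checked + Duration::from_secs(REFRESH_TIME_SEC);
    // Immediately after a check, neither variant asks for a refresh.
    assert!(!needs_refresh_elapsed(last_checked));
    assert!(!needs_refresh_deadline(valid_till));
}
```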

+ let dir_hash = Some(self.dir.unique_repr()?);

Comment:
This appears to be creating a disk IO read problem, because every 5 seconds you end up looping and hashing all files in the keystore. If this is a large number of files (in my case 150,000), it ends up causing a total server crash.

Is there a better way to detect changes in the keystore dir? Possibly only detect a change when a new key is added via personal_newAccount... calls. Or a simple file count?

Collaborator:
unique_repr is changed to return the directory's last modification date in this PR, so there is no more hashing here.

Contributor:
@AdvancedStyle are you saying your server crashes with the changes from this PR?

@AdvancedStyle (Oct 20, 2017):
Yes, doing more testing and will provide more information shortly.

Basically, once I compile/run latest master with this PR it fixes the peer connections, but then crashes the server (with a noticeably huge amount of disk reading).

But I'll do a bit more testing and see if I can gather some actual error messages from the server crash.

@AdvancedStyle (Oct 20, 2017):
OK, I've been using iostat / iotop / tracefile (https://gitlab.com/ole.tange/tangetools/tree/master/tracefile) to monitor disk IO, and here are the results:

With latest master the iostat util is quite low (less than 5%), however tracefile shows that parity is constantly accessing the keystore files. It is not causing high disk util and not causing a server crash, only causing peers to drop/timeout and syncing to stop.

After PR #6827 the iostat util goes through the roof (50-100% disk util all the time):

iostat output sample:

avg-cpu: %user %nice %system %iowait %steal %idle
23.14 0.00 20.48 13.56 0.27 42.55

Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
vda 0.00 0.00 5339.00 0.00 49176.00 0.00 18.42 0.75 0.14 0.14 0.00 0.09 49.60

avg-cpu: %user %nice %system %iowait %steal %idle
20.21 0.00 17.32 14.70 0.26 47.51

Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
vda 0.00 17.00 5192.00 3.00 46776.00 80.00 18.04 0.74 0.14 0.14 2.67 0.11 55.60

avg-cpu: %user %nice %system %iowait %steal %idle
25.07 0.00 15.04 10.55 0.53 48.81

Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
vda 0.00 28.00 4083.00 2.00 38940.00 120.00 19.12 0.60 0.15 0.15 2.00 0.10 42.40

avg-cpu: %user %nice %system %iowait %steal %idle
22.13 0.00 18.67 15.73 0.27 43.20

Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
vda 0.00 10.00 3744.00 4.00 32000.00 56.00 17.11 0.83 0.22 0.21 1.00 0.17 64.00

avg-cpu: %user %nice %system %iowait %steal %idle
24.42 0.00 15.32 32.21 0.26 27.79

Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
vda 0.00 13.00 2987.00 4.00 28780.00 68.00 19.29 2.07 0.70 0.70 1.00 0.27 80.80

But the strange thing is that tracefile (an strace script) is not really seeing a lot of file access on the process (parity is no longer accessing the keystore files individually); it only appears to be accessing the keystore dir, so why all the disk IO?

Sample of tracefile output:

/home/ethereum/.local/share/io.parity.ethereum/chains/ethereum/db/906a34e69aec8c0d/overlayrecent/db/484066.sst
/home/ethereum/.local/share/io.parity.ethereum/chains/ethereum/db/906a34e69aec8c0d/overlayrecent/db/484067.log
/home/ethereum/.local/share/io.parity.ethereum/chains/ethereum/db/906a34e69aec8c0d/overlayrecent/db/484068.sst
/home/ethereum/.local/share/io.parity.ethereum/chains/ethereum/db/906a34e69aec8c0d/overlayrecent/db/484066.sst
/home/ethereum/.local/share/io.parity.ethereum/chains/ethereum/db/906a34e69aec8c0d/overlayrecent/db/484068.sst
/home/ethereum/.local/share/io.parity.ethereum/chains/ethereum/db/906a34e69aec8c0d/overlayrecent/db/484069.sst
/home/ethereum/.local/share/io.parity.ethereum/chains/ethereum/db/906a34e69aec8c0d/overlayrecent/db/484069.sst
/home/ethereum/.local/share/io.parity.ethereum/chains/ethereum/db/906a34e69aec8c0d/overlayrecent/db/484066.sst
/home/ethereum/.local/share/io.parity.ethereum/chains/ethereum/db/906a34e69aec8c0d/overlayrecent/db/484064.sst
/home/ethereum/.ethereum/keystore/ethereum
/home/ethereum/.ethereum/keystore/ethereum
/home/ethereum/.ethereum/keystore/ethereum
/home/ethereum/.ethereum/keystore/ethereum
/home/ethereum/.local/share/io.parity.ethereum/chains/ethereum/db/906a34e69aec8c0d/overlayrecent/db/484070.log
/sys/devices/system/cpu/online
/home/ethereum/.local/share/io.parity.ethereum/chains/ethereum/db/906a34e69aec8c0d/overlayrecent/db/484071.sst
/home/ethereum/.local/share/io.parity.ethereum/chains/ethereum/db/906a34e69aec8c0d/overlayrecent/db/484071.sst
/home/ethereum/.local/share/io.parity.ethereum/chains/ethereum/db/906a34e69aec8c0d/overlayrecent/db/484072.sst
/home/ethereum/.local/share/io.parity.ethereum/chains/ethereum/db/906a34e69aec8c0d/overlayrecent/db/484072.sst
/home/ethereum/.local/share/io.parity.ethereum/chains/ethereum/db/906a34e69aec8c0d/overlayrecent/db/484071.sst
/home/ethereum/.local/share/io.parity.ethereum/chains/ethereum/db/906a34e69aec8c0d/overlayrecent/db/484068.sst
/home/ethereum/.ethereum/keystore/ethereum
/home/ethereum/.ethereum/keystore/ethereum

The VM then hangs / crashes after a few minutes with hung_task_timeout_secs errors:

INFO: task kworker/[...] blocked for more than 120 seconds.
 Not tainted [...]
 kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.

And a bunch of similar hung tasks.

Conclusion:
For some reason the modified-time check on the keystore dir might be creating a lot of IO reads?

@AdvancedStyle (Oct 20, 2017):
I mean, at first I was testing with master (no IO read problem / no crash).

Then I compiled with this PR #6827 (git fetch origin pull/6827/head:sometest && git checkout sometest, and compiled).

The IO read problem only occurred after compiling with PR #6827.

Collaborator (Author):
The IO must be just the database IO. Does the machine have an HDD or SSD? How exactly does it crash? Is there a backtrace printed?

@AdvancedStyle (Oct 20, 2017):
SSD. The server is running on a KVM virtual machine (Debian host, Ubuntu 16 guest).

The entire virtual machine just locks up; SSH is non-responsive and the virtmanager console is non-responsive, then it dumps a series of error messages in the virtmanager console similar to:

INFO: task kworker/[...] blocked for more than 120 seconds.
Not tainted [...]
kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.

Note, I'm running parity with the params:

screen -d -m parity --cache-size 4096 --allow-ips public --jsonrpc-apis web3,eth,net,parity,traces,rpc,personal --geth --keys-path /home/ethereum/.ethereum/keystore

I've decided to clone the VM and try to delete/resync the chain from 0 to see if that helps, and so far it seems to be doing OK (no IO read problems so far). Will post back shortly.

I have resynced the chain from 0 using PR #6827 and now disk IO is normal again and all is working.

So everything is fine now (fingers crossed).

The issue of slow key generation (issue #6839) is also fixed.

+ last_timestamp.last_checked = now;
+ if last_timestamp.dir_hash == dir_hash {
+ return Ok(())
+ }
+ self.reload_accounts()?;
+ last_timestamp.dir_hash = dir_hash;
+ }
- self.reload_accounts()?;
- *last_dir_hash = dir_hash;
Ok(())
}

@@ -455,11 +470,11 @@ impl SimpleSecretStore for EthMultiStore {
}

fn account_ref(&self, address: &Address) -> Result<StoreAccountRef, Error> {
+ use std::collections::Bound;
self.reload_if_changed()?;
- self.cache.read().keys()
- .find(|r| &r.address == address)
- .cloned()
- .ok_or(Error::InvalidAccount)
+ let cache = self.cache.read();
+ let mut r = cache.range((Bound::Included(*address), Bound::Included(*address)));
+ r.next().ok_or(Error::InvalidAccount).map(|(k, _)| k.clone())
}

fn accounts(&self) -> Result<Vec<StoreAccountRef>, Error> {
19 changes: 16 additions & 3 deletions ethstore/src/secret_store.rs
@@ -16,6 +16,7 @@

use std::hash::{Hash, Hasher};
use std::path::PathBuf;
+ use std::cmp::Ordering;
use ethkey::{Address, Message, Signature, Secret, Public};
use Error;
use json::{Uuid, OpaqueKeyFile};
@@ -32,12 +33,24 @@ pub enum SecretVaultRef {
}

/// Stored account reference
- #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
+ #[derive(Debug, Clone, PartialEq, Eq, Ord)]
pub struct StoreAccountRef {
- /// Vault reference
- pub vault: SecretVaultRef,
/// Account address
pub address: Address,
+ /// Vault reference
+ pub vault: SecretVaultRef,
}

+ impl PartialOrd for StoreAccountRef {
+ fn partial_cmp(&self, other: &StoreAccountRef) -> Option<Ordering> {
+ Some(self.address.cmp(&other.address).then_with(|| self.vault.cmp(&other.vault)))
+ }
+ }

+ impl ::std::borrow::Borrow<Address> for StoreAccountRef {
+ fn borrow(&self) -> &Address {
+ &self.address
+ }
+ }

/// Simple Secret Store API
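Taken together, the account_ref change in ethstore.rs and the StoreAccountRef changes above (address-first field order and ordering, plus the Borrow<Address> impl) are what allow the account cache to be queried by address with BTreeMap::range instead of a linear scan. Below is a rough, self-contained sketch of the pattern with simplified stand-in types (Address as a plain byte array, AccountRef / VaultRef instead of the real ethstore types); it illustrates the technique, not the crate's actual code.

```rust
use std::borrow::Borrow;
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::ops::Bound;

// Simplified stand-ins for the real Address / SecretVaultRef types.
type Address = [u8; 20];

#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
enum VaultRef {
    Root,
    Vault(String),
}

#[derive(Clone, PartialEq, Eq)]
struct AccountRef {
    address: Address,
    vault: VaultRef,
}

// Order by address first so all refs for one address sit next to each other.
impl Ord for AccountRef {
    fn cmp(&self, other: &Self) -> Ordering {
        self.address
            .cmp(&other.address)
            .then_with(|| self.vault.cmp(&other.vault))
    }
}

impl PartialOrd for AccountRef {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

// Borrowing to Address lets BTreeMap::range accept Address bounds directly.
impl Borrow<Address> for AccountRef {
    fn borrow(&self) -> &Address {
        &self.address
    }
}

fn main() {
    let mut cache: BTreeMap<AccountRef, &str> = BTreeMap::new();
    let addr = [1u8; 20];
    cache.insert(
        AccountRef { address: addr, vault: VaultRef::Root },
        "root account",
    );
    cache.insert(
        AccountRef { address: [2u8; 20], vault: VaultRef::Vault("v".into()) },
        "vault account",
    );

    // O(log n) lookup by address, without scanning every key in the cache.
    let found = cache
        .range((Bound::Included(addr), Bound::Included(addr)))
        .next()
        .map(|(k, _)| k.clone());
    assert!(found.is_some());
}
```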