-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
replay: reload tower if set-identity during startup #35173
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -578,6 +578,21 @@ impl ReplayStage { | |
let _exit = Finalizer::new(exit.clone()); | ||
let mut identity_keypair = cluster_info.keypair().clone(); | ||
let mut my_pubkey = identity_keypair.pubkey(); | ||
if my_pubkey != tower.node_pubkey { | ||
// set-identity was called during the startup procedure, ensure the tower is consistent | ||
// before starting the loop. further calls to set-identity will reload the tower in the loop | ||
let my_old_pubkey = tower.node_pubkey; | ||
tower = Self::load_tower( | ||
tower_storage.as_ref(), | ||
&my_pubkey, | ||
&vote_account, | ||
&bank_forks, | ||
); | ||
warn!( | ||
"Identity changed during startup from {} to {}", | ||
my_old_pubkey, my_pubkey | ||
); | ||
} | ||
let (mut progress, mut heaviest_subtree_fork_choice) = | ||
Self::initialize_progress_and_fork_choice_with_locked_bank_forks( | ||
&bank_forks, | ||
|
@@ -983,28 +998,12 @@ impl ReplayStage { | |
my_pubkey = identity_keypair.pubkey(); | ||
|
||
// Load the new identity's tower | ||
tower = Tower::restore(tower_storage.as_ref(), &my_pubkey) | ||
.and_then(|restored_tower| { | ||
let root_bank = bank_forks.read().unwrap().root_bank(); | ||
let slot_history = root_bank.get_slot_history(); | ||
restored_tower.adjust_lockouts_after_replay( | ||
root_bank.slot(), | ||
&slot_history, | ||
) | ||
}) | ||
.unwrap_or_else(|err| { | ||
if err.is_file_missing() { | ||
Tower::new_from_bankforks( | ||
&bank_forks.read().unwrap(), | ||
&my_pubkey, | ||
&vote_account, | ||
) | ||
} else { | ||
error!("Failed to load tower for {}: {}", my_pubkey, err); | ||
std::process::exit(1); | ||
} | ||
}); | ||
|
||
tower = Self::load_tower( | ||
tower_storage.as_ref(), | ||
&my_pubkey, | ||
&vote_account, | ||
&bank_forks, | ||
); | ||
// Ensure the validator can land votes with the new identity before | ||
// becoming leader | ||
has_new_vote_been_rooted = !wait_for_vote_to_start_leader; | ||
|
@@ -1154,6 +1153,32 @@ impl ReplayStage { | |
}) | ||
} | ||
|
||
/// Loads the tower for `node_pubkey` from `tower_storage`. | ||
/// | ||
/// A successfully restored tower has its lockouts adjusted against the root bank's | ||
/// slot and slot history, both read from `bank_forks`. | ||
/// | ||
/// If an error is reported and `err.is_file_missing()` is true, a new tower is built | ||
/// from `bank_forks` for `node_pubkey` and `vote_account` instead. | ||
/// | ||
/// Any other error is logged and the process exits with status 1. Errors from the | ||
/// restore step and from the lockout adjustment both reach the same handler, because | ||
/// they are chained through `and_then` before `unwrap_or_else`. | ||
fn load_tower( | ||
tower_storage: &dyn TowerStorage, | ||
node_pubkey: &Pubkey, | ||
vote_account: &Pubkey, | ||
bank_forks: &Arc<RwLock<BankForks>>, | ||
) -> Tower { | ||
Tower::restore(tower_storage, node_pubkey) | ||
.and_then(|restored_tower| { | ||
let root_bank = bank_forks.read().unwrap().root_bank(); | ||
let slot_history = root_bank.get_slot_history(); | ||
restored_tower.adjust_lockouts_after_replay(root_bank.slot(), &slot_history) | ||
}) | ||
.unwrap_or_else(|err| { | ||
if err.is_file_missing() { | ||
Tower::new_from_bankforks( | ||
&bank_forks.read().unwrap(), | ||
node_pubkey, | ||
vote_account, | ||
) | ||
} else { | ||
error!("Failed to load tower for {}: {}", node_pubkey, err); | ||
std::process::exit(1); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why exit instead of panic! ? Would returning an error be better? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it's just rust style guidelines: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. however i haven't really had to debug these scenarios before, if you think the unwind from panic is helpful we can go with that. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the difference is small, but panic! is probably better style because we use it more: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sounds good, i'll change to panic in a separate PR, don't want to backport changes to existing behavior. |
||
} | ||
}) | ||
} | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a test maybe? |
||
fn check_for_vote_only_mode( | ||
heaviest_bank_slot: Slot, | ||
forks_root: Slot, | ||
|
@@ -4230,9 +4255,9 @@ pub(crate) mod tests { | |
crate::{ | ||
consensus::{ | ||
progress_map::{ValidatorStakeInfo, RETRANSMIT_BASE_DELAY_MS}, | ||
tower_storage::NullTowerStorage, | ||
tower_storage::{FileTowerStorage, NullTowerStorage}, | ||
tree_diff::TreeDiff, | ||
Tower, | ||
Tower, VOTE_THRESHOLD_DEPTH, | ||
}, | ||
replay_stage::ReplayStage, | ||
vote_simulator::{self, VoteSimulator}, | ||
|
@@ -4254,7 +4279,7 @@ pub(crate) mod tests { | |
}, | ||
solana_runtime::{ | ||
accounts_background_service::AbsRequestSender, | ||
commitment::BlockCommitment, | ||
commitment::{BlockCommitment, VOTE_THRESHOLD_SIZE}, | ||
genesis_utils::{GenesisConfigInfo, ValidatorVoteKeypairs}, | ||
}, | ||
solana_sdk::{ | ||
|
@@ -4278,6 +4303,7 @@ pub(crate) mod tests { | |
iter, | ||
sync::{atomic::AtomicU64, Arc, RwLock}, | ||
}, | ||
tempfile::tempdir, | ||
trees::{tr, Tree}, | ||
}; | ||
|
||
|
@@ -8598,4 +8624,54 @@ pub(crate) mod tests { | |
assert_eq!(reset_fork, Some(4)); | ||
assert_eq!(failures, vec![HeaviestForkFailures::LockedOut(4),]); | ||
} | ||
|
||
/// Checks `ReplayStage::load_tower` when no tower has been saved for the node. | ||
/// | ||
/// Nothing is written to the freshly created storage directory before loading. | ||
/// The returned tower must have the same vote state as | ||
/// `Tower::new_for_tests(VOTE_THRESHOLD_DEPTH, VOTE_THRESHOLD_SIZE)` and must carry | ||
/// the requested `node_pubkey`. | ||
#[test] | ||
fn test_tower_load_missing() { | ||
// NOTE(review): `into_path()` presumably persists the temp directory, so it would | ||
// not be cleaned up when the test ends; confirm whether that is intended. | ||
let tower_file = tempdir().unwrap().into_path(); | ||
let tower_storage = FileTowerStorage::new(tower_file); | ||
let node_pubkey = Pubkey::new_unique(); | ||
let vote_account = Pubkey::new_unique(); | ||
let tree = tr(0) / (tr(1) / (tr(3) / (tr(4))) / (tr(2) / (tr(5) / (tr(6))))); | ||
let generate_votes = |pubkeys: Vec<Pubkey>| { | ||
pubkeys | ||
.into_iter() | ||
.zip(iter::once(vec![0, 1, 2, 5, 6]).chain(iter::repeat(vec![0, 1, 3, 4]).take(2))) | ||
.collect() | ||
}; | ||
let (vote_simulator, _blockstore) = | ||
setup_forks_from_tree(tree, 3, Some(Box::new(generate_votes))); | ||
let bank_forks = vote_simulator.bank_forks; | ||
|
||
let tower = | ||
ReplayStage::load_tower(&tower_storage, &node_pubkey, &vote_account, &bank_forks); | ||
let expected_tower = Tower::new_for_tests(VOTE_THRESHOLD_DEPTH, VOTE_THRESHOLD_SIZE); | ||
assert_eq!(tower.vote_state, expected_tower.vote_state); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, do we test that the new tower has new identity somewhere? |
||
assert_eq!(tower.node_pubkey, node_pubkey); | ||
} | ||
|
||
/// Checks `ReplayStage::load_tower` when a tower was previously saved for the node. | ||
/// | ||
/// A `Tower::new_random(node_pubkey)` is saved to file-backed storage with the node's | ||
/// keypair. The tower returned by `load_tower` must have the same vote state and the | ||
/// same node pubkey as the saved one. | ||
#[test] | ||
fn test_tower_load() { | ||
// NOTE(review): `into_path()` presumably persists the temp directory, so it would | ||
// not be cleaned up when the test ends; confirm whether that is intended. | ||
let tower_file = tempdir().unwrap().into_path(); | ||
let tower_storage = FileTowerStorage::new(tower_file); | ||
let node_keypair = Keypair::new(); | ||
let node_pubkey = node_keypair.pubkey(); | ||
let vote_account = Pubkey::new_unique(); | ||
let tree = tr(0) / (tr(1) / (tr(3) / (tr(4))) / (tr(2) / (tr(5) / (tr(6))))); | ||
let generate_votes = |pubkeys: Vec<Pubkey>| { | ||
pubkeys | ||
.into_iter() | ||
.zip(iter::once(vec![0, 1, 2, 5, 6]).chain(iter::repeat(vec![0, 1, 3, 4]).take(2))) | ||
.collect() | ||
}; | ||
let (vote_simulator, _blockstore) = | ||
setup_forks_from_tree(tree, 3, Some(Box::new(generate_votes))); | ||
let bank_forks = vote_simulator.bank_forks; | ||
let expected_tower = Tower::new_random(node_pubkey); | ||
expected_tower.save(&tower_storage, &node_keypair).unwrap(); | ||
|
||
// NOTE(review): the vote-state assertion below assumes the lockout adjustment done | ||
// inside `load_tower` leaves this random tower's vote state unchanged; confirm. | ||
let tower = | ||
ReplayStage::load_tower(&tower_storage, &node_pubkey, &vote_account, &bank_forks); | ||
assert_eq!(tower.vote_state, expected_tower.vote_state); | ||
assert_eq!(tower.node_pubkey, expected_tower.node_pubkey); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need a warn! here but not in the loop?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We do warn in the loop; see line 1011.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, if you always warn, you can put that in the common function, I guess.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would prefer not to, as the helper is just a generic load of the tower. It doesn't need to know anything about the previous pubkey.