From 2abd03ef330c8b55e73755a7ef4b43baf1451657 Mon Sep 17 00:00:00 2001
From: Sebastian Kunert
Date: Mon, 5 Aug 2024 09:40:43 +0200
Subject: [PATCH] beefy: Tolerate pruned state on runtime API call (#5197)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While working on #5129 I noticed that after warp sync, nodes would print:

```
2024-07-29 17:59:23.898 ERROR ⋮beefy: 🥩 Error: ConsensusReset. Restarting voter.
```

After some debugging I found that we enter the following loop:

1. Wait for the beefy pallet to be available: the pallet is detected as available directly after warp sync, since we are at the tip.
2. Wait for the headers from the tip down to beefy genesis to be available: during this time we don't process finality notifications, since we later want to inspect all the headers for authority set changes.
3. Gap sync finishes, the route to beefy genesis is available.
4. The worker starts acting and tries to fetch the beefy genesis block. This fails, since we are acting on old finality notifications where the state is already pruned.
5. The whole beefy subsystem is restarted, loading the state from the db again and iterating over a lot of headers.

This already happened before #5129.
---
 prdoc/pr_5197.prdoc                          | 16 ++++++++++++++++
 substrate/client/consensus/beefy/src/lib.rs  |  5 +++--
 .../client/consensus/beefy/src/worker.rs     | 19 ++++++++++++-------
 3 files changed, 31 insertions(+), 9 deletions(-)
 create mode 100644 prdoc/pr_5197.prdoc

diff --git a/prdoc/pr_5197.prdoc b/prdoc/pr_5197.prdoc
new file mode 100644
index 000000000000..40e25cf70dd1
--- /dev/null
+++ b/prdoc/pr_5197.prdoc
@@ -0,0 +1,16 @@
+# Schema: Polkadot SDK PRDoc Schema (prdoc) v1.0.0
+# See doc at https://raw.githubusercontent.com/paritytech/polkadot-sdk/master/prdoc/schema_user.json
+
+title: Prevent `ConsensusReset` by tolerating runtime API errors in BEEFY
+
+doc:
+  - audience: Node Operator
+    description: |
+      After warp sync, the BEEFY worker was trying to execute runtime calls on
+      blocks which had their state already pruned. This led to an error and restarting
+      of the beefy subsystem in a loop. After this PR, the worker tolerates call errors and therefore prevents this
+      worker restart loop.
+
+crates:
+  - name: sc-consensus-beefy
+    bump: minor
diff --git a/substrate/client/consensus/beefy/src/lib.rs b/substrate/client/consensus/beefy/src/lib.rs
index f7c999aad591..30cdd494905f 100644
--- a/substrate/client/consensus/beefy/src/lib.rs
+++ b/substrate/client/consensus/beefy/src/lib.rs
@@ -32,7 +32,7 @@ use crate::{
 	metrics::register_metrics,
 };
 use futures::{stream::Fuse, FutureExt, StreamExt};
-use log::{debug, error, info, warn};
+use log::{debug, error, info, trace, warn};
 use parking_lot::Mutex;
 use prometheus_endpoint::Registry;
 use sc_client_api::{Backend, BlockBackend, BlockchainEvents, FinalityNotification, Finalizer};
@@ -451,7 +451,8 @@ where
 		state.set_best_grandpa(best_grandpa.clone());
 		// Overwrite persisted data with newly provided `min_block_delta`.
 		state.set_min_block_delta(min_block_delta);
-		debug!(target: LOG_TARGET, "🥩 Loading BEEFY voter state from db: {:?}.", state);
+		debug!(target: LOG_TARGET, "🥩 Loading BEEFY voter state from db.");
+		trace!(target: LOG_TARGET, "🥩 Loaded state: {:?}.", state);
 
 		// Make sure that all the headers that we need have been synced.
 		let mut new_sessions = vec![];
diff --git a/substrate/client/consensus/beefy/src/worker.rs b/substrate/client/consensus/beefy/src/worker.rs
index ca714ffadd77..3c7f3b1b6efa 100644
--- a/substrate/client/consensus/beefy/src/worker.rs
+++ b/substrate/client/consensus/beefy/src/worker.rs
@@ -458,13 +458,18 @@ where
 			notification.tree_route,
 		);
 
-		self.runtime
-			.runtime_api()
-			.beefy_genesis(notification.hash)
-			.ok()
-			.flatten()
-			.filter(|genesis| *genesis == self.persisted_state.pallet_genesis)
-			.ok_or(Error::ConsensusReset)?;
+		match self.runtime.runtime_api().beefy_genesis(notification.hash) {
+			Ok(Some(genesis)) if genesis != self.persisted_state.pallet_genesis => {
+				debug!(target: LOG_TARGET, "🥩 ConsensusReset detected. Expected genesis: {}, found genesis: {}", self.persisted_state.pallet_genesis, genesis);
+				return Err(Error::ConsensusReset)
+			},
+			Ok(_) => {},
+			Err(api_error) => {
+				// This can happen in case the block was already pruned.
+				// Mostly after warp sync when finality notifications are piled up.
+				debug!(target: LOG_TARGET, "🥩 Unable to check beefy genesis: {}", api_error);
+			},
+		}
 
 		let mut new_session_added = false;
 
 		if *header.number() > self.best_grandpa_block() {
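
For reference, the decision logic of the new `match`, shown in isolation as a minimal sketch. `Hash`, `ApiError` and `Error` below are hypothetical stand-ins, not the `sp_api` / `sc-consensus-beefy` types: only a successful call that returns a different genesis triggers `ConsensusReset`; an API error (for example when the block's state was already pruned right after warp sync) is logged and tolerated.

```rust
// Hypothetical stand-in types for the sketch; not the real crate API.
#[derive(Debug)]
enum Error {
    ConsensusReset,
}

type Hash = u64;
type ApiError = String;

/// Only a successful runtime call returning a *different* genesis is a
/// consensus reset; an API error is logged and tolerated.
fn check_beefy_genesis(
    queried: Result<Option<Hash>, ApiError>,
    expected: Hash,
) -> Result<(), Error> {
    match queried {
        Ok(Some(genesis)) if genesis != expected => Err(Error::ConsensusReset),
        Ok(_) => Ok(()),
        Err(api_error) => {
            // Block state may already be pruned, e.g. right after warp sync.
            eprintln!("unable to check beefy genesis: {}", api_error);
            Ok(())
        },
    }
}

fn main() {
    // Matching genesis, missing genesis, and a failed call are all tolerated.
    assert!(check_beefy_genesis(Ok(Some(42)), 42).is_ok());
    assert!(check_beefy_genesis(Ok(None), 42).is_ok());
    assert!(check_beefy_genesis(Err("state pruned".into()), 42).is_ok());
    // Only a confirmed mismatch resets the voter.
    assert!(matches!(check_beefy_genesis(Ok(Some(7)), 42), Err(Error::ConsensusReset)));
}
```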