-
Notifications
You must be signed in to change notification settings - Fork 3.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[NHC] Add Fetchers, overhaul Evaluator inputs
- Loading branch information
Showing
38 changed files
with
1,416 additions
and
1,102 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,213 @@ | ||
// Copyright (c) Aptos | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
/// These evaluators are only valuable in certain contexts. For example, this is | ||
/// not a useful evaluator for node registration for the AITs, since each node | ||
/// is running in their own isolated network, where no consensus is occurring. | ||
/// This is useful for the AIT itself though, where the nodes are participating | ||
/// in a real network. | ||
use crate::{ | ||
checker::{CheckResult, Checker}, | ||
get_provider, | ||
provider::{ | ||
metrics::{get_metric, GetMetricResult, Label, MetricsProvider}, | ||
Provider, ProviderCollection, | ||
}, | ||
}; | ||
use anyhow::Result; | ||
use once_cell::sync::Lazy; | ||
use prometheus_parse::Scrape; | ||
use serde::{Deserialize, Serialize}; | ||
|
||
use super::traits::CheckerError; | ||
|
||
/// Evaluator for minimum number of peers. | ||
#[derive(Clone, Debug, Deserialize, Serialize)] | ||
pub struct MinimumPeersCheckerConfig { | ||
#[serde(default)] | ||
pub required: bool, | ||
|
||
/// The minimum number of inbound connections required to be able to pass. | ||
/// For fullnodes, it only matters that this is greater than zero if the | ||
/// node operator wants to seed data to other nodes. | ||
#[serde(default = "MinimumPeersCheckerConfig::default_minimum_peers_inbound")] | ||
pub minimum_peers_inbound: u64, | ||
|
||
/// The minimum number of outbound connections required to be able to pass. | ||
/// This must be greater than zero for the node to be able to synchronize. | ||
#[serde(default = "MinimumPeersCheckerConfig::default_minimum_peers_outbound")] | ||
pub minimum_peers_outbound: u64, | ||
} | ||
|
||
impl MinimumPeersCheckerConfig { | ||
pub fn default_minimum_peers_inbound() -> u64 { | ||
0 | ||
} | ||
|
||
pub fn default_minimum_peers_outbound() -> u64 { | ||
1 | ||
} | ||
} | ||
|
||
#[derive(Debug)] | ||
pub struct MinimumPeersChecker { | ||
config: MinimumPeersCheckerConfig, | ||
} | ||
|
||
impl MinimumPeersChecker { | ||
pub fn new(config: MinimumPeersCheckerConfig) -> Self { | ||
Self { config } | ||
} | ||
|
||
#[allow(clippy::comparison_chain)] | ||
fn build_evaluation( | ||
&self, | ||
connections: u64, | ||
minimum: u64, | ||
connection_type: &ConnectionType, | ||
) -> CheckResult { | ||
let name = connection_type.get_name(); | ||
let particle = connection_type.get_particle(); | ||
let opposite_particle = connection_type.get_opposite_particle(); | ||
let explanation = format!( | ||
"There are {} {} connections {} other nodes {} the target node (the minimum is {}).", | ||
connections, name, particle, opposite_particle, minimum | ||
); | ||
if connections >= minimum { | ||
CheckResult::new( | ||
format!( | ||
"There are sufficient {} connections {} the target node", | ||
name, particle | ||
), | ||
100, | ||
explanation, | ||
) | ||
} else { | ||
CheckResult::new( | ||
format!( | ||
"There are not enough {} connections {} the target node", | ||
name, particle | ||
), | ||
50, | ||
format!("{} Try setting explicit peers.", explanation), | ||
) | ||
.links(vec![ | ||
"https://aptos.dev/nodes/full-node/troubleshooting-fullnode-setup".to_string(), | ||
]) | ||
} | ||
} | ||
|
||
fn default_minimum_inbound() -> u64 { | ||
0 | ||
} | ||
|
||
fn default_minimum_outbound() -> u64 { | ||
1 | ||
} | ||
} | ||
|
||
#[async_trait::async_trait] | ||
impl Checker for MinimumPeersChecker { | ||
async fn check(&self, input: &ProviderCollection) -> Result<Vec<CheckResult>, CheckerError> { | ||
let target_metrics_provider = get_provider!( | ||
input.target_metrics_provider, | ||
self.config.required, | ||
MetricsProvider | ||
); | ||
let scrape = target_metrics_provider.provide().await?; | ||
let (inbound_connections, outbound_connections) = match get_metrics(&scrape) { | ||
Ok((inbound_connections, outbound_connections)) => { | ||
(inbound_connections, outbound_connections) | ||
} | ||
Err(evaluation_results) => return Ok(evaluation_results), | ||
}; | ||
|
||
Ok(vec![ | ||
self.build_evaluation( | ||
inbound_connections, | ||
self.config.minimum_peers_inbound, | ||
&ConnectionType::Inbound, | ||
), | ||
self.build_evaluation( | ||
outbound_connections, | ||
self.config.minimum_peers_outbound, | ||
&ConnectionType::Outbound, | ||
), | ||
]) | ||
} | ||
} | ||
|
||
////////////////////////////////////////////////////////////////////////////// | ||
// Helpers. | ||
////////////////////////////////////////////////////////////////////////////// | ||
|
||
const METRIC: &str = "aptos_connections"; | ||
|
||
static INBOUND_LABEL: Lazy<Label> = Lazy::new(|| Label { | ||
key: "direction", | ||
value: "inbound", | ||
}); | ||
static OUTBOUND_LABEL: Lazy<Label> = Lazy::new(|| Label { | ||
key: "direction", | ||
value: "outbound", | ||
}); | ||
|
||
enum ConnectionType { | ||
Inbound, | ||
Outbound, | ||
} | ||
|
||
impl ConnectionType { | ||
fn get_name(&self) -> &'static str { | ||
match &self { | ||
ConnectionType::Inbound => "inbound", | ||
ConnectionType::Outbound => "outbound", | ||
} | ||
} | ||
|
||
fn get_particle(&self) -> &'static str { | ||
match &self { | ||
ConnectionType::Inbound => "from", | ||
ConnectionType::Outbound => "to", | ||
} | ||
} | ||
|
||
fn get_opposite_particle(&self) -> &'static str { | ||
match &self { | ||
ConnectionType::Inbound => "to", | ||
ConnectionType::Outbound => "from", | ||
} | ||
} | ||
} | ||
|
||
/// Given a Scrape, pull the metrics telling us the number of inbound and | ||
/// outbound connections. | ||
fn get_metrics(metrics: &Scrape) -> Result<(u64, u64), Vec<CheckResult>> { | ||
let result_on_missing_fn = || { | ||
CheckResult::new( | ||
"Missing metric".to_string(), | ||
0, | ||
format!( | ||
"The metrics from the node are missing the metric: {}", | ||
METRIC | ||
), | ||
) | ||
}; | ||
let (inbound, outbound) = ( | ||
get_metric(metrics, METRIC, Some(&INBOUND_LABEL), result_on_missing_fn), | ||
get_metric(metrics, METRIC, Some(&OUTBOUND_LABEL), result_on_missing_fn), | ||
); | ||
if let (GetMetricResult::Present(inbound), GetMetricResult::Present(outbound)) = | ||
(&inbound, &outbound) | ||
{ | ||
return Ok((*inbound, *outbound)); | ||
} | ||
let mut evaluation_results = vec![]; | ||
if let GetMetricResult::Missing(evaluation_result) = inbound { | ||
evaluation_results.push(evaluation_result); | ||
} | ||
if let GetMetricResult::Missing(evaluation_result) = outbound { | ||
evaluation_results.push(evaluation_result); | ||
} | ||
Err(evaluation_results) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
// Copyright (c) Aptos | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
mod minimum_peers; | ||
mod traits; | ||
mod types; | ||
|
||
use serde::{Deserialize, Serialize}; | ||
pub use traits::{Checker, CheckerError}; | ||
pub use types::{CheckResult, CheckSummary}; | ||
|
||
use self::minimum_peers::{MinimumPeersChecker, MinimumPeersCheckerConfig}; | ||
|
||
/// This enum lets us represent all the different Bypassers in a config. | ||
/// This should only be used at config reading time. | ||
#[derive(Clone, Debug, Deserialize, Serialize)] | ||
#[serde(tag = "type")] | ||
pub enum CheckerConfig { | ||
MinimumPeers(MinimumPeersCheckerConfig), | ||
} | ||
|
||
impl CheckerConfig { | ||
pub fn try_into_boxed_checker(self) -> Result<Box<dyn Checker>, anyhow::Error> { | ||
match self { | ||
Self::MinimumPeers(config) => Ok(Box::new(MinimumPeersChecker::new(config))), | ||
} | ||
} | ||
} | ||
|
||
pub fn build_checkers(checkers: &[CheckerConfig]) -> Result<Vec<Box<dyn Checker>>, anyhow::Error> { | ||
checkers | ||
.iter() | ||
.map(|checker| checker.try_into_boxed_checker()) | ||
.collect() | ||
} | ||
|
||
#[derive(Clone, Debug, Deserialize, Serialize)] | ||
pub struct CommonCheckerArgs { | ||
pub required: bool, | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
// Copyright (c) Aptos | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
use std::fmt::Debug; | ||
use thiserror::Error; | ||
|
||
use crate::provider::{ProviderCollection, ProviderError}; | ||
|
||
use super::CheckResult; | ||
|
||
/// A Checker is a component of NHC that is responsible for checking a | ||
/// particular aspect of the node under investigation, be that metrics, | ||
/// system information, API checks, load tests, etc. | ||
#[async_trait::async_trait] | ||
pub trait Checker: Debug + Sync + Send { | ||
/// This function is expected to take in a ProviderCollection | ||
/// and return a vec of evaluation results. It should only return | ||
/// errors when there is something wrong with NHC itself or the | ||
/// baseline node. If something is unexpected with the target, | ||
/// we expect this function to return an EvaluationResult indicating | ||
/// as such. | ||
async fn check( | ||
&self, | ||
input: &ProviderCollection, | ||
) -> anyhow::Result<Vec<CheckResult>, CheckerError>; | ||
} | ||
|
||
#[derive(Error, Debug)] | ||
pub enum CheckerError { | ||
#[error("Provider failed to return data: {0:#}")] | ||
ProviderError(#[from] ProviderError), | ||
|
||
#[error("Something went wrong hitting endpoint {0}: {1:#}")] | ||
RetryableEndpointError(&'static str, #[source] anyhow::Error), | ||
|
||
#[error("Something went wrong hitting endpoint {0}: {1:#}")] | ||
NonRetryableEndpointError(&'static str, #[source] anyhow::Error), | ||
} | ||
|
||
impl CheckerError { | ||
pub fn is_retryable(&self) -> bool { | ||
match self { | ||
CheckerError::ProviderError(error) => error.is_retryable(), | ||
CheckerError::RetryableEndpointError(_, __) => true, | ||
CheckerError::NonRetryableEndpointError(_, _) => false, | ||
} | ||
} | ||
} |
Oops, something went wrong.