diff --git a/Cargo.lock b/Cargo.lock index 6d976452e4c3..40e2d9ebf3da 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2534,6 +2534,15 @@ dependencies = [ "clap_derive 4.4.7", ] +[[package]] +name = "clap-num" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488557e97528174edaa2ee268b23a809e0c598213a4bbcb4f34575a46fda147e" +dependencies = [ + "num-traits", +] + [[package]] name = "clap_builder" version = "4.4.11" @@ -2747,6 +2756,17 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +[[package]] +name = "colored" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2674ec482fbc38012cf31e6c42ba0177b431a0cb6f15fe40efa5aab1bda516f6" +dependencies = [ + "is-terminal", + "lazy_static", + "windows-sys 0.48.0", +] + [[package]] name = "comfy-table" version = "7.0.1" @@ -11922,6 +11942,7 @@ dependencies = [ "sp-core", "sp-keyring", "thiserror", + "tokio", "tracing-gum", ] @@ -12651,6 +12672,8 @@ dependencies = [ "async-trait", "futures", "parking_lot 0.12.1", + "polkadot-erasure-coding", + "polkadot-node-primitives", "polkadot-node-subsystem", "polkadot-node-subsystem-util", "polkadot-primitives", @@ -13266,6 +13289,52 @@ dependencies = [ "sp-core", ] +[[package]] +name = "polkadot-subsystem-bench" +version = "1.0.0" +dependencies = [ + "assert_matches", + "async-trait", + "clap 4.4.11", + "clap-num", + "color-eyre", + "colored", + "env_logger 0.9.3", + "futures", + "futures-timer", + "itertools 0.11.0", + "log", + "orchestra", + "parity-scale-codec", + "paste", + "polkadot-availability-recovery", + "polkadot-erasure-coding", + "polkadot-node-metrics", + "polkadot-node-network-protocol", + "polkadot-node-primitives", + "polkadot-node-subsystem", + "polkadot-node-subsystem-test-helpers", + "polkadot-node-subsystem-types", + "polkadot-node-subsystem-util", + "polkadot-overseer", + "polkadot-primitives", + "polkadot-primitives-test-helpers", + "prometheus", + "rand 0.8.5", + "sc-keystore", + "sc-network", + "sc-service", + "serde", + "serde_yaml", + "sp-application-crypto", + "sp-core", + "sp-keyring", + "sp-keystore", + "substrate-prometheus-endpoint", + "tokio", + "tracing-gum", +] + [[package]] name = "polkadot-test-client" version = "1.0.0" @@ -16739,6 +16808,19 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_yaml" +version = "0.9.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cc7a1570e38322cfe4154732e5110f887ea57e22b76f4bfd32b5bdd3368666c" +dependencies = [ + "indexmap 2.0.0", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "serial_test" version = "2.0.0" @@ -19417,9 +19499,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.32.0" +version = "1.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17ed6077ed6cd6c74735e21f37eb16dc3935f96878b1fe961074089cc80893f9" +checksum = "4f38200e3ef7995e5ef13baec2f432a6da0aa9ac495b2c0e8f3b7eec2c92d653" dependencies = [ "backtrace", "bytes", @@ -20027,6 +20109,12 @@ dependencies = [ "subtle 2.4.1", ] +[[package]] +name = "unsafe-libyaml" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28467d3e1d3c6586d8f25fa243f544f5800fec42d97032474e17222c2b75cfa" + [[package]] name = "unsigned-varint" version = "0.7.1" diff --git a/Cargo.toml b/Cargo.toml index 0091c2d7c8be..e3b6e3aadfe5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -151,6 +151,7 @@ members = [ "polkadot/node/primitives", "polkadot/node/service", "polkadot/node/subsystem", + "polkadot/node/subsystem-bench", "polkadot/node/subsystem-test-helpers", "polkadot/node/subsystem-types", "polkadot/node/subsystem-util", diff --git a/polkadot/node/network/availability-recovery/Cargo.toml b/polkadot/node/network/availability-recovery/Cargo.toml index b97572181b0b..063d75275ca5 100644 --- a/polkadot/node/network/availability-recovery/Cargo.toml +++ b/polkadot/node/network/availability-recovery/Cargo.toml @@ -11,6 +11,7 @@ workspace = true [dependencies] futures = "0.3.21" +tokio = "1.24.2" schnellru = "0.2.1" rand = "0.8.5" fatality = "0.0.6" @@ -40,3 +41,6 @@ sc-network = { path = "../../../../substrate/client/network" } polkadot-node-subsystem-test-helpers = { path = "../../subsystem-test-helpers" } polkadot-primitives-test-helpers = { path = "../../../primitives/test-helpers" } + +[features] +subsystem-benchmarks = [] diff --git a/polkadot/node/network/availability-recovery/src/lib.rs b/polkadot/node/network/availability-recovery/src/lib.rs index 4a658449f09c..fb8064878f4f 100644 --- a/polkadot/node/network/availability-recovery/src/lib.rs +++ b/polkadot/node/network/availability-recovery/src/lib.rs @@ -65,7 +65,7 @@ mod error; mod futures_undead; mod metrics; mod task; -use metrics::Metrics; +pub use metrics::Metrics; #[cfg(test)] mod tests; @@ -603,7 +603,8 @@ impl AvailabilityRecoverySubsystem { } } - async fn run(self, mut ctx: Context) -> SubsystemResult<()> { + /// Starts the inner subsystem loop. + pub async fn run(self, mut ctx: Context) -> SubsystemResult<()> { let mut state = State::default(); let Self { mut req_receiver, @@ -681,6 +682,7 @@ impl AvailabilityRecoverySubsystem { &mut state, signal, ).await? { + gum::debug!(target: LOG_TARGET, "subsystem concluded"); return Ok(()); } FromOrchestra::Communication { msg } => { @@ -845,12 +847,17 @@ async fn erasure_task_thread( let _ = sender.send(maybe_data); }, None => { - gum::debug!( + gum::trace!( target: LOG_TARGET, "Erasure task channel closed. Node shutting down ?", ); break }, } + + // In benchmarks this is a very hot loop not yielding at all. + // To update CPU metrics for the task we need to yield. + #[cfg(feature = "subsystem-benchmarks")] + tokio::task::yield_now().await; } } diff --git a/polkadot/node/network/availability-recovery/src/metrics.rs b/polkadot/node/network/availability-recovery/src/metrics.rs index aa7216739507..d82a8f9ae5fa 100644 --- a/polkadot/node/network/availability-recovery/src/metrics.rs +++ b/polkadot/node/network/availability-recovery/src/metrics.rs @@ -29,7 +29,10 @@ struct MetricsInner { /// /// Gets incremented on each sent chunk requests. chunk_requests_issued: Counter, - + /// Total number of bytes recovered + /// + /// Gets incremented on each succesful recovery + recovered_bytes_total: Counter, /// A counter for finished chunk requests. /// /// Split by result: @@ -133,9 +136,10 @@ impl Metrics { } /// A full recovery succeeded. - pub fn on_recovery_succeeded(&self) { + pub fn on_recovery_succeeded(&self, bytes: usize) { if let Some(metrics) = &self.0 { - metrics.full_recoveries_finished.with_label_values(&["success"]).inc() + metrics.full_recoveries_finished.with_label_values(&["success"]).inc(); + metrics.recovered_bytes_total.inc_by(bytes as u64) } } @@ -171,6 +175,13 @@ impl metrics::Metrics for Metrics { )?, registry, )?, + recovered_bytes_total: prometheus::register( + Counter::new( + "polkadot_parachain_availability_recovery_bytes_total", + "Total number of bytes recovered", + )?, + registry, + )?, chunk_requests_finished: prometheus::register( CounterVec::new( Opts::new( diff --git a/polkadot/node/network/availability-recovery/src/task.rs b/polkadot/node/network/availability-recovery/src/task.rs index f705d5c0e4cf..c300c221da5c 100644 --- a/polkadot/node/network/availability-recovery/src/task.rs +++ b/polkadot/node/network/availability-recovery/src/task.rs @@ -23,6 +23,7 @@ use crate::{ PostRecoveryCheck, LOG_TARGET, }; use futures::{channel::oneshot, SinkExt}; +use parity_scale_codec::Encode; #[cfg(not(test))] use polkadot_node_network_protocol::request_response::CHUNK_REQUEST_TIMEOUT; use polkadot_node_network_protocol::request_response::{ @@ -432,7 +433,7 @@ where return Err(err) }, Ok(data) => { - self.params.metrics.on_recovery_succeeded(); + self.params.metrics.on_recovery_succeeded(data.encoded_size()); return Ok(data) }, } diff --git a/polkadot/node/network/availability-recovery/src/tests.rs b/polkadot/node/network/availability-recovery/src/tests.rs index 63ccf0e94f91..1cb52757bac9 100644 --- a/polkadot/node/network/availability-recovery/src/tests.rs +++ b/polkadot/node/network/availability-recovery/src/tests.rs @@ -24,12 +24,12 @@ use parity_scale_codec::Encode; use polkadot_node_network_protocol::request_response::{ self as req_res, IncomingRequest, Recipient, ReqProtocolNames, Requests, }; +use polkadot_node_subsystem_test_helpers::derive_erasure_chunks_with_proofs_and_root; use super::*; use sc_network::{config::RequestResponseConfig, IfDisconnected, OutboundFailure, RequestFailure}; -use polkadot_erasure_coding::{branches, obtain_chunks_v1 as obtain_chunks}; use polkadot_node_primitives::{BlockData, PoV, Proof}; use polkadot_node_subsystem::messages::{ AllMessages, NetworkBridgeTxMessage, RuntimeApiMessage, RuntimeApiRequest, @@ -456,33 +456,6 @@ fn validator_authority_id(val_ids: &[Sr25519Keyring]) -> Vec), -) -> (Vec, Hash) { - let mut chunks: Vec> = obtain_chunks(n_validators, available_data).unwrap(); - - for (i, chunk) in chunks.iter_mut().enumerate() { - alter_chunk(i, chunk) - } - - // create proofs for each erasure chunk - let branches = branches(chunks.as_ref()); - - let root = branches.root(); - let erasure_chunks = branches - .enumerate() - .map(|(index, (proof, chunk))| ErasureChunk { - chunk: chunk.to_vec(), - index: ValidatorIndex(index as _), - proof: Proof::try_from(proof).unwrap(), - }) - .collect::>(); - - (erasure_chunks, root) -} - impl Default for TestState { fn default() -> Self { let validators = vec![ diff --git a/polkadot/node/overseer/src/lib.rs b/polkadot/node/overseer/src/lib.rs index da99546a44f7..f4eddf1f41ce 100644 --- a/polkadot/node/overseer/src/lib.rs +++ b/polkadot/node/overseer/src/lib.rs @@ -276,6 +276,7 @@ impl From> for BlockInfo { /// An event from outside the overseer scope, such /// as the substrate framework or user interaction. +#[derive(Debug)] pub enum Event { /// A new block was imported. /// @@ -300,6 +301,7 @@ pub enum Event { } /// Some request from outer world. +#[derive(Debug)] pub enum ExternalRequest { /// Wait for the activation of a particular hash /// and be notified by means of the return channel. diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml new file mode 100644 index 000000000000..08d1a31adf55 --- /dev/null +++ b/polkadot/node/subsystem-bench/Cargo.toml @@ -0,0 +1,61 @@ +[package] +name = "polkadot-subsystem-bench" +description = "Subsystem performance benchmark client" +version = "1.0.0" +authors.workspace = true +edition.workspace = true +license.workspace = true +readme = "README.md" +publish = false + +[[bin]] +name = "subsystem-bench" +path = "src/subsystem-bench.rs" + +# Prevent rustdoc error. Already documented from top-level Cargo.toml. +doc = false + +[dependencies] +polkadot-node-subsystem = { path = "../subsystem" } +polkadot-node-subsystem-util = { path = "../subsystem-util" } +polkadot-node-subsystem-types = { path = "../subsystem-types" } +polkadot-node-primitives = { path = "../primitives" } +polkadot-primitives = { path = "../../primitives" } +polkadot-node-network-protocol = { path = "../network/protocol" } +polkadot-availability-recovery = { path = "../network/availability-recovery", features = ["subsystem-benchmarks"] } +color-eyre = { version = "0.6.1", default-features = false } +polkadot-overseer = { path = "../overseer" } +colored = "2.0.4" +assert_matches = "1.5" +async-trait = "0.1.57" +sp-keystore = { path = "../../../substrate/primitives/keystore" } +sc-keystore = { path = "../../../substrate/client/keystore" } +sp-core = { path = "../../../substrate/primitives/core" } +clap = { version = "4.4.6", features = ["derive"] } +futures = "0.3.21" +futures-timer = "3.0.2" +gum = { package = "tracing-gum", path = "../gum" } +polkadot-erasure-coding = { package = "polkadot-erasure-coding", path = "../../erasure-coding" } +log = "0.4.17" +env_logger = "0.9.0" +rand = "0.8.5" +parity-scale-codec = { version = "3.6.1", features = ["derive", "std"] } +tokio = "1.24.2" +clap-num = "1.0.2" +polkadot-node-subsystem-test-helpers = { path = "../subsystem-test-helpers" } +sp-keyring = { path = "../../../substrate/primitives/keyring" } +sp-application-crypto = { path = "../../../substrate/primitives/application-crypto" } +sc-network = { path = "../../../substrate/client/network" } +sc-service = { path = "../../../substrate/client/service" } +polkadot-node-metrics = { path = "../metrics" } +itertools = "0.11.0" +polkadot-primitives-test-helpers = { path = "../../primitives/test-helpers" } +prometheus_endpoint = { package = "substrate-prometheus-endpoint", path = "../../../substrate/utils/prometheus" } +prometheus = { version = "0.13.0", default-features = false } +serde = "1.0.192" +serde_yaml = "0.9" +paste = "1.0.14" +orchestra = { version = "0.3.3", default-features = false, features = ["futures_channel"] } + +[features] +default = [] diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md new file mode 100644 index 000000000000..21844853334b --- /dev/null +++ b/polkadot/node/subsystem-bench/README.md @@ -0,0 +1,216 @@ +# Subsystem benchmark client + +Run parachain consensus stress and performance tests on your development machine. + +## Motivation + +The parachain consensus node implementation spans across many modules which we call subsystems. Each subsystem is +responsible for a small part of logic of the parachain consensus pipeline, but in general the most load and +performance issues are localized in just a few core subsystems like `availability-recovery`, `approval-voting` or +`dispute-coordinator`. In the absence of such a tool, we would run large test nets to load/stress test these parts of +the system. Setting up and making sense of the amount of data produced by such a large test is very expensive, hard +to orchestrate and is a huge development time sink. + +This tool aims to solve the problem by making it easy to: + +- set up and run core subsystem load tests locally on your development machine +- iterate and conclude faster when benchmarking new optimizations or comparing implementations +- automate and keep track of performance regressions in CI runs +- simulate various networking topologies, bandwidth and connectivity issues + +## Test environment setup + +`cargo build --profile=testnet --bin subsystem-bench -p polkadot-subsystem-bench` + +The output binary will be placed in `target/testnet/subsystem-bench`. + +### Test metrics + +Subsystem, CPU usage and network metrics are exposed via a prometheus endpoint during the test execution. +A small subset of these collected metrics are displayed in the CLI, but for an in depth analysys of the test results, +a local Grafana/Prometheus stack is needed. + +### Install Prometheus + +Please follow the [official installation guide](https://prometheus.io/docs/prometheus/latest/installation/) for your +platform/OS. + +After succesfully installing and starting up Prometheus, we need to alter it's configuration such that it +will scrape the benchmark prometheus endpoint `127.0.0.1:9999`. Please check the prometheus official documentation +regarding the location of `prometheus.yml`. On MacOS for example the full path `/opt/homebrew/etc/prometheus.yml` + +prometheus.yml: + +``` +global: + scrape_interval: 5s + +scrape_configs: + - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] + - job_name: "subsystem-bench" + scrape_interval: 0s500ms + static_configs: + - targets: ['localhost:9999'] +``` + +To complete this step restart Prometheus server such that it picks up the new configuration. + +### Install and setup Grafana + +Follow the [installation guide](https://grafana.com/docs/grafana/latest/setup-grafana/installation/) relevant +to your operating system. + +Once you have the installation up and running, configure the local Prometheus as a data source by following +[this guide](https://grafana.com/docs/grafana/latest/datasources/prometheus/configure-prometheus-data-source/) + +#### Import dashboards + +Follow [this guide](https://grafana.com/docs/grafana/latest/dashboards/manage-dashboards/#export-and-import-dashboards) +to import the dashboards from the repository `grafana` folder. + +## How to run a test + +To run a test, you need to first choose a test objective. Currently, we support the following: + +``` +target/testnet/subsystem-bench --help +The almighty Subsystem Benchmark Tool™️ + +Usage: subsystem-bench [OPTIONS] + +Commands: + data-availability-read Benchmark availability recovery strategies + +``` + +Note: `test-sequence` is a special test objective that wraps up an arbitrary number of test objectives. It is tipically + used to run a suite of tests defined in a `yaml` file like in this [example](examples/availability_read.yaml). + +### Standard test options + +``` +Options: + --network The type of network to be emulated [default: ideal] [possible values: + ideal, healthy, degraded] + --n-cores Number of cores to fetch availability for [default: 100] + --n-validators Number of validators to fetch chunks from [default: 500] + --min-pov-size The minimum pov size in KiB [default: 5120] + --max-pov-size The maximum pov size bytes [default: 5120] + -n, --num-blocks The number of blocks the test is going to run [default: 1] + -p, --peer-bandwidth The bandwidth of simulated remote peers in KiB + -b, --bandwidth The bandwidth of our simulated node in KiB + --peer-error Simulated conection error ratio [0-100] + --peer-min-latency Minimum remote peer latency in milliseconds [0-5000] + --peer-max-latency Maximum remote peer latency in milliseconds [0-5000] + -h, --help Print help + -V, --version Print version +``` + +These apply to all test objectives, except `test-sequence` which relies on the values being specified in a file. + +### Test objectives + +Each test objective can have it's specific configuration options, in contrast with the standard test options. + +For `data-availability-read` the recovery strategy to be used is configurable. + +``` +target/testnet/subsystem-bench data-availability-read --help +Benchmark availability recovery strategies + +Usage: subsystem-bench data-availability-read [OPTIONS] + +Options: + -f, --fetch-from-backers Turbo boost AD Read by fetching the full availability datafrom backers first. Saves CPU + as we don't need to re-construct from chunks. Tipically this is only faster if nodes + have enough bandwidth + -h, --help Print help +``` + +### Understanding the test configuration + +A single test configuration `TestConfiguration` struct applies to a single run of a certain test objective. + +The configuration describes the following important parameters that influence the test duration and resource +usage: + +- how many validators are on the emulated network (`n_validators`) +- how many cores per block the subsystem will have to do work on (`n_cores`) +- for how many blocks the test should run (`num_blocks`) + +From the perspective of the subsystem under test, this means that it will receive an `ActiveLeavesUpdate` signal +followed by an arbitrary amount of messages. This process repeats itself for `num_blocks`. The messages are generally +test payloads pre-generated before the test run, or constructed on pre-genereated payloads. For example the +`AvailabilityRecoveryMessage::RecoverAvailableData` message includes a `CandidateReceipt` which is generated before +the test is started. + +### Example run + +Let's run an availabilty read test which will recover availability for 10 cores with max PoV size on a 500 +node validator network. + +``` + target/testnet/subsystem-bench --n-cores 10 data-availability-read +[2023-11-28T09:01:59Z INFO subsystem_bench::core::display] n_validators = 500, n_cores = 10, pov_size = 5120 - 5120, + error = 0, latency = None +[2023-11-28T09:01:59Z INFO subsystem-bench::availability] Generating template candidate index=0 pov_size=5242880 +[2023-11-28T09:01:59Z INFO subsystem-bench::availability] Created test environment. +[2023-11-28T09:01:59Z INFO subsystem-bench::availability] Pre-generating 10 candidates. +[2023-11-28T09:02:01Z INFO subsystem-bench::core] Initializing network emulation for 500 peers. +[2023-11-28T09:02:01Z INFO substrate_prometheus_endpoint] 〽️ Prometheus exporter started at 127.0.0.1:9999 +[2023-11-28T09:02:01Z INFO subsystem-bench::availability] Current block 1/1 +[2023-11-28T09:02:01Z INFO subsystem_bench::availability] 10 recoveries pending +[2023-11-28T09:02:04Z INFO subsystem_bench::availability] Block time 3231ms +[2023-11-28T09:02:04Z INFO subsystem-bench::availability] Sleeping till end of block (2768ms) +[2023-11-28T09:02:07Z INFO subsystem_bench::availability] All blocks processed in 6001ms +[2023-11-28T09:02:07Z INFO subsystem_bench::availability] Throughput: 51200 KiB/block +[2023-11-28T09:02:07Z INFO subsystem_bench::availability] Block time: 6001 ms +[2023-11-28T09:02:07Z INFO subsystem_bench::availability] + + Total received from network: 66 MiB + Total sent to network: 58 KiB + Total subsystem CPU usage 4.16s + CPU usage per block 4.16s + Total test environment CPU usage 0.00s + CPU usage per block 0.00s +``` + +`Block time` in the context of `data-availability-read` has a different meaning. It measures the amount of time it +took the subsystem to finish processing all of the messages sent in the context of the current test block. + +### Test logs + +You can select log target, subtarget and verbosity just like with Polkadot node CLI, simply setting +`RUST_LOOG="parachain=debug"` turns on debug logs for all parachain consensus subsystems in the test. + +### View test metrics + +Assuming the Grafana/Prometheus stack installation steps completed succesfully, you should be able to +view the test progress in real time by accessing [this link](http://localhost:3000/goto/SM5B8pNSR?orgId=1). + +Now run +`target/testnet/subsystem-bench test-sequence --path polkadot/node/subsystem-bench/examples/availability_read.yaml` +and view the metrics in real time and spot differences between different `n_valiator` values. + +## Create new test objectives + +This tool is intended to make it easy to write new test objectives that focus individual subsystems, +or even multiple subsystems (for example `approval-distribution` and `approval-voting`). + +A special kind of test objectives are performance regression tests for the CI pipeline. These should be sequences +of tests that check the performance characteristics (such as CPU usage, speed) of the subsystem under test in both +happy and negative scenarios (low bandwidth, network errors and low connectivity). + +### Reusable test components + +To faster write a new test objective you need to use some higher level wrappers and logic: `TestEnvironment`, +`TestConfiguration`, `TestAuthorities`, `NetworkEmulator`. To create the `TestEnvironment` you will +need to also build an `Overseer`, but that should be easy using the mockups for subsystems in`core::mock`. + +### Mocking + +Ideally we want to have a single mock implementation for subsystems that can be minimally configured to +be used in different tests. A good example is `runtime-api` which currently only responds to session information +requests based on static data. It can be easily extended to service other requests. diff --git a/polkadot/node/subsystem-bench/examples/availability_read.yaml b/polkadot/node/subsystem-bench/examples/availability_read.yaml new file mode 100644 index 000000000000..311ea972141f --- /dev/null +++ b/polkadot/node/subsystem-bench/examples/availability_read.yaml @@ -0,0 +1,57 @@ +TestConfiguration: +# Test 1 +- objective: !DataAvailabilityRead + fetch_from_backers: false + n_validators: 300 + n_cores: 20 + min_pov_size: 5120 + max_pov_size: 5120 + peer_bandwidth: 52428800 + bandwidth: 52428800 + latency: + min_latency: + secs: 0 + nanos: 1000000 + max_latency: + secs: 0 + nanos: 100000000 + error: 3 + num_blocks: 3 + +# Test 2 +- objective: !DataAvailabilityRead + fetch_from_backers: false + n_validators: 500 + n_cores: 20 + min_pov_size: 5120 + max_pov_size: 5120 + peer_bandwidth: 52428800 + bandwidth: 52428800 + latency: + min_latency: + secs: 0 + nanos: 1000000 + max_latency: + secs: 0 + nanos: 100000000 + error: 3 + num_blocks: 3 + +# Test 3 +- objective: !DataAvailabilityRead + fetch_from_backers: false + n_validators: 1000 + n_cores: 20 + min_pov_size: 5120 + max_pov_size: 5120 + peer_bandwidth: 52428800 + bandwidth: 52428800 + latency: + min_latency: + secs: 0 + nanos: 1000000 + max_latency: + secs: 0 + nanos: 100000000 + error: 3 + num_blocks: 3 diff --git a/polkadot/node/subsystem-bench/grafana/availability-read.json b/polkadot/node/subsystem-bench/grafana/availability-read.json new file mode 100644 index 000000000000..31c4ad3c7952 --- /dev/null +++ b/polkadot/node/subsystem-bench/grafana/availability-read.json @@ -0,0 +1,1874 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "Subsystem and test environment metrics", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 2, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": 60000, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 90, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "editorMode": "code", + "expr": "subsystem_benchmark_n_validators{}", + "instant": false, + "legendFormat": "n_vaidators", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "editorMode": "code", + "expr": "subsystem_benchmark_n_cores{}", + "hide": false, + "instant": false, + "legendFormat": "n_cores", + "range": true, + "refId": "B" + } + ], + "title": "Test configuration", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 31, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$data_source" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 57, + "interval": "1s", + "options": { + "legend": { + "calcs": [ + "mean", + "min", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.0.2", + "repeat": "nodename", + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "sum(rate(substrate_tasks_polling_duration_sum{}[2s])) by ($cpu_group_by)", + "interval": "", + "legendFormat": "{{task_group}}", + "range": true, + "refId": "A" + } + ], + "title": "All tasks CPU usage breakdown", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$data_source" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 6 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 93, + "interval": "1s", + "options": { + "legend": { + "calcs": [ + "mean", + "min", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.0.2", + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "increase(substrate_tasks_polling_duration_sum{task_group=\"availability-recovery\"}[6s])", + "interval": "", + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Availability subsystem CPU usage per block", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$data_source" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 94, + "interval": "1s", + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.0.2", + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "sum(substrate_tasks_polling_duration_sum{}) by ($cpu_group_by)", + "interval": "", + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Total CPU burn", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$data_source" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "dark-red", + "value": 6000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 95, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.0.2", + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "subsystem_benchmark_block_time", + "interval": "", + "legendFormat": "Instant block time", + "range": true, + "refId": "A" + } + ], + "title": "All candidates in block recovery time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 100, + "gradientMode": "hue", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 2, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 89, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "editorMode": "code", + "expr": "sum(rate(subsystem_benchmark_network_peer_total_bytes_received{}[5s]))", + "instant": false, + "legendFormat": "Received", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "editorMode": "code", + "expr": "sum(rate(subsystem_benchmark_network_peer_total_bytes_sent{}[5s]))", + "hide": false, + "instant": false, + "legendFormat": "Sent", + "range": true, + "refId": "B" + } + ], + "title": "Emulated network throughput ", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 15, + "w": 12, + "x": 0, + "y": 52 + }, + "id": 88, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "editorMode": "code", + "expr": "rate(subsystem_benchmark_network_peer_total_bytes_received{}[10s])", + "instant": false, + "legendFormat": "Received by {{peer}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "editorMode": "code", + "expr": "rate(subsystem_benchmark_network_peer_total_bytes_sent{}[10s])", + "hide": false, + "instant": false, + "legendFormat": "Sent by {{peer}}", + "range": true, + "refId": "B" + } + ], + "title": "Emulated peer throughput", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateInferno", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 15, + "w": 12, + "x": 12, + "y": 52 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 92, + "interval": "1s", + "legend": { + "show": true + }, + "maxDataPoints": 1340, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": { + "decimals": 0 + }, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Inferno", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 0, + "reverse": false, + "unit": "bytes" + } + }, + "pluginVersion": "10.1.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(increase(subsystem_benchmark_pov_size_bucket{}[$__rate_interval])) by (le)", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Recovered PoV sizes", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 0, + "transformations": [], + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "Number of erasure-encoded chunks of data belonging to candidate blocks. ", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic", + "seriesBy": "max" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "chunks/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 67 + }, + "id": 43, + "interval": "1s", + "maxDataPoints": 1340, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.2.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(polkadot_parachain_availability_recovery_chunk_requests_issued{}[10s]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Chunks requested", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Availability", + "transformations": [], + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 77 + }, + "id": 35, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Availability subystem metrics", + "type": "row" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateInferno", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 78 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 68, + "interval": "1s", + "legend": { + "show": true + }, + "maxDataPoints": 1340, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": { + "decimals": 0 + }, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Inferno", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 0, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.1.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(increase(polkadot_parachain_availability_recovery_time_total_bucket{}[$__rate_interval])) by (le)", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Time to recover a PoV", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 0, + "transformations": [], + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateInferno", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 78 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 67, + "interval": "1s", + "legend": { + "show": true + }, + "maxDataPoints": 1340, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": { + "decimals": 0 + }, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Inferno", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 0, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.1.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(increase(polkadot_parachain_availability_recovery_time_chunk_request_bucket{}[$__rate_interval])) by (le)", + "format": "heatmap", + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Chunk request duration", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 0, + "transformations": [], + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 0, + "format": "bitfields", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic", + "seriesBy": "max" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 88 + }, + "id": 85, + "interval": "1s", + "maxDataPoints": 1340, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.2.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "rate(polkadot_parachain_availability_recovery_bytes_total{}[30s])", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Bytes recovered", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Recovery throughtput", + "transformations": [], + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateInferno", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 88 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 84, + "interval": "1s", + "legend": { + "show": true + }, + "maxDataPoints": 1340, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": { + "decimals": 0 + }, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Inferno", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 0, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.1.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(increase(polkadot_parachain_availability_reencode_chunks_bucket{}[$__rate_interval])) by (le)", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Re-encoding chunks timing", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 0, + "transformations": [], + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateInferno", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 98 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 83, + "interval": "1s", + "legend": { + "show": true + }, + "maxDataPoints": 1340, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": { + "decimals": 0 + }, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Inferno", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 0, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.1.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(increase(polkadot_parachain_availability_recovery_time_erasure_recovery_bucket{}[$__rate_interval])) by (le)", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Erasure recovery (no I/O)", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 0, + "transformations": [], + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "Number of erasure-encoded chunks of data belonging to candidate blocks. ", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic", + "seriesBy": "max" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "stepAfter", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 108 + }, + "id": 86, + "interval": "1s", + "maxDataPoints": 1340, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.2.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(polkadot_parachain_availability_recovery_recoveries_finished{}[1s]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Finished", + "queryType": "randomWalk", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(polkadot_parachain_availability_recovery_recovieries_started{}[1s]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Started", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Recoveries", + "transformations": [], + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 118 + }, + "id": 2, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Approval voting", + "type": "row" + } + ], + "refresh": false, + "schemaVersion": 38, + "style": "dark", + "tags": [ + "subsystem", + "benchmark" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "hide": 0, + "includeAll": false, + "label": "Source of data", + "multi": false, + "name": "data_source", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": "task_name", + "value": "task_name" + }, + "description": "Sum CPU usage by task name or task group.", + "hide": 0, + "includeAll": false, + "label": "Group CPU usage", + "multi": false, + "name": "cpu_group_by", + "options": [ + { + "selected": true, + "text": "task_name", + "value": "task_name" + }, + { + "selected": false, + "text": "task_group", + "value": "task_group" + } + ], + "query": "task_name, task_group", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "2023-11-28T13:05:32.794Z", + "to": "2023-11-28T13:06:56.173Z" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s" + ] + }, + "timezone": "utc", + "title": "Data Availability Read", + "uid": "asdadasd1", + "version": 58, + "weekStart": "" +} \ No newline at end of file diff --git a/polkadot/node/subsystem-bench/grafana/task-cpu-usage.json b/polkadot/node/subsystem-bench/grafana/task-cpu-usage.json new file mode 100644 index 000000000000..90763444abf1 --- /dev/null +++ b/polkadot/node/subsystem-bench/grafana/task-cpu-usage.json @@ -0,0 +1,755 @@ +{ + "annotations": { + "list": [ + { + "$$hashKey": "object:326", + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "limit": 100, + "name": "Annotations & Alerts", + "showIn": 0, + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "$$hashKey": "object:327", + "datasource": { + "uid": "$data_source" + }, + "enable": true, + "expr": "increase(${metric_namespace}_tasks_ended_total{reason=\"panic\", node=~\"${nodename}\"}[10m])", + "hide": true, + "iconColor": "rgba(255, 96, 96, 1)", + "limit": 100, + "name": "Task panics", + "rawQuery": "SELECT\n extract(epoch from time_column) AS time,\n text_column as text,\n tags_column as tags\nFROM\n metric_table\nWHERE\n $__timeFilter(time_column)\n", + "showIn": 0, + "step": "10m", + "tags": [], + "textFormat": "{{node}} - {{task_name}}", + "titleFormat": "Panic!", + "type": "tags" + }, + { + "$$hashKey": "object:621", + "datasource": { + "uid": "$data_source" + }, + "enable": true, + "expr": "changes(${metric_namespace}_process_start_time_seconds{node=~\"${nodename}\"}[10m])", + "hide": false, + "iconColor": "#8AB8FF", + "name": "Node reboots", + "showIn": 0, + "step": "10m", + "textFormat": "{{node}}", + "titleFormat": "Reboots" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 29, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Tasks", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 3, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 1 + }, + "hiddenSeries": false, + "id": 11, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "repeat": "nodename", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "sum(rate(substrate_tasks_polling_duration_sum{}[$__rate_interval])) by (task_name)", + "interval": "", + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "CPU time spent on each task", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2721", + "format": "percentunit", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:2722", + "format": "short", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 3, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 10 + }, + "hiddenSeries": false, + "id": 30, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "repeat": "nodename", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "rate(substrate_tasks_polling_duration_count{}[$__rate_interval])", + "interval": "", + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Task polling rate per second", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2571", + "format": "cps", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:2572", + "format": "short", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 16 + }, + "hiddenSeries": false, + "id": 43, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": true, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "repeat": "nodename", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "increase(substrate_tasks_polling_duration_sum{}[$__rate_interval]) / increase(substrate_tasks_polling_duration_count{}[$__rate_interval])", + "interval": "", + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Average time it takes to call Future::poll()", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2571", + "format": "s", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:2572", + "format": "short", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 22 + }, + "hiddenSeries": false, + "id": 15, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": true, + "values": true + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "repeat": "nodename", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": true, + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "increase(substrate_tasks_spawned_total{}[$__rate_interval])", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Number of tasks started", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:771", + "format": "short", + "logBase": 10, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:772", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 28 + }, + "hiddenSeries": false, + "id": 2, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "repeat": "nodename", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "substrate_tasks_spawned_total{} - sum(substrate_tasks_ended_total{}) without(reason)\n\n# Fallback if tasks_ended_total is null for that task\nor on(task_name) substrate_tasks_spawned_total{}", + "interval": "", + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Number of tasks running", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:919", + "format": "short", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:920", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 34 + }, + "hiddenSeries": false, + "id": 7, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "repeat": "nodename", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": true, + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "irate(substrate_tasks_polling_duration_bucket{le=\"+Inf\"}[$__rate_interval])\n - ignoring(le)\n irate(substrate_tasks_polling_duration_bucket{le=\"1.024\"}[$__rate_interval]) > 0", + "interval": "", + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Number of calls to `Future::poll` that took more than one second", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:3040", + "format": "cps", + "label": "Calls to `Future::poll`/second", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:3041", + "format": "short", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 40 + }, + "id": 27, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Unbounded Channels", + "type": "row" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "utc", + "title": "Substrate Service Tasks with substrate prefix", + "uid": "S7sc-M_Gk", + "version": 17, + "weekStart": "" + } \ No newline at end of file diff --git a/polkadot/node/subsystem-bench/src/availability/cli.rs b/polkadot/node/subsystem-bench/src/availability/cli.rs new file mode 100644 index 000000000000..65df8c1552aa --- /dev/null +++ b/polkadot/node/subsystem-bench/src/availability/cli.rs @@ -0,0 +1,37 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use serde::{Deserialize, Serialize}; + +#[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq)] +#[value(rename_all = "kebab-case")] +#[non_exhaustive] +pub enum NetworkEmulation { + Ideal, + Healthy, + Degraded, +} + +#[derive(Debug, Clone, Serialize, Deserialize, clap::Parser)] +#[clap(rename_all = "kebab-case")] +#[allow(missing_docs)] +pub struct DataAvailabilityReadOptions { + #[clap(short, long, default_value_t = false)] + /// Turbo boost AD Read by fetching the full availability datafrom backers first. Saves CPU as + /// we don't need to re-construct from chunks. Tipically this is only faster if nodes have + /// enough bandwidth. + pub fetch_from_backers: bool, +} diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs new file mode 100644 index 000000000000..7c81b9313659 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -0,0 +1,339 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . +use itertools::Itertools; +use std::{collections::HashMap, iter::Cycle, ops::Sub, sync::Arc, time::Instant}; + +use crate::TestEnvironment; +use polkadot_node_subsystem::{Overseer, OverseerConnector, SpawnGlue}; +use polkadot_node_subsystem_test_helpers::derive_erasure_chunks_with_proofs_and_root; +use polkadot_overseer::Handle as OverseerHandle; +use sc_network::request_responses::ProtocolConfig; + +use colored::Colorize; + +use futures::{channel::oneshot, stream::FuturesUnordered, StreamExt}; +use polkadot_node_metrics::metrics::Metrics; + +use polkadot_availability_recovery::AvailabilityRecoverySubsystem; + +use crate::GENESIS_HASH; +use parity_scale_codec::Encode; +use polkadot_node_network_protocol::request_response::{IncomingRequest, ReqProtocolNames}; +use polkadot_node_primitives::{BlockData, PoV}; +use polkadot_node_subsystem::messages::{AllMessages, AvailabilityRecoveryMessage}; + +use crate::core::{ + environment::TestEnvironmentDependencies, + mock::{ + av_store, + network_bridge::{self, MockNetworkBridgeTx, NetworkAvailabilityState}, + runtime_api, MockAvailabilityStore, MockRuntimeApi, + }, +}; + +use super::core::{configuration::TestConfiguration, mock::dummy_builder, network::*}; + +const LOG_TARGET: &str = "subsystem-bench::availability"; + +use polkadot_node_primitives::{AvailableData, ErasureChunk}; + +use super::{cli::TestObjective, core::mock::AlwaysSupportsParachains}; +use polkadot_node_subsystem_test_helpers::mock::new_block_import_info; +use polkadot_primitives::{ + CandidateHash, CandidateReceipt, GroupIndex, Hash, HeadData, PersistedValidationData, +}; +use polkadot_primitives_test_helpers::{dummy_candidate_receipt, dummy_hash}; +use sc_service::SpawnTaskHandle; + +mod cli; +pub use cli::{DataAvailabilityReadOptions, NetworkEmulation}; + +fn build_overseer( + spawn_task_handle: SpawnTaskHandle, + runtime_api: MockRuntimeApi, + av_store: MockAvailabilityStore, + network_bridge: MockNetworkBridgeTx, + availability_recovery: AvailabilityRecoverySubsystem, +) -> (Overseer, AlwaysSupportsParachains>, OverseerHandle) { + let overseer_connector = OverseerConnector::with_event_capacity(64000); + let dummy = dummy_builder!(spawn_task_handle); + let builder = dummy + .replace_runtime_api(|_| runtime_api) + .replace_availability_store(|_| av_store) + .replace_network_bridge_tx(|_| network_bridge) + .replace_availability_recovery(|_| availability_recovery); + + let (overseer, raw_handle) = + builder.build_with_connector(overseer_connector).expect("Should not fail"); + + (overseer, OverseerHandle::new(raw_handle)) +} + +/// Takes a test configuration and uses it to creates the `TestEnvironment`. +pub fn prepare_test( + config: TestConfiguration, + state: &mut TestState, +) -> (TestEnvironment, ProtocolConfig) { + prepare_test_inner(config, state, TestEnvironmentDependencies::default()) +} + +fn prepare_test_inner( + config: TestConfiguration, + state: &mut TestState, + dependencies: TestEnvironmentDependencies, +) -> (TestEnvironment, ProtocolConfig) { + // Generate test authorities. + let test_authorities = config.generate_authorities(); + + let runtime_api = runtime_api::MockRuntimeApi::new(config.clone(), test_authorities.clone()); + + let av_store = + av_store::MockAvailabilityStore::new(state.chunks.clone(), state.candidate_hashes.clone()); + + let availability_state = NetworkAvailabilityState { + candidate_hashes: state.candidate_hashes.clone(), + available_data: state.available_data.clone(), + chunks: state.chunks.clone(), + }; + + let network = NetworkEmulator::new(&config, &dependencies, &test_authorities); + + let network_bridge_tx = network_bridge::MockNetworkBridgeTx::new( + config.clone(), + availability_state, + network.clone(), + ); + + let use_fast_path = match &state.config().objective { + TestObjective::DataAvailabilityRead(options) => options.fetch_from_backers, + _ => panic!("Unexpected objective"), + }; + + let (collation_req_receiver, req_cfg) = + IncomingRequest::get_config_receiver(&ReqProtocolNames::new(GENESIS_HASH, None)); + + let subsystem = if use_fast_path { + AvailabilityRecoverySubsystem::with_fast_path( + collation_req_receiver, + Metrics::try_register(&dependencies.registry).unwrap(), + ) + } else { + AvailabilityRecoverySubsystem::with_chunks_only( + collation_req_receiver, + Metrics::try_register(&dependencies.registry).unwrap(), + ) + }; + + let (overseer, overseer_handle) = build_overseer( + dependencies.task_manager.spawn_handle(), + runtime_api, + av_store, + network_bridge_tx, + subsystem, + ); + + (TestEnvironment::new(dependencies, config, network, overseer, overseer_handle), req_cfg) +} + +#[derive(Clone)] +pub struct TestState { + // Full test configuration + config: TestConfiguration, + // A cycle iterator on all PoV sizes used in the test. + pov_sizes: Cycle>, + // Generated candidate receipts to be used in the test + candidates: Cycle>, + // Map from pov size to candidate index + pov_size_to_candidate: HashMap, + // Map from generated candidate hashes to candidate index in `available_data` + // and `chunks`. + candidate_hashes: HashMap, + // Per candidate index receipts. + candidate_receipt_templates: Vec, + // Per candidate index `AvailableData` + available_data: Vec, + // Per candiadte index chunks + chunks: Vec>, +} + +impl TestState { + fn config(&self) -> &TestConfiguration { + &self.config + } + + pub fn next_candidate(&mut self) -> Option { + let candidate = self.candidates.next(); + let candidate_hash = candidate.as_ref().unwrap().hash(); + gum::trace!(target: LOG_TARGET, "Next candidate selected {:?}", candidate_hash); + candidate + } + + /// Generate candidates to be used in the test. + fn generate_candidates(&mut self) { + let count = self.config.n_cores * self.config.num_blocks; + gum::info!(target: LOG_TARGET,"{}", format!("Pre-generating {} candidates.", count).bright_blue()); + + // Generate all candidates + self.candidates = (0..count) + .map(|index| { + let pov_size = self.pov_sizes.next().expect("This is a cycle; qed"); + let candidate_index = *self + .pov_size_to_candidate + .get(&pov_size) + .expect("pov_size always exists; qed"); + let mut candidate_receipt = + self.candidate_receipt_templates[candidate_index].clone(); + + // Make it unique. + candidate_receipt.descriptor.relay_parent = Hash::from_low_u64_be(index as u64); + // Store the new candidate in the state + self.candidate_hashes.insert(candidate_receipt.hash(), candidate_index); + + gum::debug!(target: LOG_TARGET, candidate_hash = ?candidate_receipt.hash(), "new candidate"); + + candidate_receipt + }) + .collect::>() + .into_iter() + .cycle(); + } + + pub fn new(config: &TestConfiguration) -> Self { + let config = config.clone(); + + let mut chunks = Vec::new(); + let mut available_data = Vec::new(); + let mut candidate_receipt_templates = Vec::new(); + let mut pov_size_to_candidate = HashMap::new(); + + // we use it for all candidates. + let persisted_validation_data = PersistedValidationData { + parent_head: HeadData(vec![7, 8, 9]), + relay_parent_number: Default::default(), + max_pov_size: 1024, + relay_parent_storage_root: Default::default(), + }; + + // For each unique pov we create a candidate receipt. + for (index, pov_size) in config.pov_sizes().iter().cloned().unique().enumerate() { + gum::info!(target: LOG_TARGET, index, pov_size, "{}", "Generating template candidate".bright_blue()); + + let mut candidate_receipt = dummy_candidate_receipt(dummy_hash()); + let pov = PoV { block_data: BlockData(vec![index as u8; pov_size]) }; + + let new_available_data = AvailableData { + validation_data: persisted_validation_data.clone(), + pov: Arc::new(pov), + }; + + let (new_chunks, erasure_root) = derive_erasure_chunks_with_proofs_and_root( + config.n_validators, + &new_available_data, + |_, _| {}, + ); + + candidate_receipt.descriptor.erasure_root = erasure_root; + + chunks.push(new_chunks); + available_data.push(new_available_data); + pov_size_to_candidate.insert(pov_size, index); + candidate_receipt_templates.push(candidate_receipt); + } + + let pov_sizes = config.pov_sizes().to_owned(); + let pov_sizes = pov_sizes.into_iter().cycle(); + gum::info!(target: LOG_TARGET, "{}","Created test environment.".bright_blue()); + + let mut _self = Self { + config, + available_data, + candidate_receipt_templates, + chunks, + pov_size_to_candidate, + pov_sizes, + candidate_hashes: HashMap::new(), + candidates: Vec::new().into_iter().cycle(), + }; + + _self.generate_candidates(); + _self + } +} + +pub async fn benchmark_availability_read(env: &mut TestEnvironment, mut state: TestState) { + let config = env.config().clone(); + + env.import_block(new_block_import_info(Hash::repeat_byte(1), 1)).await; + + let start_marker = Instant::now(); + let mut batch = FuturesUnordered::new(); + let mut availability_bytes = 0u128; + + env.metrics().set_n_validators(config.n_validators); + env.metrics().set_n_cores(config.n_cores); + + for block_num in 0..env.config().num_blocks { + gum::info!(target: LOG_TARGET, "Current block {}/{}", block_num + 1, env.config().num_blocks); + env.metrics().set_current_block(block_num); + + let block_start_ts = Instant::now(); + for candidate_num in 0..config.n_cores as u64 { + let candidate = + state.next_candidate().expect("We always send up to n_cores*num_blocks; qed"); + let (tx, rx) = oneshot::channel(); + batch.push(rx); + + let message = AllMessages::AvailabilityRecovery( + AvailabilityRecoveryMessage::RecoverAvailableData( + candidate.clone(), + 1, + Some(GroupIndex( + candidate_num as u32 % (std::cmp::max(5, config.n_cores) / 5) as u32, + )), + tx, + ), + ); + env.send_message(message).await; + } + + gum::info!("{}", format!("{} recoveries pending", batch.len()).bright_black()); + while let Some(completed) = batch.next().await { + let available_data = completed.unwrap().unwrap(); + env.metrics().on_pov_size(available_data.encoded_size()); + availability_bytes += available_data.encoded_size() as u128; + } + + let block_time = Instant::now().sub(block_start_ts).as_millis() as u64; + env.metrics().set_block_time(block_time); + gum::info!("All work for block completed in {}", format!("{:?}ms", block_time).cyan()); + } + + let duration: u128 = start_marker.elapsed().as_millis(); + let availability_bytes = availability_bytes / 1024; + gum::info!("All blocks processed in {}", format!("{:?}ms", duration).cyan()); + gum::info!( + "Throughput: {}", + format!("{} KiB/block", availability_bytes / env.config().num_blocks as u128).bright_red() + ); + gum::info!( + "Block time: {}", + format!("{} ms", start_marker.elapsed().as_millis() / env.config().num_blocks as u128) + .red() + ); + + gum::info!("{}", &env); + env.stop().await; +} diff --git a/polkadot/node/subsystem-bench/src/cli.rs b/polkadot/node/subsystem-bench/src/cli.rs new file mode 100644 index 000000000000..3352f33a3503 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/cli.rs @@ -0,0 +1,60 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . +use super::availability::DataAvailabilityReadOptions; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Serialize, Deserialize, clap::Parser)] +#[clap(rename_all = "kebab-case")] +#[allow(missing_docs)] +pub struct TestSequenceOptions { + #[clap(short, long, ignore_case = true)] + pub path: String, +} + +/// Define the supported benchmarks targets +#[derive(Debug, Clone, clap::Parser, Serialize, Deserialize)] +#[command(rename_all = "kebab-case")] +pub enum TestObjective { + /// Benchmark availability recovery strategies. + DataAvailabilityRead(DataAvailabilityReadOptions), + /// Run a test sequence specified in a file + TestSequence(TestSequenceOptions), +} + +#[derive(Debug, clap::Parser)] +#[clap(rename_all = "kebab-case")] +#[allow(missing_docs)] +pub struct StandardTestOptions { + #[clap(long, ignore_case = true, default_value_t = 100)] + /// Number of cores to fetch availability for. + pub n_cores: usize, + + #[clap(long, ignore_case = true, default_value_t = 500)] + /// Number of validators to fetch chunks from. + pub n_validators: usize, + + #[clap(long, ignore_case = true, default_value_t = 5120)] + /// The minimum pov size in KiB + pub min_pov_size: usize, + + #[clap(long, ignore_case = true, default_value_t = 5120)] + /// The maximum pov size bytes + pub max_pov_size: usize, + + #[clap(short, long, ignore_case = true, default_value_t = 1)] + /// The number of blocks the test is going to run. + pub num_blocks: usize, +} diff --git a/polkadot/node/subsystem-bench/src/core/configuration.rs b/polkadot/node/subsystem-bench/src/core/configuration.rs new file mode 100644 index 000000000000..164addb51900 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/configuration.rs @@ -0,0 +1,262 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . +// +//! Test configuration definition and helpers. +use super::*; +use keyring::Keyring; +use std::{path::Path, time::Duration}; + +pub use crate::cli::TestObjective; +use polkadot_primitives::{AuthorityDiscoveryId, ValidatorId}; +use rand::{distributions::Uniform, prelude::Distribution, thread_rng}; +use serde::{Deserialize, Serialize}; + +pub fn random_pov_size(min_pov_size: usize, max_pov_size: usize) -> usize { + random_uniform_sample(min_pov_size, max_pov_size) +} + +fn random_uniform_sample + From>(min_value: T, max_value: T) -> T { + Uniform::from(min_value.into()..=max_value.into()) + .sample(&mut thread_rng()) + .into() +} + +/// Peer response latency configuration. +#[derive(Clone, Debug, Default, Serialize, Deserialize)] +pub struct PeerLatency { + /// Min latency for `NetworkAction` completion. + pub min_latency: Duration, + /// Max latency or `NetworkAction` completion. + pub max_latency: Duration, +} + +// Default PoV size in KiB. +fn default_pov_size() -> usize { + 5120 +} + +// Default bandwidth in bytes +fn default_bandwidth() -> usize { + 52428800 +} + +// Default connectivity percentage +fn default_connectivity() -> usize { + 100 +} + +/// The test input parameters +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct TestConfiguration { + /// The test objective + pub objective: TestObjective, + /// Number of validators + pub n_validators: usize, + /// Number of cores + pub n_cores: usize, + /// The min PoV size + #[serde(default = "default_pov_size")] + pub min_pov_size: usize, + /// The max PoV size, + #[serde(default = "default_pov_size")] + pub max_pov_size: usize, + /// Randomly sampled pov_sizes + #[serde(skip)] + pov_sizes: Vec, + /// The amount of bandiwdth remote validators have. + #[serde(default = "default_bandwidth")] + pub peer_bandwidth: usize, + /// The amount of bandiwdth our node has. + #[serde(default = "default_bandwidth")] + pub bandwidth: usize, + /// Optional peer emulation latency + #[serde(default)] + pub latency: Option, + /// Error probability, applies to sending messages to the emulated network peers + #[serde(default)] + pub error: usize, + /// Connectivity ratio, the percentage of peers we are not connected to, but ar part of + /// the topology. + #[serde(default = "default_connectivity")] + pub connectivity: usize, + /// Number of blocks to run the test for + pub num_blocks: usize, +} + +fn generate_pov_sizes(count: usize, min_kib: usize, max_kib: usize) -> Vec { + (0..count).map(|_| random_pov_size(min_kib * 1024, max_kib * 1024)).collect() +} + +#[derive(Serialize, Deserialize)] +pub struct TestSequence { + #[serde(rename(serialize = "TestConfiguration", deserialize = "TestConfiguration"))] + test_configurations: Vec, +} + +impl TestSequence { + pub fn into_vec(self) -> Vec { + self.test_configurations + .into_iter() + .map(|mut config| { + config.pov_sizes = + generate_pov_sizes(config.n_cores, config.min_pov_size, config.max_pov_size); + config + }) + .collect() + } +} + +impl TestSequence { + pub fn new_from_file(path: &Path) -> std::io::Result { + let string = String::from_utf8(std::fs::read(path)?).expect("File is valid UTF8"); + Ok(serde_yaml::from_str(&string).expect("File is valid test sequence YA")) + } +} + +/// Helper struct for authority related state. +#[derive(Clone)] +pub struct TestAuthorities { + pub keyrings: Vec, + pub validator_public: Vec, + pub validator_authority_id: Vec, +} + +impl TestConfiguration { + #[allow(unused)] + pub fn write_to_disk(&self) { + // Serialize a slice of configurations + let yaml = serde_yaml::to_string(&TestSequence { test_configurations: vec![self.clone()] }) + .unwrap(); + std::fs::write("last_test.yaml", yaml).unwrap(); + } + + pub fn pov_sizes(&self) -> &[usize] { + &self.pov_sizes + } + + /// Generates the authority keys we need for the network emulation. + pub fn generate_authorities(&self) -> TestAuthorities { + let keyrings = (0..self.n_validators) + .map(|peer_index| Keyring::new(format!("Node{}", peer_index))) + .collect::>(); + + // Generate `AuthorityDiscoveryId`` for each peer + let validator_public: Vec = keyrings + .iter() + .map(|keyring: &Keyring| keyring.clone().public().into()) + .collect::>(); + + let validator_authority_id: Vec = keyrings + .iter() + .map(|keyring| keyring.clone().public().into()) + .collect::>(); + + TestAuthorities { keyrings, validator_public, validator_authority_id } + } + + /// An unconstrained standard configuration matching Polkadot/Kusama + pub fn ideal_network( + objective: TestObjective, + num_blocks: usize, + n_validators: usize, + n_cores: usize, + min_pov_size: usize, + max_pov_size: usize, + ) -> TestConfiguration { + Self { + objective, + n_cores, + n_validators, + pov_sizes: generate_pov_sizes(n_cores, min_pov_size, max_pov_size), + bandwidth: 50 * 1024 * 1024, + peer_bandwidth: 50 * 1024 * 1024, + // No latency + latency: None, + error: 0, + num_blocks, + min_pov_size, + max_pov_size, + connectivity: 100, + } + } + + pub fn healthy_network( + objective: TestObjective, + num_blocks: usize, + n_validators: usize, + n_cores: usize, + min_pov_size: usize, + max_pov_size: usize, + ) -> TestConfiguration { + Self { + objective, + n_cores, + n_validators, + pov_sizes: generate_pov_sizes(n_cores, min_pov_size, max_pov_size), + bandwidth: 50 * 1024 * 1024, + peer_bandwidth: 50 * 1024 * 1024, + latency: Some(PeerLatency { + min_latency: Duration::from_millis(1), + max_latency: Duration::from_millis(100), + }), + error: 3, + num_blocks, + min_pov_size, + max_pov_size, + connectivity: 95, + } + } + + pub fn degraded_network( + objective: TestObjective, + num_blocks: usize, + n_validators: usize, + n_cores: usize, + min_pov_size: usize, + max_pov_size: usize, + ) -> TestConfiguration { + Self { + objective, + n_cores, + n_validators, + pov_sizes: generate_pov_sizes(n_cores, min_pov_size, max_pov_size), + bandwidth: 50 * 1024 * 1024, + peer_bandwidth: 50 * 1024 * 1024, + latency: Some(PeerLatency { + min_latency: Duration::from_millis(10), + max_latency: Duration::from_millis(500), + }), + error: 33, + num_blocks, + min_pov_size, + max_pov_size, + connectivity: 67, + } + } +} + +/// Produce a randomized duration between `min` and `max`. +pub fn random_latency(maybe_peer_latency: Option<&PeerLatency>) -> Option { + maybe_peer_latency.map(|peer_latency| { + Uniform::from(peer_latency.min_latency..=peer_latency.max_latency).sample(&mut thread_rng()) + }) +} + +/// Generate a random error based on `probability`. +/// `probability` should be a number between 0 and 100. +pub fn random_error(probability: usize) -> bool { + Uniform::from(0..=99).sample(&mut thread_rng()) < probability +} diff --git a/polkadot/node/subsystem-bench/src/core/display.rs b/polkadot/node/subsystem-bench/src/core/display.rs new file mode 100644 index 000000000000..d600cc484c14 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/display.rs @@ -0,0 +1,191 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . +// +//! Display implementations and helper methods for parsing prometheus metrics +//! to a format that can be displayed in the CLI. +//! +//! Currently histogram buckets are skipped. +use super::{configuration::TestConfiguration, LOG_TARGET}; +use colored::Colorize; +use prometheus::{ + proto::{MetricFamily, MetricType}, + Registry, +}; +use std::fmt::Display; + +#[derive(Default)] +pub struct MetricCollection(Vec); + +impl From> for MetricCollection { + fn from(metrics: Vec) -> Self { + MetricCollection(metrics) + } +} + +impl MetricCollection { + pub fn all(&self) -> &Vec { + &self.0 + } + + /// Sums up all metrics with the given name in the collection + pub fn sum_by(&self, name: &str) -> f64 { + self.all() + .iter() + .filter(|metric| metric.name == name) + .map(|metric| metric.value) + .sum() + } + + pub fn subset_with_label_value(&self, label_name: &str, label_value: &str) -> MetricCollection { + self.0 + .iter() + .filter_map(|metric| { + if let Some(index) = metric.label_names.iter().position(|label| label == label_name) + { + if Some(&String::from(label_value)) == metric.label_values.get(index) { + Some(metric.clone()) + } else { + None + } + } else { + None + } + }) + .collect::>() + .into() + } +} + +impl Display for MetricCollection { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f)?; + let metrics = self.all(); + for metric in metrics { + writeln!(f, "{}", metric)?; + } + Ok(()) + } +} +#[derive(Debug, Clone)] +pub struct TestMetric { + name: String, + label_names: Vec, + label_values: Vec, + value: f64, +} + +impl Display for TestMetric { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "({} = {}) [{:?}, {:?}]", + self.name.cyan(), + format!("{}", self.value).white(), + self.label_names, + self.label_values + ) + } +} + +// Returns `false` if metric should be skipped. +fn check_metric_family(mf: &MetricFamily) -> bool { + if mf.get_metric().is_empty() { + gum::error!(target: LOG_TARGET, "MetricFamily has no metrics: {:?}", mf); + return false + } + if mf.get_name().is_empty() { + gum::error!(target: LOG_TARGET, "MetricFamily has no name: {:?}", mf); + return false + } + + true +} + +pub fn parse_metrics(registry: &Registry) -> MetricCollection { + let metric_families = registry.gather(); + let mut test_metrics = Vec::new(); + for mf in metric_families { + if !check_metric_family(&mf) { + continue + } + + let name: String = mf.get_name().into(); + let metric_type = mf.get_field_type(); + for m in mf.get_metric() { + let (label_names, label_values): (Vec, Vec) = m + .get_label() + .iter() + .map(|pair| (String::from(pair.get_name()), String::from(pair.get_value()))) + .unzip(); + + match metric_type { + MetricType::COUNTER => { + test_metrics.push(TestMetric { + name: name.clone(), + label_names, + label_values, + value: m.get_counter().get_value(), + }); + }, + MetricType::GAUGE => { + test_metrics.push(TestMetric { + name: name.clone(), + label_names, + label_values, + value: m.get_gauge().get_value(), + }); + }, + MetricType::HISTOGRAM => { + let h = m.get_histogram(); + let h_name = name.clone() + "_sum"; + test_metrics.push(TestMetric { + name: h_name, + label_names: label_names.clone(), + label_values: label_values.clone(), + value: h.get_sample_sum(), + }); + + let h_name = name.clone() + "_count"; + test_metrics.push(TestMetric { + name: h_name, + label_names, + label_values, + value: h.get_sample_sum(), + }); + }, + MetricType::SUMMARY => { + unimplemented!(); + }, + MetricType::UNTYPED => { + unimplemented!(); + }, + } + } + } + test_metrics.into() +} + +pub fn display_configuration(test_config: &TestConfiguration) { + gum::info!( + "{}, {}, {}, {}, {}", + format!("n_validators = {}", test_config.n_validators).blue(), + format!("n_cores = {}", test_config.n_cores).blue(), + format!("pov_size = {} - {}", test_config.min_pov_size, test_config.max_pov_size) + .bright_black(), + format!("error = {}", test_config.error).bright_black(), + format!("latency = {:?}", test_config.latency).bright_black(), + ); +} diff --git a/polkadot/node/subsystem-bench/src/core/environment.rs b/polkadot/node/subsystem-bench/src/core/environment.rs new file mode 100644 index 000000000000..247596474078 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/environment.rs @@ -0,0 +1,333 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . +//! Test environment implementation +use crate::{ + core::{mock::AlwaysSupportsParachains, network::NetworkEmulator}, + TestConfiguration, +}; +use colored::Colorize; +use core::time::Duration; +use futures::FutureExt; +use polkadot_overseer::{BlockInfo, Handle as OverseerHandle}; + +use polkadot_node_subsystem::{messages::AllMessages, Overseer, SpawnGlue, TimeoutExt}; +use polkadot_node_subsystem_types::Hash; +use polkadot_node_subsystem_util::metrics::prometheus::{ + self, Gauge, Histogram, PrometheusError, Registry, U64, +}; + +use sc_network::peer_store::LOG_TARGET; +use sc_service::{SpawnTaskHandle, TaskManager}; +use std::{ + fmt::Display, + net::{Ipv4Addr, SocketAddr}, +}; +use tokio::runtime::Handle; + +const MIB: f64 = 1024.0 * 1024.0; + +/// Test environment/configuration metrics +#[derive(Clone)] +pub struct TestEnvironmentMetrics { + /// Number of bytes sent per peer. + n_validators: Gauge, + /// Number of received sent per peer. + n_cores: Gauge, + /// PoV size + pov_size: Histogram, + /// Current block + current_block: Gauge, + /// Current block + block_time: Gauge, +} + +impl TestEnvironmentMetrics { + pub fn new(registry: &Registry) -> Result { + let mut buckets = prometheus::exponential_buckets(16384.0, 2.0, 9) + .expect("arguments are always valid; qed"); + buckets.extend(vec![5.0 * MIB, 6.0 * MIB, 7.0 * MIB, 8.0 * MIB, 9.0 * MIB, 10.0 * MIB]); + + Ok(Self { + n_validators: prometheus::register( + Gauge::new( + "subsystem_benchmark_n_validators", + "Total number of validators in the test", + )?, + registry, + )?, + n_cores: prometheus::register( + Gauge::new( + "subsystem_benchmark_n_cores", + "Number of cores we fetch availability for each block", + )?, + registry, + )?, + current_block: prometheus::register( + Gauge::new("subsystem_benchmark_current_block", "The current test block")?, + registry, + )?, + block_time: prometheus::register( + Gauge::new("subsystem_benchmark_block_time", "The time it takes for the target subsystems(s) to complete all the requests in a block")?, + registry, + )?, + pov_size: prometheus::register( + Histogram::with_opts( + prometheus::HistogramOpts::new( + "subsystem_benchmark_pov_size", + "The compressed size of the proof of validity of a candidate", + ) + .buckets(buckets), + )?, + registry, + )?, + }) + } + + pub fn set_n_validators(&self, n_validators: usize) { + self.n_validators.set(n_validators as u64); + } + + pub fn set_n_cores(&self, n_cores: usize) { + self.n_cores.set(n_cores as u64); + } + + pub fn set_current_block(&self, current_block: usize) { + self.current_block.set(current_block as u64); + } + + pub fn set_block_time(&self, block_time_ms: u64) { + self.block_time.set(block_time_ms); + } + + pub fn on_pov_size(&self, pov_size: usize) { + self.pov_size.observe(pov_size as f64); + } +} + +fn new_runtime() -> tokio::runtime::Runtime { + tokio::runtime::Builder::new_multi_thread() + .thread_name("subsystem-bench") + .enable_all() + .thread_stack_size(3 * 1024 * 1024) + .build() + .unwrap() +} + +/// Wrapper for dependencies +pub struct TestEnvironmentDependencies { + pub registry: Registry, + pub task_manager: TaskManager, + pub runtime: tokio::runtime::Runtime, +} + +impl Default for TestEnvironmentDependencies { + fn default() -> Self { + let runtime = new_runtime(); + let registry = Registry::new(); + let task_manager: TaskManager = + TaskManager::new(runtime.handle().clone(), Some(®istry)).unwrap(); + + Self { runtime, registry, task_manager } + } +} + +// A dummy genesis hash +pub const GENESIS_HASH: Hash = Hash::repeat_byte(0xff); + +// We use this to bail out sending messages to the subsystem if it is overloaded such that +// the time of flight is breaches 5s. +// This should eventually be a test parameter. +const MAX_TIME_OF_FLIGHT: Duration = Duration::from_millis(5000); + +/// The test environment is the high level wrapper of all things required to test +/// a certain subsystem. +/// +/// ## Mockups +/// The overseer is passed in during construction and it can host an arbitrary number of +/// real subsystems instances and the corresponding mocked instances such that the real +/// subsystems can get their messages answered. +/// +/// As the subsystem's performance depends on network connectivity, the test environment +/// emulates validator nodes on the network, see `NetworkEmulator`. The network emulation +/// is configurable in terms of peer bandwidth, latency and connection error rate using +/// uniform distribution sampling. +/// +/// +/// ## Usage +/// `TestEnvironment` is used in tests to send `Overseer` messages or signals to the subsystem +/// under test. +/// +/// ## Collecting test metrics +/// +/// ### Prometheus +/// A prometheus endpoint is exposed while the test is running. A local Prometheus instance +/// can scrape it every 1s and a Grafana dashboard is the preferred way of visualizing +/// the performance characteristics of the subsystem. +/// +/// ### CLI +/// A subset of the Prometheus metrics are printed at the end of the test. +pub struct TestEnvironment { + /// Test dependencies + dependencies: TestEnvironmentDependencies, + /// A runtime handle + runtime_handle: tokio::runtime::Handle, + /// A handle to the lovely overseer + overseer_handle: OverseerHandle, + /// The test configuration. + config: TestConfiguration, + /// A handle to the network emulator. + network: NetworkEmulator, + /// Configuration/env metrics + metrics: TestEnvironmentMetrics, +} + +impl TestEnvironment { + /// Create a new test environment + pub fn new( + dependencies: TestEnvironmentDependencies, + config: TestConfiguration, + network: NetworkEmulator, + overseer: Overseer, AlwaysSupportsParachains>, + overseer_handle: OverseerHandle, + ) -> Self { + let metrics = TestEnvironmentMetrics::new(&dependencies.registry) + .expect("Metrics need to be registered"); + + let spawn_handle = dependencies.task_manager.spawn_handle(); + spawn_handle.spawn_blocking("overseer", "overseer", overseer.run().boxed()); + + let registry_clone = dependencies.registry.clone(); + dependencies.task_manager.spawn_handle().spawn_blocking( + "prometheus", + "test-environment", + async move { + prometheus_endpoint::init_prometheus( + SocketAddr::new(std::net::IpAddr::V4(Ipv4Addr::LOCALHOST), 9999), + registry_clone, + ) + .await + .unwrap(); + }, + ); + + TestEnvironment { + runtime_handle: dependencies.runtime.handle().clone(), + dependencies, + overseer_handle, + config, + network, + metrics, + } + } + + pub fn config(&self) -> &TestConfiguration { + &self.config + } + + pub fn network(&self) -> &NetworkEmulator { + &self.network + } + + pub fn registry(&self) -> &Registry { + &self.dependencies.registry + } + + pub fn metrics(&self) -> &TestEnvironmentMetrics { + &self.metrics + } + + pub fn runtime(&self) -> Handle { + self.runtime_handle.clone() + } + + // Send a message to the subsystem under test environment. + pub async fn send_message(&mut self, msg: AllMessages) { + self.overseer_handle + .send_msg(msg, LOG_TARGET) + .timeout(MAX_TIME_OF_FLIGHT) + .await + .unwrap_or_else(|| { + panic!("{}ms maximum time of flight breached", MAX_TIME_OF_FLIGHT.as_millis()) + }); + } + + // Send an `ActiveLeavesUpdate` signal to all subsystems under test. + pub async fn import_block(&mut self, block: BlockInfo) { + self.overseer_handle + .block_imported(block) + .timeout(MAX_TIME_OF_FLIGHT) + .await + .unwrap_or_else(|| { + panic!("{}ms maximum time of flight breached", MAX_TIME_OF_FLIGHT.as_millis()) + }); + } + + // Stop overseer and subsystems. + pub async fn stop(&mut self) { + self.overseer_handle.stop().await; + } +} + +impl Display for TestEnvironment { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let stats = self.network().stats(); + + writeln!(f, "\n")?; + writeln!( + f, + "Total received from network: {}", + format!( + "{} MiB", + stats + .iter() + .enumerate() + .map(|(_index, stats)| stats.tx_bytes_total as u128) + .sum::() / (1024 * 1024) + ) + .cyan() + )?; + writeln!( + f, + "Total sent to network: {}", + format!("{} KiB", stats[0].tx_bytes_total / (1024)).cyan() + )?; + + let test_metrics = super::display::parse_metrics(self.registry()); + let subsystem_cpu_metrics = + test_metrics.subset_with_label_value("task_group", "availability-recovery"); + let total_cpu = subsystem_cpu_metrics.sum_by("substrate_tasks_polling_duration_sum"); + writeln!(f, "Total subsystem CPU usage {}", format!("{:.2}s", total_cpu).bright_purple())?; + writeln!( + f, + "CPU usage per block {}", + format!("{:.2}s", total_cpu / self.config().num_blocks as f64).bright_purple() + )?; + + let test_env_cpu_metrics = + test_metrics.subset_with_label_value("task_group", "test-environment"); + let total_cpu = test_env_cpu_metrics.sum_by("substrate_tasks_polling_duration_sum"); + writeln!( + f, + "Total test environment CPU usage {}", + format!("{:.2}s", total_cpu).bright_purple() + )?; + writeln!( + f, + "CPU usage per block {}", + format!("{:.2}s", total_cpu / self.config().num_blocks as f64).bright_purple() + ) + } +} diff --git a/polkadot/node/subsystem-bench/src/core/keyring.rs b/polkadot/node/subsystem-bench/src/core/keyring.rs new file mode 100644 index 000000000000..2d9aa348a922 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/keyring.rs @@ -0,0 +1,40 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +pub use sp_core::sr25519; +use sp_core::{ + sr25519::{Pair, Public}, + Pair as PairT, +}; +/// Set of test accounts. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Keyring { + name: String, +} + +impl Keyring { + pub fn new(name: String) -> Keyring { + Self { name } + } + + pub fn pair(self) -> Pair { + Pair::from_string(&format!("//{}", self.name), None).expect("input is always good; qed") + } + + pub fn public(self) -> Public { + self.pair().public() + } +} diff --git a/polkadot/node/subsystem-bench/src/core/mock/av_store.rs b/polkadot/node/subsystem-bench/src/core/mock/av_store.rs new file mode 100644 index 000000000000..a471230f1b3f --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/mock/av_store.rs @@ -0,0 +1,137 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . +//! +//! A generic av store subsystem mockup suitable to be used in benchmarks. + +use parity_scale_codec::Encode; +use polkadot_primitives::CandidateHash; + +use std::collections::HashMap; + +use futures::{channel::oneshot, FutureExt}; + +use polkadot_node_primitives::ErasureChunk; + +use polkadot_node_subsystem::{ + messages::AvailabilityStoreMessage, overseer, SpawnedSubsystem, SubsystemError, +}; + +use polkadot_node_subsystem_types::OverseerSignal; + +pub struct AvailabilityStoreState { + candidate_hashes: HashMap, + chunks: Vec>, +} + +const LOG_TARGET: &str = "subsystem-bench::av-store-mock"; + +/// A mock of the availability store subsystem. This one also generates all the +/// candidates that a +pub struct MockAvailabilityStore { + state: AvailabilityStoreState, +} + +impl MockAvailabilityStore { + pub fn new( + chunks: Vec>, + candidate_hashes: HashMap, + ) -> MockAvailabilityStore { + Self { state: AvailabilityStoreState { chunks, candidate_hashes } } + } + + async fn respond_to_query_all_request( + &self, + candidate_hash: CandidateHash, + send_chunk: impl Fn(usize) -> bool, + tx: oneshot::Sender>, + ) { + let candidate_index = self + .state + .candidate_hashes + .get(&candidate_hash) + .expect("candidate was generated previously; qed"); + gum::debug!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); + + let v = self + .state + .chunks + .get(*candidate_index) + .unwrap() + .iter() + .filter(|c| send_chunk(c.index.0 as usize)) + .cloned() + .collect(); + + let _ = tx.send(v); + } +} + +#[overseer::subsystem(AvailabilityStore, error=SubsystemError, prefix=self::overseer)] +impl MockAvailabilityStore { + fn start(self, ctx: Context) -> SpawnedSubsystem { + let future = self.run(ctx).map(|_| Ok(())).boxed(); + + SpawnedSubsystem { name: "test-environment", future } + } +} + +#[overseer::contextbounds(AvailabilityStore, prefix = self::overseer)] +impl MockAvailabilityStore { + async fn run(self, mut ctx: Context) { + gum::debug!(target: LOG_TARGET, "Subsystem running"); + loop { + let msg = ctx.recv().await.expect("Overseer never fails us"); + + match msg { + orchestra::FromOrchestra::Signal(signal) => + if signal == OverseerSignal::Conclude { + return + }, + orchestra::FromOrchestra::Communication { msg } => match msg { + AvailabilityStoreMessage::QueryAvailableData(candidate_hash, tx) => { + gum::debug!(target: LOG_TARGET, candidate_hash = ?candidate_hash, "Responding to QueryAvailableData"); + + // We never have the full available data. + let _ = tx.send(None); + }, + AvailabilityStoreMessage::QueryAllChunks(candidate_hash, tx) => { + // We always have our own chunk. + gum::debug!(target: LOG_TARGET, candidate_hash = ?candidate_hash, "Responding to QueryAllChunks"); + self.respond_to_query_all_request(candidate_hash, |index| index == 0, tx) + .await; + }, + AvailabilityStoreMessage::QueryChunkSize(candidate_hash, tx) => { + gum::debug!(target: LOG_TARGET, candidate_hash = ?candidate_hash, "Responding to QueryChunkSize"); + + let candidate_index = self + .state + .candidate_hashes + .get(&candidate_hash) + .expect("candidate was generated previously; qed"); + gum::debug!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); + + let chunk_size = + self.state.chunks.get(*candidate_index).unwrap()[0].encoded_size(); + let _ = tx.send(Some(chunk_size)); + }, + _ => { + unimplemented!("Unexpected av-store message") + }, + }, + } + } + } +} diff --git a/polkadot/node/subsystem-bench/src/core/mock/dummy.rs b/polkadot/node/subsystem-bench/src/core/mock/dummy.rs new file mode 100644 index 000000000000..0628368a49c0 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/mock/dummy.rs @@ -0,0 +1,98 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . +//! Dummy subsystem mocks. +use paste::paste; + +use futures::FutureExt; +use polkadot_node_subsystem::{overseer, SpawnedSubsystem, SubsystemError}; +use std::time::Duration; +use tokio::time::sleep; + +const LOG_TARGET: &str = "subsystem-bench::mockery"; + +macro_rules! mock { + // Just query by relay parent + ($subsystem_name:ident) => { + paste! { + pub struct [] {} + #[overseer::subsystem($subsystem_name, error=SubsystemError, prefix=self::overseer)] + impl [] { + fn start(self, ctx: Context) -> SpawnedSubsystem { + let future = self.run(ctx).map(|_| Ok(())).boxed(); + + // The name will appear in substrate CPU task metrics as `task_group`.` + SpawnedSubsystem { name: "test-environment", future } + } + } + + #[overseer::contextbounds($subsystem_name, prefix = self::overseer)] + impl [] { + async fn run(self, mut ctx: Context) { + let mut count_total_msg = 0; + loop { + futures::select!{ + msg = ctx.recv().fuse() => { + match msg.unwrap() { + orchestra::FromOrchestra::Signal(signal) => { + match signal { + polkadot_node_subsystem_types::OverseerSignal::Conclude => {return}, + _ => {} + } + }, + orchestra::FromOrchestra::Communication { msg } => { + gum::debug!(target: LOG_TARGET, msg = ?msg, "mocked subsystem received message"); + } + } + + count_total_msg +=1; + } + _ = sleep(Duration::from_secs(6)).fuse() => { + if count_total_msg > 0 { + gum::trace!(target: LOG_TARGET, "Subsystem {} processed {} messages since last time", stringify!($subsystem_name), count_total_msg); + } + count_total_msg = 0; + } + } + } + } + } + } + }; +} + +mock!(AvailabilityStore); +mock!(StatementDistribution); +mock!(BitfieldSigning); +mock!(BitfieldDistribution); +mock!(Provisioner); +mock!(NetworkBridgeRx); +mock!(CollationGeneration); +mock!(CollatorProtocol); +mock!(GossipSupport); +mock!(DisputeDistribution); +mock!(DisputeCoordinator); +mock!(ProspectiveParachains); +mock!(PvfChecker); +mock!(CandidateBacking); +mock!(AvailabilityDistribution); +mock!(CandidateValidation); +mock!(AvailabilityRecovery); +mock!(NetworkBridgeTx); +mock!(ChainApi); +mock!(ChainSelection); +mock!(ApprovalVoting); +mock!(ApprovalDistribution); +mock!(RuntimeApi); diff --git a/polkadot/node/subsystem-bench/src/core/mock/mod.rs b/polkadot/node/subsystem-bench/src/core/mock/mod.rs new file mode 100644 index 000000000000..d59642e96058 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/mock/mod.rs @@ -0,0 +1,77 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use polkadot_node_subsystem::HeadSupportsParachains; +use polkadot_node_subsystem_types::Hash; + +pub mod av_store; +pub mod dummy; +pub mod network_bridge; +pub mod runtime_api; + +pub use av_store::*; +pub use network_bridge::*; +pub use runtime_api::*; + +pub struct AlwaysSupportsParachains {} +#[async_trait::async_trait] +impl HeadSupportsParachains for AlwaysSupportsParachains { + async fn head_supports_parachains(&self, _head: &Hash) -> bool { + true + } +} + +// An orchestra with dummy subsystems +macro_rules! dummy_builder { + ($spawn_task_handle: ident) => {{ + use super::core::mock::dummy::*; + + // Initialize a mock overseer. + // All subsystem except approval_voting and approval_distribution are mock subsystems. + Overseer::builder() + .approval_voting(MockApprovalVoting {}) + .approval_distribution(MockApprovalDistribution {}) + .availability_recovery(MockAvailabilityRecovery {}) + .candidate_validation(MockCandidateValidation {}) + .chain_api(MockChainApi {}) + .chain_selection(MockChainSelection {}) + .dispute_coordinator(MockDisputeCoordinator {}) + .runtime_api(MockRuntimeApi {}) + .network_bridge_tx(MockNetworkBridgeTx {}) + .availability_distribution(MockAvailabilityDistribution {}) + .availability_store(MockAvailabilityStore {}) + .pvf_checker(MockPvfChecker {}) + .candidate_backing(MockCandidateBacking {}) + .statement_distribution(MockStatementDistribution {}) + .bitfield_signing(MockBitfieldSigning {}) + .bitfield_distribution(MockBitfieldDistribution {}) + .provisioner(MockProvisioner {}) + .network_bridge_rx(MockNetworkBridgeRx {}) + .collation_generation(MockCollationGeneration {}) + .collator_protocol(MockCollatorProtocol {}) + .gossip_support(MockGossipSupport {}) + .dispute_distribution(MockDisputeDistribution {}) + .prospective_parachains(MockProspectiveParachains {}) + .activation_external_listeners(Default::default()) + .span_per_active_leaf(Default::default()) + .active_leaves(Default::default()) + .metrics(Default::default()) + .supports_parachains(AlwaysSupportsParachains {}) + .spawner(SpawnGlue($spawn_task_handle)) + }}; +} + +pub(crate) use dummy_builder; diff --git a/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs new file mode 100644 index 000000000000..b106b832011a --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs @@ -0,0 +1,323 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . +//! +//! A generic av store subsystem mockup suitable to be used in benchmarks. + +use futures::Future; +use parity_scale_codec::Encode; +use polkadot_node_subsystem_types::OverseerSignal; +use std::{collections::HashMap, pin::Pin}; + +use futures::FutureExt; + +use polkadot_node_primitives::{AvailableData, ErasureChunk}; + +use polkadot_primitives::CandidateHash; +use sc_network::{OutboundFailure, RequestFailure}; + +use polkadot_node_subsystem::{ + messages::NetworkBridgeTxMessage, overseer, SpawnedSubsystem, SubsystemError, +}; + +use polkadot_node_network_protocol::request_response::{ + self as req_res, v1::ChunkResponse, Requests, +}; +use polkadot_primitives::AuthorityDiscoveryId; + +use crate::core::{ + configuration::{random_error, random_latency, TestConfiguration}, + network::{NetworkAction, NetworkEmulator, RateLimit}, +}; + +/// The availability store state of all emulated peers. +/// The network bridge tx mock will respond to requests as if the request is being serviced +/// by a remote peer on the network +pub struct NetworkAvailabilityState { + pub candidate_hashes: HashMap, + pub available_data: Vec, + pub chunks: Vec>, +} + +const LOG_TARGET: &str = "subsystem-bench::network-bridge-tx-mock"; + +/// A mock of the network bridge tx subsystem. +pub struct MockNetworkBridgeTx { + /// The test configurationg + config: TestConfiguration, + /// The network availability state + availabilty: NetworkAvailabilityState, + /// A network emulator instance + network: NetworkEmulator, +} + +impl MockNetworkBridgeTx { + pub fn new( + config: TestConfiguration, + availabilty: NetworkAvailabilityState, + network: NetworkEmulator, + ) -> MockNetworkBridgeTx { + Self { config, availabilty, network } + } + + fn not_connected_response( + &self, + authority_discovery_id: &AuthorityDiscoveryId, + future: Pin + Send>>, + ) -> NetworkAction { + // The network action will send the error after a random delay expires. + return NetworkAction::new( + authority_discovery_id.clone(), + future, + 0, + // Generate a random latency based on configuration. + random_latency(self.config.latency.as_ref()), + ) + } + /// Returns an `NetworkAction` corresponding to the peer sending the response. If + /// the peer is connected, the error is sent with a randomized latency as defined in + /// configuration. + fn respond_to_send_request( + &mut self, + request: Requests, + ingress_tx: &mut tokio::sync::mpsc::UnboundedSender, + ) -> NetworkAction { + let ingress_tx = ingress_tx.clone(); + + match request { + Requests::ChunkFetchingV1(outgoing_request) => { + let authority_discovery_id = match outgoing_request.peer { + req_res::Recipient::Authority(authority_discovery_id) => authority_discovery_id, + _ => unimplemented!("Peer recipient not supported yet"), + }; + // Account our sent request bytes. + self.network.peer_stats(0).inc_sent(outgoing_request.payload.encoded_size()); + + // If peer is disconnected return an error + if !self.network.is_peer_connected(&authority_discovery_id) { + // We always send `NotConnected` error and we ignore `IfDisconnected` value in + // the caller. + let future = async move { + let _ = outgoing_request + .pending_response + .send(Err(RequestFailure::NotConnected)); + } + .boxed(); + return self.not_connected_response(&authority_discovery_id, future) + } + + // Account for remote received request bytes. + self.network + .peer_stats_by_id(&authority_discovery_id) + .inc_received(outgoing_request.payload.encoded_size()); + + let validator_index: usize = outgoing_request.payload.index.0 as usize; + let candidate_hash = outgoing_request.payload.candidate_hash; + + let candidate_index = self + .availabilty + .candidate_hashes + .get(&candidate_hash) + .expect("candidate was generated previously; qed"); + gum::warn!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); + + let chunk: ChunkResponse = self.availabilty.chunks.get(*candidate_index).unwrap() + [validator_index] + .clone() + .into(); + let mut size = chunk.encoded_size(); + + let response = if random_error(self.config.error) { + // Error will not account to any bandwidth used. + size = 0; + Err(RequestFailure::Network(OutboundFailure::ConnectionClosed)) + } else { + Ok(req_res::v1::ChunkFetchingResponse::from(Some(chunk)).encode()) + }; + + let authority_discovery_id_clone = authority_discovery_id.clone(); + + let future = async move { + let _ = outgoing_request.pending_response.send(response); + } + .boxed(); + + let future_wrapper = async move { + // Forward the response to the ingress channel of our node. + // On receive side we apply our node receiving rate limit. + let action = + NetworkAction::new(authority_discovery_id_clone, future, size, None); + ingress_tx.send(action).unwrap(); + } + .boxed(); + + NetworkAction::new( + authority_discovery_id, + future_wrapper, + size, + // Generate a random latency based on configuration. + random_latency(self.config.latency.as_ref()), + ) + }, + Requests::AvailableDataFetchingV1(outgoing_request) => { + let candidate_hash = outgoing_request.payload.candidate_hash; + let candidate_index = self + .availabilty + .candidate_hashes + .get(&candidate_hash) + .expect("candidate was generated previously; qed"); + gum::debug!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); + + let authority_discovery_id = match outgoing_request.peer { + req_res::Recipient::Authority(authority_discovery_id) => authority_discovery_id, + _ => unimplemented!("Peer recipient not supported yet"), + }; + + // Account our sent request bytes. + self.network.peer_stats(0).inc_sent(outgoing_request.payload.encoded_size()); + + // If peer is disconnected return an error + if !self.network.is_peer_connected(&authority_discovery_id) { + let future = async move { + let _ = outgoing_request + .pending_response + .send(Err(RequestFailure::NotConnected)); + } + .boxed(); + return self.not_connected_response(&authority_discovery_id, future) + } + + // Account for remote received request bytes. + self.network + .peer_stats_by_id(&authority_discovery_id) + .inc_received(outgoing_request.payload.encoded_size()); + + let available_data = + self.availabilty.available_data.get(*candidate_index).unwrap().clone(); + + let size = available_data.encoded_size(); + + let response = if random_error(self.config.error) { + Err(RequestFailure::Network(OutboundFailure::ConnectionClosed)) + } else { + Ok(req_res::v1::AvailableDataFetchingResponse::from(Some(available_data)) + .encode()) + }; + + let future = async move { + let _ = outgoing_request.pending_response.send(response); + } + .boxed(); + + let authority_discovery_id_clone = authority_discovery_id.clone(); + + let future_wrapper = async move { + // Forward the response to the ingress channel of our node. + // On receive side we apply our node receiving rate limit. + let action = + NetworkAction::new(authority_discovery_id_clone, future, size, None); + ingress_tx.send(action).unwrap(); + } + .boxed(); + + NetworkAction::new( + authority_discovery_id, + future_wrapper, + size, + // Generate a random latency based on configuration. + random_latency(self.config.latency.as_ref()), + ) + }, + _ => panic!("received an unexpected request"), + } + } +} + +#[overseer::subsystem(NetworkBridgeTx, error=SubsystemError, prefix=self::overseer)] +impl MockNetworkBridgeTx { + fn start(self, ctx: Context) -> SpawnedSubsystem { + let future = self.run(ctx).map(|_| Ok(())).boxed(); + + SpawnedSubsystem { name: "test-environment", future } + } +} + +#[overseer::contextbounds(NetworkBridgeTx, prefix = self::overseer)] +impl MockNetworkBridgeTx { + async fn run(mut self, mut ctx: Context) { + let (mut ingress_tx, mut ingress_rx) = + tokio::sync::mpsc::unbounded_channel::(); + + // Initialize our node bandwidth limits. + let mut rx_limiter = RateLimit::new(10, self.config.bandwidth); + + let our_network = self.network.clone(); + + // This task will handle node messages receipt from the simulated network. + ctx.spawn_blocking( + "network-receive", + async move { + while let Some(action) = ingress_rx.recv().await { + let size = action.size(); + + // account for our node receiving the data. + our_network.inc_received(size); + rx_limiter.reap(size).await; + action.run().await; + } + } + .boxed(), + ) + .expect("We never fail to spawn tasks"); + + // Main subsystem loop. + loop { + let msg = ctx.recv().await.expect("Overseer never fails us"); + + match msg { + orchestra::FromOrchestra::Signal(signal) => + if signal == OverseerSignal::Conclude { + return + }, + orchestra::FromOrchestra::Communication { msg } => match msg { + NetworkBridgeTxMessage::SendRequests(requests, _if_disconnected) => { + for request in requests { + gum::debug!(target: LOG_TARGET, request = ?request, "Processing request"); + self.network.inc_sent(request_size(&request)); + let action = self.respond_to_send_request(request, &mut ingress_tx); + + // Will account for our node sending the request over the emulated + // network. + self.network.submit_peer_action(action.peer(), action); + } + }, + _ => { + unimplemented!("Unexpected network bridge message") + }, + }, + } + } + } +} + +// A helper to determine the request payload size. +fn request_size(request: &Requests) -> usize { + match request { + Requests::ChunkFetchingV1(outgoing_request) => outgoing_request.payload.encoded_size(), + Requests::AvailableDataFetchingV1(outgoing_request) => + outgoing_request.payload.encoded_size(), + _ => unimplemented!("received an unexpected request"), + } +} diff --git a/polkadot/node/subsystem-bench/src/core/mock/runtime_api.rs b/polkadot/node/subsystem-bench/src/core/mock/runtime_api.rs new file mode 100644 index 000000000000..d664ebead3cc --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/mock/runtime_api.rs @@ -0,0 +1,110 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . +//! +//! A generic runtime api subsystem mockup suitable to be used in benchmarks. + +use polkadot_primitives::{GroupIndex, IndexedVec, SessionInfo, ValidatorIndex}; + +use polkadot_node_subsystem::{ + messages::{RuntimeApiMessage, RuntimeApiRequest}, + overseer, SpawnedSubsystem, SubsystemError, +}; +use polkadot_node_subsystem_types::OverseerSignal; + +use crate::core::configuration::{TestAuthorities, TestConfiguration}; +use futures::FutureExt; + +const LOG_TARGET: &str = "subsystem-bench::runtime-api-mock"; + +pub struct RuntimeApiState { + authorities: TestAuthorities, +} + +pub struct MockRuntimeApi { + state: RuntimeApiState, + config: TestConfiguration, +} + +impl MockRuntimeApi { + pub fn new(config: TestConfiguration, authorities: TestAuthorities) -> MockRuntimeApi { + Self { state: RuntimeApiState { authorities }, config } + } + + fn session_info(&self) -> SessionInfo { + let all_validators = (0..self.config.n_validators) + .map(|i| ValidatorIndex(i as _)) + .collect::>(); + + let validator_groups = all_validators.chunks(5).map(Vec::from).collect::>(); + + SessionInfo { + validators: self.state.authorities.validator_public.clone().into(), + discovery_keys: self.state.authorities.validator_authority_id.clone(), + validator_groups: IndexedVec::>::from(validator_groups), + assignment_keys: vec![], + n_cores: self.config.n_cores as u32, + zeroth_delay_tranche_width: 0, + relay_vrf_modulo_samples: 0, + n_delay_tranches: 0, + no_show_slots: 0, + needed_approvals: 0, + active_validator_indices: vec![], + dispute_period: 6, + random_seed: [0u8; 32], + } + } +} + +#[overseer::subsystem(RuntimeApi, error=SubsystemError, prefix=self::overseer)] +impl MockRuntimeApi { + fn start(self, ctx: Context) -> SpawnedSubsystem { + let future = self.run(ctx).map(|_| Ok(())).boxed(); + + SpawnedSubsystem { name: "test-environment", future } + } +} + +#[overseer::contextbounds(RuntimeApi, prefix = self::overseer)] +impl MockRuntimeApi { + async fn run(self, mut ctx: Context) { + loop { + let msg = ctx.recv().await.expect("Overseer never fails us"); + + match msg { + orchestra::FromOrchestra::Signal(signal) => + if signal == OverseerSignal::Conclude { + return + }, + orchestra::FromOrchestra::Communication { msg } => { + gum::debug!(target: LOG_TARGET, msg=?msg, "recv message"); + + match msg { + RuntimeApiMessage::Request( + _request, + RuntimeApiRequest::SessionInfo(_session_index, sender), + ) => { + let _ = sender.send(Ok(Some(self.session_info()))); + }, + // Long term TODO: implement more as needed. + _ => { + unimplemented!("Unexpected runtime-api message") + }, + } + }, + } + } + } +} diff --git a/polkadot/node/subsystem-bench/src/core/mod.rs b/polkadot/node/subsystem-bench/src/core/mod.rs new file mode 100644 index 000000000000..282788d143b4 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/mod.rs @@ -0,0 +1,24 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +const LOG_TARGET: &str = "subsystem-bench::core"; + +pub mod configuration; +pub mod display; +pub mod environment; +pub mod keyring; +pub mod mock; +pub mod network; diff --git a/polkadot/node/subsystem-bench/src/core/network.rs b/polkadot/node/subsystem-bench/src/core/network.rs new file mode 100644 index 000000000000..c4e20b421d34 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/network.rs @@ -0,0 +1,485 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . +use super::{ + configuration::{TestAuthorities, TestConfiguration}, + environment::TestEnvironmentDependencies, + *, +}; +use colored::Colorize; +use polkadot_primitives::AuthorityDiscoveryId; +use prometheus_endpoint::U64; +use rand::{seq::SliceRandom, thread_rng}; +use sc_service::SpawnTaskHandle; +use std::{ + collections::HashMap, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::{Duration, Instant}, +}; +use tokio::sync::mpsc::UnboundedSender; + +// An emulated node egress traffic rate_limiter. +#[derive(Debug)] +pub struct RateLimit { + // How often we refill credits in buckets + tick_rate: usize, + // Total ticks + total_ticks: usize, + // Max refill per tick + max_refill: usize, + // Available credit. We allow for bursts over 1/tick_rate of `cps` budget, but we + // account it by negative credit. + credits: isize, + // When last refilled. + last_refill: Instant, +} + +impl RateLimit { + // Create a new `RateLimit` from a `cps` (credits per second) budget and + // `tick_rate`. + pub fn new(tick_rate: usize, cps: usize) -> Self { + // Compute how much refill for each tick + let max_refill = cps / tick_rate; + RateLimit { + tick_rate, + total_ticks: 0, + max_refill, + // A fresh start + credits: max_refill as isize, + last_refill: Instant::now(), + } + } + + pub async fn refill(&mut self) { + // If this is called to early, we need to sleep until next tick. + let now = Instant::now(); + let next_tick_delta = + (self.last_refill + Duration::from_millis(1000 / self.tick_rate as u64)) - now; + + // Sleep until next tick. + if !next_tick_delta.is_zero() { + gum::trace!(target: LOG_TARGET, "need to sleep {}ms", next_tick_delta.as_millis()); + tokio::time::sleep(next_tick_delta).await; + } + + self.total_ticks += 1; + self.credits += self.max_refill as isize; + self.last_refill = Instant::now(); + } + + // Reap credits from the bucket. + // Blocks if credits budged goes negative during call. + pub async fn reap(&mut self, amount: usize) { + self.credits -= amount as isize; + + if self.credits >= 0 { + return + } + + while self.credits < 0 { + gum::trace!(target: LOG_TARGET, "Before refill: {:?}", &self); + self.refill().await; + gum::trace!(target: LOG_TARGET, "After refill: {:?}", &self); + } + } +} + +#[cfg(test)] +mod tests { + use std::time::Instant; + + use super::RateLimit; + + #[tokio::test] + async fn test_expected_rate() { + let tick_rate = 200; + let budget = 1_000_000; + // rate must not exceeed 100 credits per second + let mut rate_limiter = RateLimit::new(tick_rate, budget); + let mut total_sent = 0usize; + let start = Instant::now(); + + let mut reap_amount = 0; + while rate_limiter.total_ticks < tick_rate { + reap_amount += 1; + reap_amount %= 100; + + rate_limiter.reap(reap_amount).await; + total_sent += reap_amount; + } + + let end = Instant::now(); + + println!("duration: {}", (end - start).as_millis()); + + // Allow up to `budget/max_refill` error tolerance + let lower_bound = budget as u128 * ((end - start).as_millis() / 1000u128); + let upper_bound = budget as u128 * + ((end - start).as_millis() / 1000u128 + rate_limiter.max_refill as u128); + assert!(total_sent as u128 >= lower_bound); + assert!(total_sent as u128 <= upper_bound); + } +} + +// A network peer emulator. It spawns a task that accepts `NetworkActions` and +// executes them with a configurable delay and bandwidth constraints. Tipically +// these actions wrap a future that performs a channel send to the subsystem(s) under test. +#[derive(Clone)] +struct PeerEmulator { + // The queue of requests waiting to be served by the emulator + actions_tx: UnboundedSender, +} + +impl PeerEmulator { + pub fn new( + bandwidth: usize, + spawn_task_handle: SpawnTaskHandle, + stats: Arc, + ) -> Self { + let (actions_tx, mut actions_rx) = tokio::sync::mpsc::unbounded_channel(); + + spawn_task_handle + .clone() + .spawn("peer-emulator", "test-environment", async move { + // Rate limit peer send. + let mut rate_limiter = RateLimit::new(10, bandwidth); + loop { + let stats_clone = stats.clone(); + let maybe_action: Option = actions_rx.recv().await; + if let Some(action) = maybe_action { + let size = action.size(); + rate_limiter.reap(size).await; + if let Some(latency) = action.latency { + spawn_task_handle.spawn( + "peer-emulator-latency", + "test-environment", + async move { + tokio::time::sleep(latency).await; + action.run().await; + stats_clone.inc_sent(size); + }, + ) + } else { + action.run().await; + stats_clone.inc_sent(size); + } + } else { + break + } + } + }); + + Self { actions_tx } + } + + // Queue a send request from the emulated peer. + pub fn send(&mut self, action: NetworkAction) { + self.actions_tx.send(action).expect("peer emulator task lives"); + } +} + +pub type ActionFuture = std::pin::Pin + std::marker::Send>>; +/// An network action to be completed by the emulator task. +pub struct NetworkAction { + // The function that performs the action + run: ActionFuture, + // The payload size that we simulate sending/receiving from a peer + size: usize, + // Peer which should run the action. + peer: AuthorityDiscoveryId, + // The amount of time to delay the polling `run` + latency: Option, +} + +unsafe impl Send for NetworkAction {} + +/// Book keeping of sent and received bytes. +pub struct PeerEmulatorStats { + rx_bytes_total: AtomicU64, + tx_bytes_total: AtomicU64, + metrics: Metrics, + peer_index: usize, +} + +impl PeerEmulatorStats { + pub(crate) fn new(peer_index: usize, metrics: Metrics) -> Self { + Self { + metrics, + rx_bytes_total: AtomicU64::from(0), + tx_bytes_total: AtomicU64::from(0), + peer_index, + } + } + + pub fn inc_sent(&self, bytes: usize) { + self.tx_bytes_total.fetch_add(bytes as u64, Ordering::Relaxed); + self.metrics.on_peer_sent(self.peer_index, bytes); + } + + pub fn inc_received(&self, bytes: usize) { + self.rx_bytes_total.fetch_add(bytes as u64, Ordering::Relaxed); + self.metrics.on_peer_received(self.peer_index, bytes); + } + + pub fn sent(&self) -> u64 { + self.tx_bytes_total.load(Ordering::Relaxed) + } + + pub fn received(&self) -> u64 { + self.rx_bytes_total.load(Ordering::Relaxed) + } +} + +#[derive(Debug, Default)] +pub struct PeerStats { + pub rx_bytes_total: u64, + pub tx_bytes_total: u64, +} +impl NetworkAction { + pub fn new( + peer: AuthorityDiscoveryId, + run: ActionFuture, + size: usize, + latency: Option, + ) -> Self { + Self { run, size, peer, latency } + } + + pub fn size(&self) -> usize { + self.size + } + + pub async fn run(self) { + self.run.await; + } + + pub fn peer(&self) -> AuthorityDiscoveryId { + self.peer.clone() + } +} + +/// The state of a peer on the emulated network. +#[derive(Clone)] +enum Peer { + Connected(PeerEmulator), + Disconnected(PeerEmulator), +} + +impl Peer { + pub fn disconnect(&mut self) { + let new_self = match self { + Peer::Connected(peer) => Peer::Disconnected(peer.clone()), + _ => return, + }; + *self = new_self; + } + + pub fn is_connected(&self) -> bool { + matches!(self, Peer::Connected(_)) + } + + pub fn emulator(&mut self) -> &mut PeerEmulator { + match self { + Peer::Connected(ref mut emulator) => emulator, + Peer::Disconnected(ref mut emulator) => emulator, + } + } +} + +/// Mocks the network bridge and an arbitrary number of connected peer nodes. +/// Implements network latency, bandwidth and connection errors. +#[derive(Clone)] +pub struct NetworkEmulator { + // Per peer network emulation. + peers: Vec, + /// Per peer stats. + stats: Vec>, + /// Each emulated peer is a validator. + validator_authority_ids: HashMap, +} + +impl NetworkEmulator { + pub fn new( + config: &TestConfiguration, + dependencies: &TestEnvironmentDependencies, + authorities: &TestAuthorities, + ) -> Self { + let n_peers = config.n_validators; + gum::info!(target: LOG_TARGET, "{}",format!("Initializing emulation for a {} peer network.", n_peers).bright_blue()); + gum::info!(target: LOG_TARGET, "{}",format!("connectivity {}%, error {}%", config.connectivity, config.error).bright_black()); + + let metrics = + Metrics::new(&dependencies.registry).expect("Metrics always register succesfully"); + let mut validator_authority_id_mapping = HashMap::new(); + + // Create a `PeerEmulator` for each peer. + let (stats, mut peers): (_, Vec<_>) = (0..n_peers) + .zip(authorities.validator_authority_id.clone()) + .map(|(peer_index, authority_id)| { + validator_authority_id_mapping.insert(authority_id, peer_index); + let stats = Arc::new(PeerEmulatorStats::new(peer_index, metrics.clone())); + ( + stats.clone(), + Peer::Connected(PeerEmulator::new( + config.peer_bandwidth, + dependencies.task_manager.spawn_handle(), + stats, + )), + ) + }) + .unzip(); + + let connected_count = config.n_validators as f64 / (100.0 / config.connectivity as f64); + + let (_connected, to_disconnect) = + peers.partial_shuffle(&mut thread_rng(), connected_count as usize); + + for peer in to_disconnect { + peer.disconnect(); + } + + gum::info!(target: LOG_TARGET, "{}",format!("Network created, connected validator count {}", connected_count).bright_black()); + + Self { peers, stats, validator_authority_ids: validator_authority_id_mapping } + } + + pub fn is_peer_connected(&self, peer: &AuthorityDiscoveryId) -> bool { + self.peer(peer).is_connected() + } + + pub fn submit_peer_action(&mut self, peer: AuthorityDiscoveryId, action: NetworkAction) { + let index = self + .validator_authority_ids + .get(&peer) + .expect("all test authorities are valid; qed"); + + let peer = self.peers.get_mut(*index).expect("We just retrieved the index above; qed"); + + // Only actions of size 0 are allowed on disconnected peers. + // Typically this are delayed error response sends. + if action.size() > 0 && !peer.is_connected() { + gum::warn!(target: LOG_TARGET, peer_index = index, "Attempted to send data from a disconnected peer, operation ignored"); + return + } + + peer.emulator().send(action); + } + + // Returns the sent/received stats for `peer_index`. + pub fn peer_stats(&self, peer_index: usize) -> Arc { + self.stats[peer_index].clone() + } + + // Helper to get peer index by `AuthorityDiscoveryId` + fn peer_index(&self, peer: &AuthorityDiscoveryId) -> usize { + *self + .validator_authority_ids + .get(peer) + .expect("all test authorities are valid; qed") + } + + // Return the Peer entry for a given `AuthorityDiscoveryId`. + fn peer(&self, peer: &AuthorityDiscoveryId) -> &Peer { + &self.peers[self.peer_index(peer)] + } + // Returns the sent/received stats for `peer`. + pub fn peer_stats_by_id(&mut self, peer: &AuthorityDiscoveryId) -> Arc { + let peer_index = self.peer_index(peer); + + self.stats[peer_index].clone() + } + + // Returns the sent/received stats for all peers. + pub fn stats(&self) -> Vec { + let r = self + .stats + .iter() + .map(|stats| PeerStats { + rx_bytes_total: stats.received(), + tx_bytes_total: stats.sent(), + }) + .collect::>(); + r + } + + // Increment bytes sent by our node (the node that contains the subsystem under test) + pub fn inc_sent(&self, bytes: usize) { + // Our node always is peer 0. + self.peer_stats(0).inc_sent(bytes); + } + + // Increment bytes received by our node (the node that contains the subsystem under test) + pub fn inc_received(&self, bytes: usize) { + // Our node always is peer 0. + self.peer_stats(0).inc_received(bytes); + } +} + +use polkadot_node_subsystem_util::metrics::prometheus::{ + self, CounterVec, Opts, PrometheusError, Registry, +}; + +/// Emulated network metrics. +#[derive(Clone)] +pub(crate) struct Metrics { + /// Number of bytes sent per peer. + peer_total_sent: CounterVec, + /// Number of received sent per peer. + peer_total_received: CounterVec, +} + +impl Metrics { + pub fn new(registry: &Registry) -> Result { + Ok(Self { + peer_total_sent: prometheus::register( + CounterVec::new( + Opts::new( + "subsystem_benchmark_network_peer_total_bytes_sent", + "Total number of bytes a peer has sent.", + ), + &["peer"], + )?, + registry, + )?, + peer_total_received: prometheus::register( + CounterVec::new( + Opts::new( + "subsystem_benchmark_network_peer_total_bytes_received", + "Total number of bytes a peer has received.", + ), + &["peer"], + )?, + registry, + )?, + }) + } + + /// Increment total sent for a peer. + pub fn on_peer_sent(&self, peer_index: usize, bytes: usize) { + self.peer_total_sent + .with_label_values(vec![format!("node{}", peer_index).as_str()].as_slice()) + .inc_by(bytes as u64); + } + + /// Increment total receioved for a peer. + pub fn on_peer_received(&self, peer_index: usize, bytes: usize) { + self.peer_total_received + .with_label_values(vec![format!("node{}", peer_index).as_str()].as_slice()) + .inc_by(bytes as u64); + } +} diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs new file mode 100644 index 000000000000..da7e5441f748 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -0,0 +1,186 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! A tool for running subsystem benchmark tests designed for development and +//! CI regression testing. +use clap::Parser; +use color_eyre::eyre; + +use colored::Colorize; +use std::{path::Path, time::Duration}; + +pub(crate) mod availability; +pub(crate) mod cli; +pub(crate) mod core; + +use availability::{prepare_test, NetworkEmulation, TestState}; +use cli::TestObjective; + +use core::{ + configuration::TestConfiguration, + environment::{TestEnvironment, GENESIS_HASH}, +}; + +use clap_num::number_range; + +use crate::core::display::display_configuration; + +fn le_100(s: &str) -> Result { + number_range(s, 0, 100) +} + +fn le_5000(s: &str) -> Result { + number_range(s, 0, 5000) +} + +#[derive(Debug, Parser)] +#[allow(missing_docs)] +struct BenchCli { + #[arg(long, value_enum, ignore_case = true, default_value_t = NetworkEmulation::Ideal)] + /// The type of network to be emulated + pub network: NetworkEmulation, + + #[clap(flatten)] + pub standard_configuration: cli::StandardTestOptions, + + #[clap(short, long)] + /// The bandwidth of simulated remote peers in KiB + pub peer_bandwidth: Option, + + #[clap(short, long)] + /// The bandwidth of our simulated node in KiB + pub bandwidth: Option, + + #[clap(long, value_parser=le_100)] + /// Simulated conection error ratio [0-100]. + pub peer_error: Option, + + #[clap(long, value_parser=le_5000)] + /// Minimum remote peer latency in milliseconds [0-5000]. + pub peer_min_latency: Option, + + #[clap(long, value_parser=le_5000)] + /// Maximum remote peer latency in milliseconds [0-5000]. + pub peer_max_latency: Option, + + #[command(subcommand)] + pub objective: cli::TestObjective, +} + +impl BenchCli { + fn launch(self) -> eyre::Result<()> { + let configuration = self.standard_configuration; + let mut test_config = match self.objective { + TestObjective::TestSequence(options) => { + let test_sequence = + core::configuration::TestSequence::new_from_file(Path::new(&options.path)) + .expect("File exists") + .into_vec(); + let num_steps = test_sequence.len(); + gum::info!( + "{}", + format!("Sequence contains {} step(s)", num_steps).bright_purple() + ); + for (index, test_config) in test_sequence.into_iter().enumerate() { + gum::info!("{}", format!("Step {}/{}", index + 1, num_steps).bright_purple(),); + display_configuration(&test_config); + + let mut state = TestState::new(&test_config); + let (mut env, _protocol_config) = prepare_test(test_config, &mut state); + env.runtime() + .block_on(availability::benchmark_availability_read(&mut env, state)); + } + return Ok(()) + }, + TestObjective::DataAvailabilityRead(ref _options) => match self.network { + NetworkEmulation::Healthy => TestConfiguration::healthy_network( + self.objective, + configuration.num_blocks, + configuration.n_validators, + configuration.n_cores, + configuration.min_pov_size, + configuration.max_pov_size, + ), + NetworkEmulation::Degraded => TestConfiguration::degraded_network( + self.objective, + configuration.num_blocks, + configuration.n_validators, + configuration.n_cores, + configuration.min_pov_size, + configuration.max_pov_size, + ), + NetworkEmulation::Ideal => TestConfiguration::ideal_network( + self.objective, + configuration.num_blocks, + configuration.n_validators, + configuration.n_cores, + configuration.min_pov_size, + configuration.max_pov_size, + ), + }, + }; + + let mut latency_config = test_config.latency.clone().unwrap_or_default(); + + if let Some(latency) = self.peer_min_latency { + latency_config.min_latency = Duration::from_millis(latency); + } + + if let Some(latency) = self.peer_max_latency { + latency_config.max_latency = Duration::from_millis(latency); + } + + if let Some(error) = self.peer_error { + test_config.error = error; + } + + if let Some(bandwidth) = self.peer_bandwidth { + // CLI expects bw in KiB + test_config.peer_bandwidth = bandwidth * 1024; + } + + if let Some(bandwidth) = self.bandwidth { + // CLI expects bw in KiB + test_config.bandwidth = bandwidth * 1024; + } + + display_configuration(&test_config); + + let mut state = TestState::new(&test_config); + let (mut env, _protocol_config) = prepare_test(test_config, &mut state); + // test_config.write_to_disk(); + env.runtime() + .block_on(availability::benchmark_availability_read(&mut env, state)); + + Ok(()) + } +} + +fn main() -> eyre::Result<()> { + color_eyre::install()?; + env_logger::builder() + .filter(Some("hyper"), log::LevelFilter::Info) + // Avoid `Terminating due to subsystem exit subsystem` warnings + .filter(Some("polkadot_overseer"), log::LevelFilter::Error) + .filter(None, log::LevelFilter::Info) + // .filter(None, log::LevelFilter::Trace) + .try_init() + .unwrap(); + + let cli: BenchCli = BenchCli::parse(); + cli.launch()?; + Ok(()) +} diff --git a/polkadot/node/subsystem-test-helpers/Cargo.toml b/polkadot/node/subsystem-test-helpers/Cargo.toml index eb6e10559c2a..7b616bdb4382 100644 --- a/polkadot/node/subsystem-test-helpers/Cargo.toml +++ b/polkadot/node/subsystem-test-helpers/Cargo.toml @@ -15,8 +15,11 @@ async-trait = "0.1.57" futures = "0.3.21" parking_lot = "0.12.0" polkadot-node-subsystem = { path = "../subsystem" } +polkadot-erasure-coding = { path = "../../erasure-coding" } polkadot-node-subsystem-util = { path = "../subsystem-util" } polkadot-primitives = { path = "../../primitives" } +polkadot-node-primitives = { path = "../primitives" } + sc-client-api = { path = "../../../substrate/client/api" } sc-utils = { path = "../../../substrate/client/utils" } sp-core = { path = "../../../substrate/primitives/core" } diff --git a/polkadot/node/subsystem-test-helpers/src/lib.rs b/polkadot/node/subsystem-test-helpers/src/lib.rs index 3f92513498c4..dfa78e04b8c9 100644 --- a/polkadot/node/subsystem-test-helpers/src/lib.rs +++ b/polkadot/node/subsystem-test-helpers/src/lib.rs @@ -18,11 +18,14 @@ #![warn(missing_docs)] +use polkadot_erasure_coding::{branches, obtain_chunks_v1 as obtain_chunks}; +use polkadot_node_primitives::{AvailableData, ErasureChunk, Proof}; use polkadot_node_subsystem::{ messages::AllMessages, overseer, FromOrchestra, OverseerSignal, SpawnGlue, SpawnedSubsystem, SubsystemError, SubsystemResult, TrySendError, }; use polkadot_node_subsystem_util::TimeoutExt; +use polkadot_primitives::{Hash, ValidatorIndex}; use futures::{channel::mpsc, poll, prelude::*}; use parking_lot::Mutex; @@ -440,6 +443,34 @@ impl Future for Yield { } } +/// Helper for chunking available data. +pub fn derive_erasure_chunks_with_proofs_and_root( + n_validators: usize, + available_data: &AvailableData, + alter_chunk: impl Fn(usize, &mut Vec), +) -> (Vec, Hash) { + let mut chunks: Vec> = obtain_chunks(n_validators, available_data).unwrap(); + + for (i, chunk) in chunks.iter_mut().enumerate() { + alter_chunk(i, chunk) + } + + // create proofs for each erasure chunk + let branches = branches(chunks.as_ref()); + + let root = branches.root(); + let erasure_chunks = branches + .enumerate() + .map(|(index, (proof, chunk))| ErasureChunk { + chunk: chunk.to_vec(), + index: ValidatorIndex(index as _), + proof: Proof::try_from(proof).unwrap(), + }) + .collect::>(); + + (erasure_chunks, root) +} + #[cfg(test)] mod tests { use super::*; diff --git a/polkadot/node/subsystem-test-helpers/src/mock.rs b/polkadot/node/subsystem-test-helpers/src/mock.rs index 522bc3c2cc4f..14026960ac13 100644 --- a/polkadot/node/subsystem-test-helpers/src/mock.rs +++ b/polkadot/node/subsystem-test-helpers/src/mock.rs @@ -16,7 +16,7 @@ use std::sync::Arc; -use polkadot_node_subsystem::{jaeger, ActivatedLeaf}; +use polkadot_node_subsystem::{jaeger, ActivatedLeaf, BlockInfo}; use sc_client_api::UnpinHandle; use sc_keystore::LocalKeystore; use sc_utils::mpsc::tracing_unbounded; @@ -59,3 +59,8 @@ pub fn new_leaf(hash: Hash, number: BlockNumber) -> ActivatedLeaf { span: Arc::new(jaeger::Span::Disabled), } } + +/// Create a new leaf with the given hash and number. +pub fn new_block_import_info(hash: Hash, number: BlockNumber) -> BlockInfo { + BlockInfo { hash, parent_hash: Hash::default(), number, unpin_handle: dummy_unpin_handle(hash) } +}