From 624300d58deda9aea058f2dac58bdfab1a87e13f Mon Sep 17 00:00:00 2001
From: mwtian <81660174+mwtian@users.noreply.github.com>
Date: Fri, 30 Aug 2024 14:39:32 -0700
Subject: [PATCH] Cherrypick additional rocksdb metrics (#19172)

## Description

#19112: this emits more metrics from rocksdb, to help with investigations into external fullnode write stalls.

## Test plan

CI

---

## Release notes

Check each box that your changes affect. If none of the boxes relate to your changes, release notes aren't required.

For each box you select, include information after the relevant heading that describes the impact of your changes that a user might notice and any actions they must take to implement updates.

- [ ] Protocol:
- [ ] Nodes (Validators and Full nodes):
- [ ] Indexer:
- [ ] JSON-RPC:
- [ ] GraphQL:
- [ ] CLI:
- [ ] Rust SDK:
- [ ] REST API:

## Description

Describe the changes or additions included in this PR.

## Test plan

How did you test the new or updated feature?

---

## Release notes

Check each box that your changes affect. If none of the boxes relate to your changes, release notes aren't required.

For each box you select, include information after the relevant heading that describes the impact of your changes that a user might notice and any actions they must take to implement updates.

- [ ] Protocol: - [ ] Nodes (Validators and Full nodes): - [ ] Indexer: - [ ] JSON-RPC: - [ ] GraphQL: - [ ] CLI: - [ ] Rust SDK: - [ ] REST API: --- crates/typed-store/src/metrics.rs | 34 ++++++++++++++++++++++++++- crates/typed-store/src/rocks/mod.rs | 36 +++++++++++++++++++++++++++-- 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/crates/typed-store/src/metrics.rs b/crates/typed-store/src/metrics.rs index 904ad5a31cf67..f7049d8d63768 100644 --- a/crates/typed-store/src/metrics.rs +++ b/crates/typed-store/src/metrics.rs @@ -77,6 +77,7 @@ impl SamplingInterval { pub struct ColumnFamilyMetrics { pub rocksdb_total_sst_files_size: IntGaugeVec, pub rocksdb_total_blob_files_size: IntGaugeVec, + pub rocksdb_current_size_active_mem_tables: IntGaugeVec, pub rocksdb_size_all_mem_tables: IntGaugeVec, pub rocksdb_num_snapshots: IntGaugeVec, pub rocksdb_oldest_snapshot_time: IntGaugeVec, @@ -86,13 +87,16 @@ pub struct ColumnFamilyMetrics { pub rocksdb_block_cache_usage: IntGaugeVec, pub rocksdb_block_cache_pinned_usage: IntGaugeVec, pub rocksdb_estimate_table_readers_mem: IntGaugeVec, + pub rocksdb_num_immutable_mem_tables: IntGaugeVec, pub rocksdb_mem_table_flush_pending: IntGaugeVec, pub rocksdb_compaction_pending: IntGaugeVec, + pub rocksdb_estimate_pending_compaction_bytes: IntGaugeVec, pub rocksdb_num_running_compactions: IntGaugeVec, pub rocksdb_num_running_flushes: IntGaugeVec, pub rocksdb_estimate_oldest_key_time: IntGaugeVec, pub rocksdb_background_errors: IntGaugeVec, pub rocksdb_estimated_num_keys: IntGaugeVec, + pub rocksdb_base_level: IntGaugeVec, } impl ColumnFamilyMetrics { @@ -112,6 +116,13 @@ impl ColumnFamilyMetrics { registry, ) .unwrap(), + rocksdb_current_size_active_mem_tables: register_int_gauge_vec_with_registry!( + "rocksdb_current_size_active_mem_tables", + "The current approximate size of active memtable (bytes).", + &["cf_name"], + registry, + ) + .unwrap(), rocksdb_size_all_mem_tables: register_int_gauge_vec_with_registry!( 
"rocksdb_size_all_mem_tables", "The memory size occupied by the column family's in-memory buffer", @@ -177,6 +188,13 @@ impl ColumnFamilyMetrics { registry, ) .unwrap(), + rocksdb_num_immutable_mem_tables: register_int_gauge_vec_with_registry!( + "rocksdb_num_immutable_mem_tables", + "The number of immutable memtables that have not yet been flushed.", + &["cf_name"], + registry, + ) + .unwrap(), rocksdb_mem_table_flush_pending: register_int_gauge_vec_with_registry!( "rocksdb_mem_table_flush_pending", "A 1 or 0 flag indicating whether a memtable flush is pending. @@ -198,6 +216,14 @@ impl ColumnFamilyMetrics { registry, ) .unwrap(), + rocksdb_estimate_pending_compaction_bytes: register_int_gauge_vec_with_registry!( + "rocksdb_estimate_pending_compaction_bytes", + "Estimated total number of bytes compaction needs to rewrite to get all levels down + to under target size. Not valid for other compactions than level-based.", + &["cf_name"], + registry, + ) + .unwrap(), rocksdb_num_running_compactions: register_int_gauge_vec_with_registry!( "rocksdb_num_running_compactions", "The number of compactions that are currently running for the column family.", @@ -234,7 +260,13 @@ impl ColumnFamilyMetrics { registry, ) .unwrap(), - + rocksdb_base_level: register_int_gauge_vec_with_registry!( + "rocksdb_base_level", + "The number of level to which L0 data will be compacted.", + &["cf_name"], + registry, + ) + .unwrap(), } } } diff --git a/crates/typed-store/src/rocks/mod.rs b/crates/typed-store/src/rocks/mod.rs index ca6505b9a448e..6bf550f4aa131 100644 --- a/crates/typed-store/src/rocks/mod.rs +++ b/crates/typed-store/src/rocks/mod.rs @@ -704,7 +704,7 @@ impl MetricConf { } } } -const CF_METRICS_REPORT_PERIOD_MILLIS: u64 = 1000; +const CF_METRICS_REPORT_PERIOD_SECS: u64 = 30; const METRICS_ERROR: i64 = -1; /// An interface to a rocksDB database, keyed by a columnfamily @@ -740,7 +740,7 @@ impl DBMap { if !is_deprecated { tokio::task::spawn(async move { let mut interval = - 
tokio::time::interval(Duration::from_millis(CF_METRICS_REPORT_PERIOD_MILLIS)); + tokio::time::interval(Duration::from_secs(CF_METRICS_REPORT_PERIOD_SECS)); loop { tokio::select! { _ = interval.tick() => { @@ -983,6 +983,14 @@ impl DBMap { Self::get_int_property(rocksdb, &cf, ROCKSDB_PROPERTY_TOTAL_BLOB_FILES_SIZE) .unwrap_or(METRICS_ERROR), ); + db_metrics + .cf_metrics + .rocksdb_current_size_active_mem_tables + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::CUR_SIZE_ACTIVE_MEM_TABLE) + .unwrap_or(METRICS_ERROR), + ); db_metrics .cf_metrics .rocksdb_size_all_mem_tables @@ -1063,6 +1071,14 @@ impl DBMap { Self::get_int_property(rocksdb, &cf, properties::ESTIMATE_NUM_KEYS) .unwrap_or(METRICS_ERROR), ); + db_metrics + .cf_metrics + .rocksdb_num_immutable_mem_tables + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::NUM_IMMUTABLE_MEM_TABLE) + .unwrap_or(METRICS_ERROR), + ); db_metrics .cf_metrics .rocksdb_mem_table_flush_pending @@ -1079,6 +1095,14 @@ impl DBMap { Self::get_int_property(rocksdb, &cf, properties::COMPACTION_PENDING) .unwrap_or(METRICS_ERROR), ); + db_metrics + .cf_metrics + .rocksdb_estimate_pending_compaction_bytes + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::ESTIMATE_PENDING_COMPACTION_BYTES) + .unwrap_or(METRICS_ERROR), + ); db_metrics .cf_metrics .rocksdb_num_running_compactions @@ -1111,6 +1135,14 @@ impl DBMap { Self::get_int_property(rocksdb, &cf, properties::BACKGROUND_ERRORS) .unwrap_or(METRICS_ERROR), ); + db_metrics + .cf_metrics + .rocksdb_base_level + .with_label_values(&[cf_name]) + .set( + Self::get_int_property(rocksdb, &cf, properties::BASE_LEVEL) + .unwrap_or(METRICS_ERROR), + ); } pub fn transaction(&self) -> Result, TypedStoreError> {