From f7604cd1128bcd701a106a8dbec3e8ff31977079 Mon Sep 17 00:00:00 2001
From: Jan Ciolek <149345204+jancionear@users.noreply.github.com>
Date: Thu, 11 Apr 2024 13:06:00 +0200
Subject: [PATCH] Add metric for recorded storage proof size, also calculate
 adjusted size based on number of trie removals (#11000)

In https://github.com/near/nearcore/issues/10890 we considered adding an extra
2000 bytes to the storage proof size for every removal operation on the trie
(Idea 1). This PR implements the logic of recording the number of removals and
calculating the adjusted storage proof size. It's not used in the soft witness
size limit for now, as we still have to evaluate how viable a solution it is.
There are concerns that charging an extra 2000 bytes per removal would cause
throughput to drop.

To evaluate the impact of adjusting the size calculation like this, the PR adds
new metrics (per receipt, plus per-chunk counterparts):

* `near_receipt_recorded_size` - a histogram of how much storage proof is
  recorded when processing a receipt. Apart from
  https://github.com/near/nearcore/issues/10890, it'll help us estimate what
  the hard per-receipt storage proof size limit should be.
* `near_receipt_recorded_size_upper_bound` - a histogram of the adjusted
  (upper-bound) storage proof size calculated when processing a receipt.
* `near_receipt_recorded_size_upper_bound_ratio` - the ratio of the adjusted
  size to the non-adjusted size. It'll allow us to evaluate how much the
  adjustment affects the final size. The hope is that contracts rarely delete
  things, so the effect will be small (a few percent), but it might turn out
  that this assumption is false and the adjusted size is e.g. 2x higher than
  the non-adjusted one. In that case we'd have to reevaluate whether it's a
  viable solution.

I'd like to run code from this branch on shadow validation nodes to gather
data from mainnet traffic.
---
 core/store/src/trie/mod.rs            |  9 ++++
 core/store/src/trie/trie_recording.rs | 18 ++++++-
 core/store/src/trie/update.rs         |  3 ++
 runtime/runtime/src/lib.rs            | 25 ++++++++++
 runtime/runtime/src/metrics.rs        | 71 ++++++++++++++++++++++++++-
 5 files changed, 123 insertions(+), 3 deletions(-)

diff --git a/core/store/src/trie/mod.rs b/core/store/src/trie/mod.rs
index aed7cc38c7d..33fce745135 100644
--- a/core/store/src/trie/mod.rs
+++ b/core/store/src/trie/mod.rs
@@ -683,6 +683,15 @@ impl Trie {
             .unwrap_or_default()
     }
 
+    /// Size of the recorded state proof plus some additional size added to cover removals.
+    /// An upper-bound estimation of the true recorded size after finalization.
+    pub fn recorded_storage_size_upper_bound(&self) -> usize {
+        self.recorder
+            .as_ref()
+            .map(|recorder| recorder.borrow().recorded_storage_size_upper_bound())
+            .unwrap_or_default()
+    }
+
     /// Constructs a Trie from the partial storage (i.e. state proof) that
     /// was returned from recorded_storage(). If used to access the same trie
     /// nodes as when the partial storage was generated, this trie will behave
diff --git a/core/store/src/trie/trie_recording.rs b/core/store/src/trie/trie_recording.rs
index 46310a9a750..819ef5a295d 100644
--- a/core/store/src/trie/trie_recording.rs
+++ b/core/store/src/trie/trie_recording.rs
@@ -8,11 +8,14 @@ use std::sync::Arc;
 pub struct TrieRecorder {
     recorded: HashMap<CryptoHash, Arc<[u8]>>,
     size: usize,
+    /// Counts removals performed while recording.
+    /// recorded_storage_size_upper_bound takes it into account when calculating the total size.
+    removal_counter: usize,
 }
 
 impl TrieRecorder {
     pub fn new() -> Self {
-        Self { recorded: HashMap::new(), size: 0 }
+        Self { recorded: HashMap::new(), size: 0, removal_counter: 0 }
     }
 
     pub fn record(&mut self, hash: &CryptoHash, node: Arc<[u8]>) {
@@ -22,6 +25,10 @@ impl TrieRecorder {
         }
     }
 
+    pub fn record_removal(&mut self) {
+        self.removal_counter = self.removal_counter.saturating_add(1)
+    }
+
     pub fn recorded_storage(&mut self) -> PartialStorage {
         let mut nodes: Vec<_> = self.recorded.drain().map(|(_key, value)| value).collect();
         nodes.sort();
@@ -32,6 +39,15 @@ impl TrieRecorder {
         debug_assert!(self.size == self.recorded.values().map(|v| v.len()).sum::<usize>());
         self.size
     }
+
+    /// Size of the recorded state proof plus some additional size added to cover removals.
+    /// An upper-bound estimation of the true recorded size after finalization.
+    /// See https://github.com/near/nearcore/issues/10890 and https://github.com/near/nearcore/pull/11000 for details.
+    pub fn recorded_storage_size_upper_bound(&self) -> usize {
+        // Charge 2000 bytes for every removal
+        let removals_size = self.removal_counter.saturating_mul(2000);
+        self.recorded_storage_size().saturating_add(removals_size)
+    }
 }
 
 #[cfg(test)]
diff --git a/core/store/src/trie/update.rs b/core/store/src/trie/update.rs
index e3e11554b4b..2bb715fa6ad 100644
--- a/core/store/src/trie/update.rs
+++ b/core/store/src/trie/update.rs
@@ -115,6 +115,9 @@ impl TrieUpdate {
 
     pub fn remove(&mut self, trie_key: TrieKey) {
         self.prospective.insert(trie_key.to_vec(), TrieKeyValueUpdate { trie_key, value: None });
+        if let Some(recorder) = &self.trie.recorder {
+            recorder.borrow_mut().record_removal();
+        }
     }
 
     pub fn commit(&mut self, event: StateChangeCause) {
diff --git a/runtime/runtime/src/lib.rs b/runtime/runtime/src/lib.rs
index 68627152719..6cbbb93b3de 100644
--- a/runtime/runtime/src/lib.rs
+++ b/runtime/runtime/src/lib.rs
@@ -1424,6 +1424,9 @@ impl Runtime {
         )
         .entered();
         let node_counter_before = state_update.trie().get_trie_nodes_count();
+        let recorded_storage_size_before = state_update.trie().recorded_storage_size();
+        let storage_proof_size_upper_bound_before =
+            state_update.trie().recorded_storage_size_upper_bound();
         let result = self.process_receipt(
             state_update,
             apply_state,
@@ -1436,6 +1439,21 @@ impl Runtime {
         let node_counter_after = state_update.trie().get_trie_nodes_count();
         tracing::trace!(target: "runtime", ?node_counter_before, ?node_counter_after);
 
+        let recorded_storage_diff = state_update
+            .trie()
+            .recorded_storage_size()
+            .saturating_sub(recorded_storage_size_before)
+            as f64;
+        let recorded_storage_upper_bound_diff = state_update
+            .trie()
+            .recorded_storage_size_upper_bound()
+            .saturating_sub(storage_proof_size_upper_bound_before)
+            as f64;
+        metrics::RECEIPT_RECORDED_SIZE.observe(recorded_storage_diff);
+        metrics::RECEIPT_RECORDED_SIZE_UPPER_BOUND.observe(recorded_storage_upper_bound_diff);
+        let recorded_storage_proof_ratio =
+            recorded_storage_upper_bound_diff / f64::max(1.0, recorded_storage_diff);
+        metrics::RECEIPT_RECORDED_SIZE_UPPER_BOUND_RATIO.observe(recorded_storage_proof_ratio);
         if let Some(outcome_with_id) = result? {
             let gas_burnt = outcome_with_id.outcome.gas_burnt;
             let compute_usage = outcome_with_id
@@ -1672,6 +1690,9 @@ impl Runtime {
 
         state_update.commit(StateChangeCause::UpdatedDelayedReceipts);
         self.apply_state_patch(&mut state_update, state_patch);
+        let chunk_recorded_size_upper_bound =
+            state_update.trie.recorded_storage_size_upper_bound() as f64;
+        metrics::CHUNK_RECORDED_SIZE_UPPER_BOUND.observe(chunk_recorded_size_upper_bound);
         let (trie, trie_changes, state_changes) = state_update.finalize()?;
         if let Some(prefetcher) = &prefetcher {
             // Only clear the prefetcher queue after finalize is done because as part of receipt
@@ -1701,6 +1722,10 @@ impl Runtime {
         }
 
         let state_root = trie_changes.new_root;
+        let chunk_recorded_size = trie.recorded_storage_size() as f64;
+        metrics::CHUNK_RECORDED_SIZE.observe(chunk_recorded_size);
+        metrics::CHUNK_RECORDED_SIZE_UPPER_BOUND_RATIO
+            .observe(chunk_recorded_size_upper_bound / f64::max(1.0, chunk_recorded_size));
         let proof = trie.recorded_storage();
         Ok(ApplyResult {
             state_root,
diff --git a/runtime/runtime/src/metrics.rs b/runtime/runtime/src/metrics.rs
index 6c3c9495ba1..cef59d610e3 100644
--- a/runtime/runtime/src/metrics.rs
+++ b/runtime/runtime/src/metrics.rs
@@ -1,6 +1,7 @@
 use near_o11y::metrics::{
-    try_create_counter_vec, try_create_histogram_vec, try_create_int_counter,
-    try_create_int_counter_vec, CounterVec, HistogramVec, IntCounter, IntCounterVec,
+    exponential_buckets, try_create_counter_vec, try_create_histogram_vec,
+    try_create_histogram_with_buckets, try_create_int_counter, try_create_int_counter_vec,
+    CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec,
 };
 use once_cell::sync::Lazy;
 use std::time::Duration;
@@ -289,6 +290,54 @@ static CHUNK_TX_TGAS: Lazy<HistogramVec> = Lazy::new(|| {
     )
     .unwrap()
 });
+pub static RECEIPT_RECORDED_SIZE: Lazy<Histogram> = Lazy::new(|| {
+    try_create_histogram_with_buckets(
+        "near_receipt_recorded_size",
+        "Size of storage proof recorded when executing a receipt",
+        buckets_for_receipt_storage_proof_size(),
+    )
+    .unwrap()
+});
+pub static RECEIPT_RECORDED_SIZE_UPPER_BOUND: Lazy<Histogram> = Lazy::new(|| {
+    try_create_histogram_with_buckets(
+        "near_receipt_recorded_size_upper_bound",
+        "Upper bound estimation (e.g. with extra size added for deletes) of storage proof size recorded when executing a receipt",
+        buckets_for_receipt_storage_proof_size(),
+    )
+    .unwrap()
+});
+pub static RECEIPT_RECORDED_SIZE_UPPER_BOUND_RATIO: Lazy<Histogram> = Lazy::new(|| {
+    try_create_histogram_with_buckets(
+        "near_receipt_recorded_size_upper_bound_ratio",
+        "Ratio of upper bound to true recorded size, equal to (near_receipt_recorded_size_upper_bound / near_receipt_recorded_size)",
+        buckets_for_storage_proof_size_ratio(),
+    )
+    .unwrap()
+});
+pub static CHUNK_RECORDED_SIZE: Lazy<Histogram> = Lazy::new(|| {
+    try_create_histogram_with_buckets(
+        "near_chunk_recorded_size",
+        "Total size of storage proof (recorded trie nodes for state witness, post-finalization) for a single chunk",
+        buckets_for_storage_proof_size(),
+    )
+    .unwrap()
+});
+pub static CHUNK_RECORDED_SIZE_UPPER_BOUND: Lazy<Histogram> = Lazy::new(|| {
+    try_create_histogram_with_buckets(
+        "near_chunk_recorded_size_upper_bound",
+        "Upper bound of storage proof size (recorded trie nodes size + estimated charges, pre-finalization) for a single chunk",
+        buckets_for_storage_proof_size(),
+    )
+    .unwrap()
+});
+pub static CHUNK_RECORDED_SIZE_UPPER_BOUND_RATIO: Lazy<Histogram> = Lazy::new(|| {
+    try_create_histogram_with_buckets(
+        "near_chunk_recorded_size_upper_bound_ratio",
+        "Ratio of upper bound to true storage proof size, equal to (near_chunk_recorded_size_upper_bound / near_chunk_recorded_size)",
+        buckets_for_storage_proof_size_ratio(),
+    )
+    .unwrap()
+});
 
 /// Buckets used for burned gas in receipts.
 ///
@@ -313,6 +362,24 @@ fn buckets_for_compute() -> Option<Vec<f64>> {
     ])
 }
 
+// Buckets from 0 to 100 MB
+fn buckets_for_receipt_storage_proof_size() -> Vec<f64> {
+    // 100 * 2**20 = 100 MB
+    exponential_buckets(100., 2., 20).unwrap()
+}
+
+// Buckets from 100KB to 100MB
+fn buckets_for_storage_proof_size() -> Vec<f64> {
+    // 100KB * 2**10 = 100 MB
+    exponential_buckets(100_000., 2., 10).unwrap()
+}
+
+// Buckets from 1 to 1.46
+fn buckets_for_storage_proof_size_ratio() -> Vec<f64> {
+    // 1.02 ** 20 = 1.46
+    exponential_buckets(1., 1.02, 20).unwrap()
+}
+
 /// Helper struct to collect partial costs of `Runtime::apply` and reporting it
 /// atomically.
 #[derive(Debug, Default)]
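
---

For reference, the adjustment introduced by this patch reduces to simple arithmetic: the upper bound is the raw recorded size plus a flat 2000 bytes per trie removal, and the *_upper_bound_ratio metrics observe the upper bound divided by the raw size, with the denominator clamped to 1 to avoid division by zero. Below is a minimal standalone sketch of that arithmetic; the names (PER_REMOVAL_CHARGE, recorded_size_upper_bound, upper_bound_ratio) are illustrative only and not part of the nearcore API.

// Flat per-removal charge; 2000 bytes matches the estimate from issue #10890.
const PER_REMOVAL_CHARGE: usize = 2000;

// Upper-bound storage proof size: raw recorded bytes plus a flat charge per trie removal.
fn recorded_size_upper_bound(recorded_size: usize, removals: usize) -> usize {
    recorded_size.saturating_add(removals.saturating_mul(PER_REMOVAL_CHARGE))
}

// Ratio observed by the *_upper_bound_ratio metrics; the denominator is clamped to 1.0
// so that receipts which record nothing do not divide by zero.
fn upper_bound_ratio(recorded_size: usize, removals: usize) -> f64 {
    recorded_size_upper_bound(recorded_size, removals) as f64 / f64::max(1.0, recorded_size as f64)
}

fn main() {
    // A receipt that recorded 50 KiB of trie nodes and performed 3 removals is
    // charged an extra 6000 bytes, roughly a 12% increase.
    let (size, removals) = (50 * 1024, 3);
    println!(
        "upper bound = {} bytes, ratio = {:.3}",
        recorded_size_upper_bound(size, removals),
        upper_bound_ratio(size, removals),
    );
}

The shadow-validation run described in the commit message is essentially collecting this ratio over mainnet traffic to see whether it stays within a few percent of 1.0.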