diff --git a/ledger-tool/src/main.rs b/ledger-tool/src/main.rs index 36c48ed0419390..2c343c4f65547e 100644 --- a/ledger-tool/src/main.rs +++ b/ledger-tool/src/main.rs @@ -678,6 +678,7 @@ fn open_blockstore( access_type, recovery_mode: wal_recovery_mode, enforce_ulimit_nofile: true, + ..BlockstoreOptions::default() }, ) { Ok(blockstore) => blockstore, diff --git a/ledger/src/blockstore.rs b/ledger/src/blockstore.rs index b36e8028496bf0..0ef3058b1a9a96 100644 --- a/ledger/src/blockstore.rs +++ b/ledger/src/blockstore.rs @@ -3781,6 +3781,7 @@ pub fn create_new_ledger( access_type, recovery_mode: None, enforce_ulimit_nofile: false, + ..BlockstoreOptions::default() }, )?; let ticks_per_slot = genesis_config.ticks_per_slot; diff --git a/ledger/src/blockstore_db.rs b/ledger/src/blockstore_db.rs index a0c34b1f9ca8c8..d39c1a2e91530e 100644 --- a/ledger/src/blockstore_db.rs +++ b/ledger/src/blockstore_db.rs @@ -9,8 +9,9 @@ use { self, compaction_filter::CompactionFilter, compaction_filter_factory::{CompactionFilterContext, CompactionFilterFactory}, - ColumnFamily, ColumnFamilyDescriptor, CompactionDecision, DBIterator, DBRawIterator, - DBRecoveryMode, IteratorMode as RocksIteratorMode, Options, WriteBatch as RWriteBatch, DB, + ColumnFamily, ColumnFamilyDescriptor, CompactionDecision, DBCompactionStyle, DBIterator, + DBRawIterator, DBRecoveryMode, FifoCompactOptions, IteratorMode as RocksIteratorMode, + Options, WriteBatch as RWriteBatch, DB, }, serde::{de::DeserializeOwned, Serialize}, solana_runtime::hardened_unpack::UnpackError, @@ -35,6 +36,17 @@ use { }; const MAX_WRITE_BUFFER_SIZE: u64 = 256 * 1024 * 1024; // 256MB +const FIFO_WRITE_BUFFER_SIZE: u64 = 2 * MAX_WRITE_BUFFER_SIZE; +// Maximum size of cf::ShredData. Used when `shred_storage_type` +// is set to ShredStorageType::RocksFifo. The default value is set +// to 125GB, assuming 500GB total storage for ledger and 25% is +// used by data shreds. 
+const DEFAULT_FIFO_COMPACTION_DATA_CF_SIZE: u64 = 125 * 1024 * 1024 * 1024; +// Maximum size of cf::ShredCode. Used when `shred_storage_type` +// is set to ShredStorageType::RocksFifo. The default value is set +// to 100GB, assuming 500GB total storage for ledger and 20% is +// used by coding shreds. +const DEFAULT_FIFO_COMPACTION_CODING_CF_SIZE: u64 = 100 * 1024 * 1024 * 1024; // Column family for metadata about a leader slot const META_CF: &str = "meta"; @@ -305,8 +317,40 @@ impl Rocks { new_cf_descriptor::(&access_type, &oldest_slot), new_cf_descriptor::(&access_type, &oldest_slot), new_cf_descriptor::(&access_type, &oldest_slot), - new_cf_descriptor::(&access_type, &oldest_slot), - new_cf_descriptor::(&access_type, &oldest_slot), + match options.shred_storage_type { + ShredStorageType::RocksLevel => { + new_cf_descriptor::(&access_type, &oldest_slot) + } + ShredStorageType::RocksFifo => { + if options.shred_data_cf_size > FIFO_WRITE_BUFFER_SIZE { + new_cf_descriptor_fifo::(&options.shred_data_cf_size) + } else { + warn!( + "shred_data_cf_size is must be greater than {} for RocksFifo.", + FIFO_WRITE_BUFFER_SIZE + ); + warn!("Fall back to ShredStorageType::RocksLevel for cf::ShredData."); + new_cf_descriptor::(&access_type, &oldest_slot) + } + } + }, + match options.shred_storage_type { + ShredStorageType::RocksLevel => { + new_cf_descriptor::(&access_type, &oldest_slot) + } + ShredStorageType::RocksFifo => { + if options.shred_code_cf_size > FIFO_WRITE_BUFFER_SIZE { + new_cf_descriptor_fifo::(&options.shred_code_cf_size) + } else { + warn!( + "shred_code_cf_size is must be greater than {} for RocksFifo.", + FIFO_WRITE_BUFFER_SIZE + ); + warn!("Fall back to ShredStorageType::RocksLevel for cf::ShredCode."); + new_cf_descriptor::(&access_type, &oldest_slot) + } + } + }, new_cf_descriptor::(&access_type, &oldest_slot), new_cf_descriptor::(&access_type, &oldest_slot), new_cf_descriptor::(&access_type, &oldest_slot), @@ -952,10 +996,40 @@ pub struct WriteBatch<'a> { 
map: HashMap<&'static str, &'a ColumnFamily>, } +pub enum ShredStorageType { + // Stores shreds under RocksDB's default compaction (level). + RocksLevel, + // (Experimental) Stores shreds under RocksDB's FIFO compaction which + // allows ledger store to reclaim storage more efficiently with + // lower I/O overhead. + RocksFifo, +} + pub struct BlockstoreOptions { + // The access type of blockstore. Default: PrimaryOnly. pub access_type: AccessType, + // Whether to open a blockstore under a recovery mode. Default: None. pub recovery_mode: Option, + // Whether to allow unlimited number of open files. Default: true. pub enforce_ulimit_nofile: bool, + // Determines how to store both data and coding shreds. Default: RocksLevel. + pub shred_storage_type: ShredStorageType, + // The maximum storage size for storing data shreds in column family + // [`cf::ShredData`]. Typically, data shreds contribute around 25% of the + // ledger store storage size if the RPC service is enabled, or 50% if RPC + // service is not enabled. + // + // Currently, this setting is only used when shred_storage_type is set to + // [`ShredStorageType::RocksFifo`]. + pub shred_data_cf_size: u64, + // The maximum storage size for storing coding shreds in column family + // [`cf::ShredCode`]. Typically, coding shreds contribute around 20% of the + // ledger store storage size if the RPC service is enabled, or 40% if RPC + // service is not enabled. + // + // Currently, this setting is only used when shred_storage_type is set to + // [`ShredStorageType::RocksFifo`]. + pub shred_code_cf_size: u64, } impl Default for BlockstoreOptions { @@ -965,6 +1039,13 @@ impl Default for BlockstoreOptions { access_type: AccessType::PrimaryOnly, recovery_mode: None, enforce_ulimit_nofile: true, + shred_storage_type: ShredStorageType::RocksLevel, + // Maximum size of cf::ShredData. Used when `shred_storage_type` + // is set to ShredStorageType::RocksFifo. 
+ shred_data_cf_size: DEFAULT_FIFO_COMPACTION_DATA_CF_SIZE, + // Maximum size of cf::ShredCode. Used when `shred_storage_type` + // is set to ShredStorageType::RocksFifo. + shred_code_cf_size: DEFAULT_FIFO_COMPACTION_CODING_CF_SIZE, } } } @@ -1355,6 +1436,51 @@ fn get_cf_options( options } +fn new_cf_descriptor_fifo( + max_cf_size: &u64, +) -> ColumnFamilyDescriptor { + ColumnFamilyDescriptor::new(C::NAME, get_cf_options_fifo::(max_cf_size)) +} + +/// Returns the RocksDB Column Family Options which use FIFO Compaction. +/// +/// Note that these CF options are optimized for workloads whose write-keys +/// are mostly monotonically increasing over time. Workloads whose +/// write-keys do not follow any order in general should use get_cf_options +/// instead. +/// +/// - `max_cf_size`: the maximum allowed column family size. Note that +/// rocksdb will start deleting the oldest SST file when the column family +/// size reaches `max_cf_size` - `FIFO_WRITE_BUFFER_SIZE` to strictly +/// maintain the size limit. +fn get_cf_options_fifo(max_cf_size: &u64) -> Options { + let mut options = Options::default(); + + options.set_max_write_buffer_number(8); + options.set_write_buffer_size(FIFO_WRITE_BUFFER_SIZE as usize); + // FIFO always has its files in L0 so we only have one level. + options.set_num_levels(1); + // Since FIFO puts all its files in L0, it is suggested to have unlimited + // number of open files. The actual total number of open files will + // be close to max_cf_size / write_buffer_size. + options.set_max_open_files(-1); + + let mut fifo_compact_options = FifoCompactOptions::default(); + + // Note that the following actually specifies the size trigger for deleting + // the oldest SST file instead of specifying the size limit as its name + // might suggest. As a result, we should trigger the file deletion when + // the size reaches `max_cf_size - write_buffer_size` in order to correctly + // maintain the storage size limit. 
+ fifo_compact_options + .set_max_table_files_size((*max_cf_size).saturating_sub(FIFO_WRITE_BUFFER_SIZE)); + + options.set_compaction_style(DBCompactionStyle::Fifo); + options.set_fifo_compaction_options(&fifo_compact_options); + + options +} + fn get_db_options(access_type: &AccessType) -> Options { let mut options = Options::default(); diff --git a/local-cluster/tests/common.rs b/local-cluster/tests/common.rs index cfa26742c2e429..b051d8bf3e7c36 100644 --- a/local-cluster/tests/common.rs +++ b/local-cluster/tests/common.rs @@ -76,6 +76,7 @@ pub fn open_blockstore(ledger_path: &Path) -> Blockstore { access_type: AccessType::TryPrimaryThenSecondary, recovery_mode: None, enforce_ulimit_nofile: true, + ..BlockstoreOptions::default() }, ) .unwrap_or_else(|e| { diff --git a/local-cluster/tests/local_cluster_flakey.rs b/local-cluster/tests/local_cluster_flakey.rs index b4f413d867b6b9..9cc053175e48ff 100644 --- a/local-cluster/tests/local_cluster_flakey.rs +++ b/local-cluster/tests/local_cluster_flakey.rs @@ -331,6 +331,7 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b access_type: AccessType::TryPrimaryThenSecondary, recovery_mode: None, enforce_ulimit_nofile: true, + ..BlockstoreOptions::default() }, ) .unwrap();