From aaa570b21fdc03a28b9c5a1102afce62bc078c6e Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Fri, 23 Aug 2024 19:49:25 -0700 Subject: [PATCH] Options for file temperature for more files (#12957) Summary: We have a request to use the cold tier as primary source of truth for the DB, and to best support such use cases and to complement the existing options controlling SST file temperatures, we add two new DB options: * `metadata_write_temperature` for DB "small" files that don't contain much user data * `wal_write_temperature` for WALs. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12957 Test Plan: Unit test included, though it's hard to be sure we've covered all the places Reviewed By: jowlyzhang Differential Revision: D61664815 Pulled By: pdillinger fbshipit-source-id: 8e19c9dd8fd2db059bb15f74938d6bc12002e82b --- db/compaction/compaction_job_test.cc | 6 +- db/db_impl/db_impl_files.cc | 4 +- db/db_impl/db_impl_open.cc | 14 +- db/db_test2.cc | 230 +++++++++++++++++++++++++++ db/db_test_util.h | 11 +- db/flush_job_test.cc | 6 +- db/version_set.cc | 10 +- db/version_set_test.cc | 98 +++++------- env/file_system.cc | 6 +- file/filename.cc | 14 +- file/filename.h | 3 +- include/rocksdb/advanced_options.h | 2 +- include/rocksdb/file_system.h | 7 +- include/rocksdb/options.h | 10 ++ options/db_options.cc | 16 +- options/db_options.h | 2 + options/options_helper.cc | 9 ++ options/options_parser.cc | 5 +- options/options_settable_test.cc | 7 +- 19 files changed, 382 insertions(+), 78 deletions(-) diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 11a757fd68d..bbc0fe4cf3d 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -552,7 +552,8 @@ class CompactionJobTestBase : public testing::Test { /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", /*error_handler=*/nullptr, /*read_only=*/false)); compaction_job_stats_.Reset(); - ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_)); + ASSERT_OK( + SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown)); VersionEdit new_db; new_db.SetLogNumber(0); @@ -575,7 +576,8 @@ class CompactionJobTestBase : public testing::Test { } ASSERT_OK(s); // Make "CURRENT" file that points to the new manifest file. - s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); + s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, + Temperature::kUnknown, nullptr); ASSERT_OK(s); diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index dd4bf411cd8..0db7293682d 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -970,7 +970,9 @@ Status DBImpl::SetupDBId(const WriteOptions& write_options, bool read_only, } // Persist it to IDENTITY file if allowed if (!read_only) { - s = SetIdentityFile(write_options, env_, dbname_, db_id_); + s = SetIdentityFile(write_options, env_, dbname_, + immutable_db_options_.metadata_write_temperature, + db_id_); } return s; } diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 8ef024e7ba8..a58a142d712 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -295,7 +295,8 @@ Status DBImpl::ValidateOptions(const DBOptions& db_options) { Status DBImpl::NewDB(std::vector* new_filenames) { VersionEdit new_db; const WriteOptions write_options(Env::IOActivity::kDBOpen); - Status s = SetIdentityFile(write_options, env_, dbname_); + Status s = SetIdentityFile(write_options, env_, dbname_, + immutable_db_options_.metadata_write_temperature); if (!s.ok()) { return s; } @@ -319,6 +320,12 @@ Status DBImpl::NewDB(std::vector* new_filenames) { } std::unique_ptr file; FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_); + // DB option takes precedence when not kUnknown + if (immutable_db_options_.metadata_write_temperature != + Temperature::kUnknown) { + file_options.temperature = + immutable_db_options_.metadata_write_temperature; + } s = NewWritableFile(fs_.get(), manifest, &file, file_options); if (!s.ok()) { return s; @@ -344,6 +351,7 @@ Status DBImpl::NewDB(std::vector* new_filenames) { if (s.ok()) { // Make "CURRENT" file that points to the new manifest file. s = SetCurrentFile(write_options, fs_.get(), dbname_, 1, + immutable_db_options_.metadata_write_temperature, directories_.GetDbDir()); if (new_filenames) { new_filenames->emplace_back( @@ -1936,6 +1944,10 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options, BuildDBOptions(immutable_db_options_, mutable_db_options_); FileOptions opt_file_options = fs_->OptimizeForLogWrite(file_options_, db_options); + // DB option takes precedence when not kUnknown + if (immutable_db_options_.wal_write_temperature != Temperature::kUnknown) { + opt_file_options.temperature = immutable_db_options_.wal_write_temperature; + } std::string wal_dir = immutable_db_options_.GetWalDir(); std::string log_fname = LogFileName(wal_dir, log_file_num); diff --git a/db/db_test2.cc b/db/db_test2.cc index e6a3adf9b8b..f380144c6ab 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "db/db_test_util.h" @@ -26,6 +27,7 @@ #include "rocksdb/utilities/replayer.h" #include "rocksdb/wal_filter.h" #include "test_util/testutil.h" +#include "util/defer.h" #include "util/random.h" #include "utilities/fault_injection_env.h" @@ -6544,6 +6546,234 @@ TEST_P(RenameCurrentTest, Compaction) { ASSERT_EQ("d_value", Get("d")); } +TEST_F(DBTest2, VariousFileTemperatures) { + constexpr size_t kNumberFileTypes = static_cast(kBlobFile) + 1U; + + struct MyTestFS : public FileTemperatureTestFS { + explicit MyTestFS(const std::shared_ptr& fs) + : FileTemperatureTestFS(fs) { + Reset(); + } + + IOStatus NewWritableFile(const std::string& fname, const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + IOStatus ios = + FileTemperatureTestFS::NewWritableFile(fname, opts, result, dbg); + if (ios.ok()) { + uint64_t number; + FileType type; + if (ParseFileName(GetFileName(fname), &number, "LOG", &type)) { + if (type == kTableFile) { + // Not checked here + } else if (type == kWalFile) { + if (opts.temperature != expected_wal_temperature) { + std::cerr << "Attempt to open " << fname << " with temperature " + << temperature_to_string[opts.temperature] + << " rather than " + << temperature_to_string[expected_wal_temperature] + << std::endl; + assert(false); + } + } else if (type == kDescriptorFile) { + if (opts.temperature != expected_manifest_temperature) { + std::cerr << "Attempt to open " << fname << " with temperature " + << temperature_to_string[opts.temperature] + << " rather than " + << temperature_to_string[expected_wal_temperature] + << std::endl; + assert(false); + } + } else if (opts.temperature != expected_other_metadata_temperature) { + std::cerr << "Attempt to open " << fname << " with temperature " + << temperature_to_string[opts.temperature] + << " rather than " + << temperature_to_string[expected_wal_temperature] + << std::endl; + assert(false); + } + UpdateCount(type, 1); + } + } + return ios; + } + + IOStatus RenameFile(const std::string& src, const std::string& dst, + const IOOptions& options, + IODebugContext* dbg) override { + IOStatus ios = FileTemperatureTestFS::RenameFile(src, dst, options, dbg); + if (ios.ok()) { + uint64_t number; + FileType src_type; + FileType dst_type; + assert(ParseFileName(GetFileName(src), &number, "LOG", &src_type)); + assert(ParseFileName(GetFileName(dst), &number, "LOG", &dst_type)); + + UpdateCount(src_type, -1); + UpdateCount(dst_type, 1); + } + return ios; + } + + void UpdateCount(FileType type, int delta) { + size_t i = static_cast(type); + assert(i < kNumberFileTypes); + counts[i].FetchAddRelaxed(delta); + } + + std::map PopCounts() { + std::map ret; + for (size_t i = 0; i < kNumberFileTypes; ++i) { + int c = counts[i].ExchangeRelaxed(0); + if (c > 0) { + ret[static_cast(i)] = c; + } + } + return ret; + } + + FileOptions OptimizeForLogWrite( + const FileOptions& file_options, + const DBOptions& /*db_options*/) const override { + FileOptions opts = file_options; + if (optimize_wal_temperature != Temperature::kUnknown) { + opts.temperature = optimize_wal_temperature; + } + return opts; + } + + FileOptions OptimizeForManifestWrite( + const FileOptions& file_options) const override { + FileOptions opts = file_options; + if (optimize_manifest_temperature != Temperature::kUnknown) { + opts.temperature = optimize_manifest_temperature; + } + return opts; + } + + void Reset() { + optimize_manifest_temperature = Temperature::kUnknown; + optimize_wal_temperature = Temperature::kUnknown; + expected_manifest_temperature = Temperature::kUnknown; + expected_other_metadata_temperature = Temperature::kUnknown; + expected_wal_temperature = Temperature::kUnknown; + for (auto& c : counts) { + c.StoreRelaxed(0); + } + } + + Temperature optimize_manifest_temperature; + Temperature optimize_wal_temperature; + Temperature expected_manifest_temperature; + Temperature expected_other_metadata_temperature; + Temperature expected_wal_temperature; + std::array, kNumberFileTypes> counts; + }; + + // We don't have enough non-unknown temps to confidently distinguish that + // a specific setting caused a specific outcome, in a single run. This is a + // reasonable work-around without blowing up test time. Only returns + // non-unknown temperatures. + auto RandomTemp = [] { + static std::vector temps = { + Temperature::kHot, Temperature::kWarm, Temperature::kCold}; + return temps[Random::GetTLSInstance()->Uniform( + static_cast(temps.size()))]; + }; + + auto test_fs = std::make_shared(env_->GetFileSystem()); + std::unique_ptr env(new CompositeEnvWrapper(env_, test_fs)); + for (bool use_optimize : {false, true}) { + std::cerr << "use_optimize: " << std::to_string(use_optimize) << std::endl; + for (bool use_temp_options : {false, true}) { + std::cerr << "use_temp_options: " << std::to_string(use_temp_options) + << std::endl; + + Options options = CurrentOptions(); + // Currently require for last level temperature + options.compaction_style = kCompactionStyleUniversal; + options.env = env.get(); + test_fs->Reset(); + if (use_optimize) { + test_fs->optimize_manifest_temperature = RandomTemp(); + test_fs->expected_manifest_temperature = + test_fs->optimize_manifest_temperature; + test_fs->optimize_wal_temperature = RandomTemp(); + test_fs->expected_wal_temperature = test_fs->optimize_wal_temperature; + } + if (use_temp_options) { + options.metadata_write_temperature = RandomTemp(); + test_fs->expected_manifest_temperature = + options.metadata_write_temperature; + test_fs->expected_other_metadata_temperature = + options.metadata_write_temperature; + options.wal_write_temperature = RandomTemp(); + test_fs->expected_wal_temperature = options.wal_write_temperature; + options.last_level_temperature = RandomTemp(); + options.default_write_temperature = RandomTemp(); + } + + DestroyAndReopen(options); + Defer closer([&] { Close(); }); + + using FTC = std::map; + // Files on DB startup + ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 1}, + {kDescriptorFile, 2}, + {kCurrentFile, 2}, + {kIdentityFile, 1}, + {kOptionsFile, 1}})); + + // Temperature count map + using TCM = std::map; + ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), TCM({})); + + ASSERT_OK(Put("foo", "1")); + ASSERT_OK(Put("bar", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "2")); + ASSERT_OK(Put("bar", "2")); + ASSERT_OK(Flush()); + + ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), + TCM({{options.default_write_temperature, 2}})); + + ASSERT_OK(db_->CompactRange({}, nullptr, nullptr)); + + ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), + TCM({{options.last_level_temperature, 1}})); + + ASSERT_OK(Put("foo", "3")); + ASSERT_OK(Put("bar", "3")); + ASSERT_OK(Flush()); + + // Just in memtable/WAL + ASSERT_OK(Put("dog", "3")); + + { + TCM expected; + expected[options.default_write_temperature] += 1; + expected[options.last_level_temperature] += 1; + ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), expected); + } + + // New files during operation + ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 3}, {kTableFile, 4}})); + + Reopen(options); + + // New files during re-open/recovery + ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 1}, + {kTableFile, 1}, + {kDescriptorFile, 1}, + {kCurrentFile, 1}, + {kOptionsFile, 1}})); + + Destroy(options); + } + } +} + TEST_F(DBTest2, LastLevelTemperature) { class TestListener : public EventListener { public: diff --git a/db/db_test_util.h b/db/db_test_util.h index 47b1667eac1..36a4615344b 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -831,6 +831,15 @@ class FileTemperatureTestFS : public FileSystemWrapper { return count; } + std::map CountCurrentSstFilesByTemp() { + MutexLock lock(&mu_); + std::map ret; + for (const auto& e : current_sst_file_temperatures_) { + ret[e.second]++; + } + return ret; + } + void OverrideSstFileTemperature(uint64_t number, Temperature temp) { MutexLock lock(&mu_); current_sst_file_temperatures_[number] = temp; @@ -842,7 +851,7 @@ class FileTemperatureTestFS : public FileSystemWrapper { requested_sst_file_temperatures_; std::map current_sst_file_temperatures_; - std::string GetFileName(const std::string& fname) { + static std::string GetFileName(const std::string& fname) { auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1); // workaround only for Windows that the file path could contain both Windows // FilePathSeparator and '/' diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 3ffb77d5378..d407e4815f7 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -68,7 +68,8 @@ class FlushJobTestBase : public testing::Test { } void NewDB() { - ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_)); + ASSERT_OK( + SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown)); VersionEdit new_db; new_db.SetLogNumber(0); @@ -114,7 +115,8 @@ class FlushJobTestBase : public testing::Test { } ASSERT_OK(s); // Make "CURRENT" file that points to the new manifest file. - s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); + s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, + Temperature::kUnknown, nullptr); ASSERT_OK(s); } diff --git a/db/version_set.cc b/db/version_set.cc index fec847630a1..e81165a3d2e 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -5511,6 +5511,10 @@ Status VersionSet::ProcessManifestWrites( std::unique_ptr new_desc_log_ptr; { FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_); + // DB option (in file_options_) takes precedence when not kUnknown + if (file_options_.temperature != Temperature::kUnknown) { + opt_file_opts.temperature = file_options_.temperature; + } mu->Unlock(); TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart"); TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr); @@ -5637,9 +5641,9 @@ Status VersionSet::ProcessManifestWrites( assert(manifest_io_status.ok()); } if (s.ok() && new_descriptor_log) { - io_s = SetCurrentFile(write_options, fs_.get(), dbname_, - pending_manifest_file_number_, - dir_contains_current_file); + io_s = SetCurrentFile( + write_options, fs_.get(), dbname_, pending_manifest_file_number_, + file_options_.temperature, dir_contains_current_file); if (!io_s.ok()) { s = io_s; // Quarantine old manifest file in case new manifest file's CURRENT file diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 94ee5f2e574..4f3665fba66 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -1415,16 +1415,22 @@ class VersionSetTestBase { } } + void CreateCurrentFile() { + // Make "CURRENT" file point to the new manifest file. + ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, + Temperature::kUnknown, + /* dir_contains_current_file */ nullptr)); + } + // Create DB with 3 column families. void NewDB() { SequenceNumber last_seqno; std::unique_ptr log_writer; - ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_)); + ASSERT_OK( + SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown)); PrepareManifest(&column_families_, &last_seqno, &log_writer); log_writer.reset(); - // Make "CURRENT" file point to the new manifest file. - Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); - ASSERT_OK(s); + CreateCurrentFile(); EXPECT_OK(versions_->Recover(column_families_, false)); EXPECT_EQ(column_families_.size(), @@ -2600,7 +2606,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, edits_[i].MarkAtomicGroup(--remaining); edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); + CreateCurrentFile(); } void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) { @@ -2612,7 +2618,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, edits_[i].MarkAtomicGroup(--remaining); edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); + CreateCurrentFile(); } void SetupCorruptedAtomicGroup(int atomic_group_size) { @@ -2626,7 +2632,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, } edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); + CreateCurrentFile(); } void SetupIncorrectAtomicGroup(int atomic_group_size) { @@ -2642,7 +2648,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, } edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); + CreateCurrentFile(); } void SetupTestSyncPoints() { @@ -3408,8 +3414,7 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { SequenceNumber last_seqno; std::unique_ptr log_writer; PrepareManifest(&column_families, &last_seqno, &log_writer); - Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); - ASSERT_OK(s); + CreateCurrentFile(); EXPECT_OK(versions_->Recover(column_families, false /* read_only */)); EXPECT_EQ(column_families.size(), @@ -3431,7 +3436,7 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { cfd_to_drop->Ref(); drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID()); mutex_.Lock(); - s = versions_->LogAndApply( + Status s = versions_->LogAndApply( cfd_to_drop, *cfd_to_drop->GetLatestMutableCFOptions(), read_options, write_options, &drop_cf_edit, &mutex_, nullptr); mutex_.Unlock(); @@ -3541,9 +3546,7 @@ class EmptyDefaultCfNewManifest : public VersionSetTestBase, TEST_F(EmptyDefaultCfNewManifest, Recover) { PrepareManifest(nullptr, nullptr, &log_writer_); log_writer_.reset(); - Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, - /* dir_contains_current_file */ nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); std::vector column_families; @@ -3552,7 +3555,7 @@ TEST_F(EmptyDefaultCfNewManifest, Recover) { cf_options_); std::string db_id; bool has_missing_table_file = false; - s = versions_->TryRecoverFromOneManifest( + Status s = versions_->TryRecoverFromOneManifest( manifest_path, column_families, false, &db_id, &has_missing_table_file); ASSERT_OK(s); ASSERT_FALSE(has_missing_table_file); @@ -3573,7 +3576,8 @@ class VersionSetTestEmptyDb assert(nullptr != log_writer); VersionEdit new_db; if (db_options_.write_dbid_to_manifest) { - ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_)); + ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_, + Temperature::kUnknown)); DBOptions tmp_db_options; tmp_db_options.env = env_; std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); @@ -3606,9 +3610,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) { db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); PrepareManifest(nullptr, nullptr, &log_writer_); log_writer_.reset(); - Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, - /* dir_contains_current_file */ nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); @@ -3623,9 +3625,9 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) { std::string db_id; bool has_missing_table_file = false; - s = versions_->TryRecoverFromOneManifest(manifest_path, column_families, - read_only, &db_id, - &has_missing_table_file); + Status s = versions_->TryRecoverFromOneManifest( + manifest_path, column_families, read_only, &db_id, + &has_missing_table_file); auto iter = std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName); if (iter == cf_names.end()) { @@ -3651,9 +3653,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) { ASSERT_OK(s); } log_writer_.reset(); - s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, - /* dir_contains_current_file */ nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); @@ -3699,9 +3699,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromInCompleteManifest2) { ASSERT_OK(s); } log_writer_.reset(); - s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, - /* dir_contains_current_file */ nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); @@ -3758,9 +3756,7 @@ TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) { ASSERT_OK(s); } log_writer_.reset(); - s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, - /* dir_contains_current_file */ nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); @@ -3816,9 +3812,7 @@ TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) { ASSERT_OK(s); } log_writer_.reset(); - s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, - /* dir_contains_current_file */ nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); @@ -4025,15 +4019,14 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) { WriteFileAdditionAndDeletionToManifest( /*cf=*/0, std::vector>(), deleted_files); log_writer_.reset(); - Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); std::string db_id; bool has_missing_table_file = false; - s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_, - /*read_only=*/false, &db_id, - &has_missing_table_file); + Status s = versions_->TryRecoverFromOneManifest( + manifest_path, column_families_, + /*read_only=*/false, &db_id, &has_missing_table_file); ASSERT_OK(s); ASSERT_TRUE(has_missing_table_file); for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) { @@ -4083,15 +4076,14 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) { WriteFileAdditionAndDeletionToManifest( /*cf=*/0, added_files, std::vector>()); log_writer_.reset(); - Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); std::string db_id; bool has_missing_table_file = false; - s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_, - /*read_only=*/false, &db_id, - &has_missing_table_file); + Status s = versions_->TryRecoverFromOneManifest( + manifest_path, column_families_, + /*read_only=*/false, &db_id, &has_missing_table_file); ASSERT_OK(s); ASSERT_TRUE(has_missing_table_file); for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) { @@ -4137,15 +4129,14 @@ TEST_F(VersionSetTestMissingFiles, NoFileMissing) { WriteFileAdditionAndDeletionToManifest( /*cf=*/0, std::vector>(), deleted_files); log_writer_.reset(); - Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); std::string db_id; bool has_missing_table_file = false; - s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_, - /*read_only=*/false, &db_id, - &has_missing_table_file); + Status s = versions_->TryRecoverFromOneManifest( + manifest_path, column_families_, + /*read_only=*/false, &db_id, &has_missing_table_file); ASSERT_OK(s); ASSERT_FALSE(has_missing_table_file); for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) { @@ -4266,15 +4257,14 @@ class BestEffortsRecoverIncompleteVersionTest /*cf=*/0, added_files, std::vector>(), blob_files); log_writer_.reset(); - Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); std::string db_id; bool has_missing_table_file = false; - s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_, - /*read_only=*/false, &db_id, - &has_missing_table_file); + Status s = versions_->TryRecoverFromOneManifest( + manifest_path, column_families_, + /*read_only=*/false, &db_id, &has_missing_table_file); ASSERT_OK(s); ASSERT_TRUE(has_missing_table_file); } diff --git a/env/file_system.cc b/env/file_system.cc index 27c7207f0f5..1f02f7a7eeb 100644 --- a/env/file_system.cc +++ b/env/file_system.cc @@ -181,10 +181,10 @@ FileOptions FileSystem::OptimizeForBlobFileRead( IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, const std::string& fname, bool should_sync, - const IOOptions& io_options) { + const IOOptions& io_options, + const FileOptions& file_options) { std::unique_ptr file; - EnvOptions soptions; - IOStatus s = fs->NewWritableFile(fname, soptions, &file, nullptr); + IOStatus s = fs->NewWritableFile(fname, file_options, &file, nullptr); if (!s.ok()) { return s; } diff --git a/file/filename.cc b/file/filename.cc index b34a0e113e8..45cbf9d76a9 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -388,6 +388,7 @@ bool ParseFileName(const std::string& fname, uint64_t* number, IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs, const std::string& dbname, uint64_t descriptor_number, + Temperature temp, FSDirectory* dir_contains_current_file) { // Remove leading "dbname/" and add newline to manifest file name std::string manifest = DescriptorFileName(dbname, descriptor_number); @@ -397,8 +398,11 @@ IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs, std::string tmp = TempFileName(dbname, descriptor_number); IOOptions opts; IOStatus s = PrepareIOFromWriteOptions(write_options, opts); + FileOptions file_opts; + file_opts.temperature = temp; if (s.ok()) { - s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true, opts); + s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true, opts, + file_opts); } TEST_SYNC_POINT_CALLBACK("SetCurrentFile:BeforeRename", &s); if (s.ok()) { @@ -423,7 +427,8 @@ IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs, } Status SetIdentityFile(const WriteOptions& write_options, Env* env, - const std::string& dbname, const std::string& db_id) { + const std::string& dbname, Temperature temp, + const std::string& db_id) { std::string id; if (db_id.empty()) { id = env->GenerateUniqueId(); @@ -437,8 +442,11 @@ Status SetIdentityFile(const WriteOptions& write_options, Env* env, Status s; IOOptions opts; s = PrepareIOFromWriteOptions(write_options, opts); + FileOptions file_opts; + file_opts.temperature = temp; if (s.ok()) { - s = WriteStringToFile(env, id, tmp, true, &opts); + s = WriteStringToFile(env->GetFileSystem().get(), id, tmp, + /*should_sync=*/true, opts, file_opts); } if (s.ok()) { s = env->RenameFile(tmp, identify_file_name); diff --git a/file/filename.h b/file/filename.h index 56bbd78d555..5a52c745ac6 100644 --- a/file/filename.h +++ b/file/filename.h @@ -161,11 +161,12 @@ bool ParseFileName(const std::string& filename, uint64_t* number, // when IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs, const std::string& dbname, uint64_t descriptor_number, + Temperature temp, FSDirectory* dir_contains_current_file); // Make the IDENTITY file for the db Status SetIdentityFile(const WriteOptions& write_options, Env* env, - const std::string& dbname, + const std::string& dbname, Temperature temp, const std::string& db_id = {}); // Sync manifest file `file`. diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 11f971c2426..3761923ce7e 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -813,7 +813,7 @@ struct AdvancedColumnFamilyOptions { // If this option is set, when creating the last level files, pass this // temperature to FileSystem used. Should be no-op for default FileSystem // and users need to plug in their own FileSystem to take advantage of it. - // When using FIFO compaction, this option is ignored. + // Currently only compatible with universal compaction. // // Dynamically changeable through the SetOptions() API Temperature last_level_temperature = Temperature::kUnknown; diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index 8d21c919466..042b38305ca 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -195,7 +195,9 @@ struct FileOptions : EnvOptions { FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {} FileOptions(const DBOptions& opts) - : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {} + : EnvOptions(opts), + temperature(opts.metadata_write_temperature), + handoff_checksum_type(ChecksumType::kCRC32c) {} FileOptions(const EnvOptions& opts) : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {} @@ -1952,7 +1954,8 @@ class FSDirectoryWrapper : public FSDirectory { // A utility routine: write "data" to the named file. IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, const std::string& fname, bool should_sync = false, - const IOOptions& io_options = IOOptions()); + const IOOptions& io_options = IOOptions(), + const FileOptions& file_options = FileOptions()); // A utility routine: read contents of named file into *data IOStatus ReadFileToString(FileSystem* fs, const std::string& fname, diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 8223c4a1338..4b25640836c 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1580,6 +1580,16 @@ struct DBOptions { // Default 100ms uint64_t follower_catchup_retry_wait_ms = 100; + // When DB files other than SST, blob and WAL files are created, use this + // filesystem temperature. (See also `wal_write_temperature` and various + // `*_temperature` CF options.) When not `kUnknown`, this overrides any + // temperature set by OptimizeForManifestWrite functions. + Temperature metadata_write_temperature = Temperature::kUnknown; + + // Use this filesystem temperature when creating WAL files. When not + // `kUnknown`, this overrides any temperature set by OptimizeForLogWrite + // functions. + Temperature wal_write_temperature = Temperature::kUnknown; // End EXPERIMENTAL }; diff --git a/options/db_options.cc b/options/db_options.cc index 8eb28c1edca..2678bb5a76d 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -576,6 +576,14 @@ static std::unordered_map {offsetof(struct ImmutableDBOptions, follower_catchup_retry_wait_ms), OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"metadata_write_temperature", + {offsetof(struct ImmutableDBOptions, metadata_write_temperature), + OptionType::kTemperature, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"wal_write_temperature", + {offsetof(struct ImmutableDBOptions, wal_write_temperature), + OptionType::kTemperature, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, }; const std::string OptionsHelper::kDBOptionsName = "DBOptions"; @@ -778,7 +786,9 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) follower_refresh_catchup_period_ms( options.follower_refresh_catchup_period_ms), follower_catchup_retry_count(options.follower_catchup_retry_count), - follower_catchup_retry_wait_ms(options.follower_catchup_retry_wait_ms) { + follower_catchup_retry_wait_ms(options.follower_catchup_retry_wait_ms), + metadata_write_temperature(options.metadata_write_temperature), + wal_write_temperature(options.wal_write_temperature) { fs = env->GetFileSystem(); clock = env->GetSystemClock().get(); logger = info_log.get(); @@ -956,6 +966,10 @@ void ImmutableDBOptions::Dump(Logger* log) const { db_host_id.c_str()); ROCKS_LOG_HEADER(log, " Options.enforce_single_del_contracts: %s", enforce_single_del_contracts ? "true" : "false"); + ROCKS_LOG_HEADER(log, " Options.metadata_write_temperature: %s", + temperature_to_string[metadata_write_temperature].c_str()); + ROCKS_LOG_HEADER(log, " Options.wal_write_temperature: %s", + temperature_to_string[wal_write_temperature].c_str()); } bool ImmutableDBOptions::IsWalDirSameAsDBPath() const { diff --git a/options/db_options.h b/options/db_options.h index 5de6ab498a6..7e075262699 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -103,6 +103,8 @@ struct ImmutableDBOptions { uint64_t follower_refresh_catchup_period_ms; uint64_t follower_catchup_retry_count; uint64_t follower_catchup_retry_wait_ms; + Temperature metadata_write_temperature; + Temperature wal_write_temperature; // Beginning convenience/helper objects that are not part of the base // DBOptions diff --git a/options/options_helper.cc b/options/options_helper.cc index ec62dd1f5c9..011f47b9843 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -180,6 +180,15 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.enforce_single_del_contracts = immutable_db_options.enforce_single_del_contracts; options.daily_offpeak_time_utc = mutable_db_options.daily_offpeak_time_utc; + options.follower_refresh_catchup_period_ms = + immutable_db_options.follower_refresh_catchup_period_ms; + options.follower_catchup_retry_count = + immutable_db_options.follower_catchup_retry_count; + options.follower_catchup_retry_wait_ms = + immutable_db_options.follower_catchup_retry_wait_ms; + options.metadata_write_temperature = + immutable_db_options.metadata_write_temperature; + options.wal_write_temperature = immutable_db_options.wal_write_temperature; return options; } diff --git a/options/options_parser.cc b/options/options_parser.cc index ec32f764472..4e249908be1 100644 --- a/options/options_parser.cc +++ b/options/options_parser.cc @@ -69,8 +69,9 @@ Status PersistRocksDBOptions(const WriteOptions& write_options, } std::unique_ptr wf; - Status s = - fs->NewWritableFile(file_name, FileOptions(), &wf, nullptr); + FileOptions file_options; + file_options.temperature = db_opt.metadata_write_temperature; + Status s = fs->NewWritableFile(file_name, file_options, &wf, nullptr); if (!s.ok()) { return s; } diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 2bf349b1cbb..67aab055e18 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -367,7 +367,12 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "lowest_used_cache_tier=kNonVolatileBlockTier;" "allow_data_in_errors=false;" "enforce_single_del_contracts=false;" - "daily_offpeak_time_utc=08:30-19:00;", + "daily_offpeak_time_utc=08:30-19:00;" + "follower_refresh_catchup_period_ms=123;" + "follower_catchup_retry_count=456;" + "follower_catchup_retry_wait_ms=789;" + "metadata_write_temperature=kCold;" + "wal_write_temperature=kHot;", new_options)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions),