diff --git a/.travis.yml b/.travis.yml index 66f37a5d284..bcb852cf04e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,6 @@ before_install: - sudo dpkg -i libgflags-dev_2.0-1_amd64.deb # Lousy hack to disable use and testing of fallocate, which doesn't behave quite # as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment. - - sed -i "s/fallocate(/HACK_NO_fallocate(/" build_tools/build_detect_platform -script: make check -j8 +script: OPT=-DTRAVIS make check -j8 notifications: email: false diff --git a/HISTORY.md b/HISTORY.md index 420377cbf5a..5b144ff3a94 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,10 @@ # Rocksdb Change Log -### Unreleased +## Unreleased + +----- Past Releases ----- + +## 3.5.0 (9/3/2014) ### New Features * Add include/utilities/write_batch_with_index.h, providing a utilitiy class to query data out of WriteBatch when building it. * Move BlockBasedTable related options to BlockBasedTableOptions from Options. Change corresponding JNI interface. Options affected include: @@ -11,15 +15,12 @@ ### Public API changes * The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key. - ------ Past Releases ----- - - ## 3.4.0 (8/18/2014) ### New Features * Support Multiple DB paths in universal style compactions * Add feature of storing plain table index and bloom filter in SST file. * CompactRange() will never output compacted files to level 0. This used to be the case when all the compaction input files were at level 0. +* Added iterate_upper_bound to define the extent upto which the forward iterator will return entries. This will prevent iterating over delete markers and overwritten entries for edge cases where you want to break out the iterator anyways. This may improve perfomance in case there are a large number of delete markers or overwritten entries. ### Public API changes * DBOptions.db_paths now is a vector of a DBPath structure which indicates both of path and target size diff --git a/Makefile b/Makefile index c75274cd031..b9c00158ad1 100644 --- a/Makefile +++ b/Makefile @@ -132,7 +132,7 @@ TOOLS = \ options_test \ blob_store_bench -PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench $(TOOLS) +PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench cache_bench $(TOOLS) # The library name is configurable since we are maintaining libraries of both # debug/release mode. @@ -264,6 +264,9 @@ $(LIBRARY): $(LIBOBJECTS) db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +cache_bench: util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh index 5e335afde21..ee2d334f0bf 100755 --- a/build_tools/regression_build_test.sh +++ b/build_tools/regression_build_test.sh @@ -344,6 +344,38 @@ common_in_mem_args="--db=/dev/shm/rocksdb \ --threads=32 \ --writes_per_second=81920 > ${STAT_FILE}.seekwhilewriting_in_ram +# measure fillseq with bunch of column families +./db_bench \ + --benchmarks=fillseq \ + --num_column_families=500 \ + --write_buffer_size=1048576 \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + --num=$NUM \ + --writes=$NUM \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 > ${STAT_FILE}.fillseq_lots_column_families + +# measure overwrite performance with bunch of column families +./db_bench \ + --benchmarks=overwrite \ + --num_column_families=500 \ + --write_buffer_size=1048576 \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --num=$NUM \ + --writes=$((NUM / 10)) \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=8 > ${STAT_FILE}.overwrite_lots_column_families # send data to ods function send_to_ods { @@ -392,3 +424,5 @@ send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadr send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting send_benchmark_to_ods readwhilewriting readwhilewriting_in_ram ${STAT_FILE}.readwhilewriting_in_ram send_benchmark_to_ods seekrandomwhilewriting seekwhilewriting_in_ram ${STAT_FILE}.seekwhilewriting_in_ram +send_benchmark_to_ods fillseq fillseq_lots_column_families ${STAT_FILE}.fillseq_lots_column_families +send_benchmark_to_ods overwrite overwrite_lots_column_families ${STAT_FILE}.overwrite_lots_column_families diff --git a/db/builder.cc b/db/builder.cc index 1084f04138e..2c509437038 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -26,21 +26,24 @@ namespace rocksdb { class TableFactory; -TableBuilder* NewTableBuilder(const Options& options, +TableBuilder* NewTableBuilder(const ImmutableCFOptions& ioptions, const InternalKeyComparator& internal_comparator, WritableFile* file, - CompressionType compression_type) { - return options.table_factory->NewTableBuilder(options, internal_comparator, - file, compression_type); + const CompressionType compression_type, + const CompressionOptions& compression_opts) { + return ioptions.table_factory->NewTableBuilder( + ioptions, internal_comparator, file, compression_type, compression_opts); } -Status BuildTable(const std::string& dbname, Env* env, const Options& options, - const EnvOptions& soptions, TableCache* table_cache, +Status BuildTable(const std::string& dbname, Env* env, + const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, TableCache* table_cache, Iterator* iter, FileMetaData* meta, const InternalKeyComparator& internal_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, const CompressionType compression, + const CompressionOptions& compression_opts, const Env::IOPriority io_priority) { Status s; meta->fd.file_size = 0; @@ -50,23 +53,24 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, // If the sequence number of the smallest entry in the memtable is // smaller than the most recent snapshot, then we do not trigger // removal of duplicate/deleted keys as part of this builder. - bool purge = options.purge_redundant_kvs_while_flush; + bool purge = ioptions.purge_redundant_kvs_while_flush; if (earliest_seqno_in_memtable <= newest_snapshot) { purge = false; } - std::string fname = TableFileName(options.db_paths, meta->fd.GetNumber(), + std::string fname = TableFileName(ioptions.db_paths, meta->fd.GetNumber(), meta->fd.GetPathId()); if (iter->Valid()) { unique_ptr file; - s = env->NewWritableFile(fname, &file, soptions); + s = env->NewWritableFile(fname, &file, env_options); if (!s.ok()) { return s; } file->SetIOPriority(io_priority); - TableBuilder* builder = - NewTableBuilder(options, internal_comparator, file.get(), compression); + TableBuilder* builder = NewTableBuilder( + ioptions, internal_comparator, file.get(), + compression, compression_opts); // the first key is the smallest key Slice key = iter->key(); @@ -75,8 +79,8 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, meta->largest_seqno = meta->smallest_seqno; MergeHelper merge(internal_comparator.user_comparator(), - options.merge_operator.get(), options.info_log.get(), - options.min_partial_merge_operands, + ioptions.merge_operator, ioptions.info_log, + ioptions.min_partial_merge_operands, true /* internal key corruption is not ok */); if (purge) { @@ -196,12 +200,12 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, delete builder; // Finish and check for file errors - if (s.ok() && !options.disableDataSync) { - if (options.use_fsync) { - StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS); + if (s.ok() && !ioptions.disable_data_sync) { + if (ioptions.use_fsync) { + StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); s = file->Fsync(); } else { - StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS); + StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); s = file->Sync(); } } @@ -211,7 +215,7 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, if (s.ok()) { // Verify that the table is usable - Iterator* it = table_cache->NewIterator(ReadOptions(), soptions, + Iterator* it = table_cache->NewIterator(ReadOptions(), env_options, internal_comparator, meta->fd); s = it->status(); delete it; diff --git a/db/builder.h b/db/builder.h index f57501abd13..cf3ebd1ae0e 100644 --- a/db/builder.h +++ b/db/builder.h @@ -11,6 +11,7 @@ #include "rocksdb/status.h" #include "rocksdb/types.h" #include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" namespace rocksdb { @@ -26,8 +27,10 @@ class TableBuilder; class WritableFile; extern TableBuilder* NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type); + const ImmutableCFOptions& options, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts); // Build a Table file from the contents of *iter. The generated file // will be named according to number specified in meta. On success, the rest of @@ -35,13 +38,15 @@ extern TableBuilder* NewTableBuilder( // If no data is present in *iter, meta->file_size will be set to // zero, and no Table file will be produced. extern Status BuildTable(const std::string& dbname, Env* env, - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& options, + const EnvOptions& env_options, TableCache* table_cache, Iterator* iter, FileMetaData* meta, const InternalKeyComparator& internal_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, const CompressionType compression, + const CompressionOptions& compression_opts, const Env::IOPriority io_priority = Env::IO_HIGH); } // namespace rocksdb diff --git a/db/c.cc b/db/c.cc index 3114f350042..9ea54964602 100644 --- a/db/c.cc +++ b/db/c.cc @@ -1844,6 +1844,13 @@ void rocksdb_readoptions_set_snapshot( opt->rep.snapshot = (snap ? snap->rep : nullptr); } +void rocksdb_readoptions_set_iterate_upper_bound( + rocksdb_readoptions_t* opt, + const char* key, size_t keylen) { + Slice prefix = Slice(key, keylen); + opt->rep.iterate_upper_bound = &prefix; +} + void rocksdb_readoptions_set_read_tier( rocksdb_readoptions_t* opt, int v) { opt->rep.read_tier = static_cast(v); diff --git a/db/column_family.cc b/db/column_family.cc index b1c9ba7e83c..7e06c9bd7cd 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -178,7 +178,7 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, Version* dummy_versions, Cache* table_cache, const ColumnFamilyOptions& options, const DBOptions* db_options, - const EnvOptions& storage_options, + const EnvOptions& env_options, ColumnFamilySet* column_family_set) : id_(id), name_(name), @@ -188,6 +188,7 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, dropped_(false), internal_comparator_(options.comparator), options_(*db_options, SanitizeOptions(&internal_comparator_, options)), + ioptions_(options_), mem_(nullptr), imm_(options_.min_write_buffer_number_to_merge), super_version_(nullptr), @@ -204,7 +205,7 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, if (dummy_versions != nullptr) { internal_stats_.reset( new InternalStats(options_.num_levels, db_options->env, this)); - table_cache_.reset(new TableCache(&options_, storage_options, table_cache)); + table_cache_.reset(new TableCache(ioptions_, env_options, table_cache)); if (options_.compaction_style == kCompactionStyleUniversal) { compaction_picker_.reset( new UniversalCompactionPicker(&options_, &internal_comparator_)); @@ -306,7 +307,7 @@ void ColumnFamilyData::RecalculateWriteStallRateLimitsConditions() { } const EnvOptions* ColumnFamilyData::soptions() const { - return &(column_family_set_->storage_options_); + return &(column_family_set_->env_options_); } void ColumnFamilyData::SetCurrent(Version* current) { @@ -462,16 +463,16 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() { ColumnFamilySet::ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, - const EnvOptions& storage_options, + const EnvOptions& env_options, Cache* table_cache) : max_column_family_(0), dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, ColumnFamilyOptions(), db_options, - storage_options_, nullptr)), + env_options_, nullptr)), default_cfd_cache_(nullptr), db_name_(dbname), db_options_(db_options), - storage_options_(storage_options), + env_options_(env_options), table_cache_(table_cache), spin_lock_(ATOMIC_FLAG_INIT) { // initialize linked list @@ -537,7 +538,7 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( assert(column_families_.find(name) == column_families_.end()); ColumnFamilyData* new_cfd = new ColumnFamilyData(id, name, dummy_versions, table_cache_, options, - db_options_, storage_options_, this); + db_options_, env_options_, this); Lock(); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); diff --git a/db/column_family.h b/db/column_family.h index 33bceadc62c..a68189d5132 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -165,9 +165,11 @@ class ColumnFamilyData { void SetLogNumber(uint64_t log_number) { log_number_ = log_number; } uint64_t GetLogNumber() const { return log_number_; } - // thread-safe + // TODO(ljin): make this API thread-safe once we allow updating options_ const Options* options() const { return &options_; } + // thread-safe const EnvOptions* soptions() const; + const ImmutableCFOptions* ioptions() const { return &ioptions_; } InternalStats* internal_stats() { return internal_stats_.get(); } @@ -251,7 +253,7 @@ class ColumnFamilyData { Version* dummy_versions, Cache* table_cache, const ColumnFamilyOptions& options, const DBOptions* db_options, - const EnvOptions& storage_options, + const EnvOptions& env_options, ColumnFamilySet* column_family_set); // Recalculate some small conditions, which are changed only during @@ -272,7 +274,8 @@ class ColumnFamilyData { const InternalKeyComparator internal_comparator_; - Options const options_; + const Options options_; + const ImmutableCFOptions ioptions_; std::unique_ptr table_cache_; @@ -367,7 +370,7 @@ class ColumnFamilySet { }; ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, - const EnvOptions& storage_options, Cache* table_cache); + const EnvOptions& env_options, Cache* table_cache); ~ColumnFamilySet(); ColumnFamilyData* GetDefault() const; @@ -420,7 +423,7 @@ class ColumnFamilySet { const std::string db_name_; const DBOptions* const db_options_; - const EnvOptions storage_options_; + const EnvOptions env_options_; Cache* table_cache_; std::atomic_flag spin_lock_; }; diff --git a/db/compaction.cc b/db/compaction.cc index 0bffa0162fa..cf0b682aa8b 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -9,7 +9,10 @@ #include "db/compaction.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index e05d07776e5..6e9a46ed442 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -9,7 +9,10 @@ #include "db/compaction_picker.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include "db/filename.h" diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index c1e59b1b568..2652d1776c8 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -245,14 +245,38 @@ TEST(CuckooTableDBTest, CompactionTrigger) { ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx))); } dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_EQ("2", FilesPerLevel()); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); ASSERT_EQ("0,2", FilesPerLevel()); for (int idx = 0; idx < 22; ++idx) { ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx))); } } +TEST(CuckooTableDBTest, CompactionIntoMultipleFiles) { + // Create a big L0 file and check it compacts into multiple files in L1. + Options options = CurrentOptions(); + options.write_buffer_size = 270 << 10; + // Two SST files should be created, each containing 14 keys. + // Number of buckets will be 16. Total size ~156 KB. + options.target_file_size_base = 160 << 10; + Reopen(&options); + + // Write 28 values, each 10016 B ~ 10KB + for (int idx = 0; idx < 28; ++idx) { + ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx))); + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ("1", FilesPerLevel()); + + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_EQ("0,2", FilesPerLevel()); + for (int idx = 0; idx < 28; ++idx) { + ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx))); + } +} + TEST(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) { // Insert same key twice so that they go to different SST files. Then wait for // compaction and check if the latest value is stored and old value removed. diff --git a/db/db_bench.cc b/db/db_bench.cc index 2f88e81ffaa..ced93f227f5 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -7,7 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif #ifndef GFLAGS #include @@ -1108,6 +1110,8 @@ class Benchmark { } ~Benchmark() { + std::for_each(db_.cfh.begin(), db_.cfh.end(), + [](ColumnFamilyHandle* cfh) { delete cfh; }); delete db_.db; delete prefix_extractor_; } @@ -1332,6 +1336,8 @@ class Benchmark { method = nullptr; } else { if (db_.db != nullptr) { + std::for_each(db_.cfh.begin(), db_.cfh.end(), + [](ColumnFamilyHandle* cfh) { delete cfh; }); delete db_.db; db_.db = nullptr; db_.cfh.clear(); diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 4185a40cab9..9f05b8d3079 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -9,7 +9,10 @@ #ifndef ROCKSDB_LITE +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include @@ -29,9 +32,9 @@ Status DBImpl::DisableFileDeletions() { MutexLock l(&mutex_); ++disable_delete_obsolete_files_; if (disable_delete_obsolete_files_ == 1) { - Log(options_.info_log, "File Deletions Disabled"); + Log(db_options_.info_log, "File Deletions Disabled"); } else { - Log(options_.info_log, + Log(db_options_.info_log, "File Deletions Disabled, but already disabled. Counter: %d", disable_delete_obsolete_files_); } @@ -50,11 +53,11 @@ Status DBImpl::EnableFileDeletions(bool force) { --disable_delete_obsolete_files_; } if (disable_delete_obsolete_files_ == 0) { - Log(options_.info_log, "File Deletions Enabled"); + Log(db_options_.info_log, "File Deletions Enabled"); should_purge_files = true; FindObsoleteFiles(deletion_state, true); } else { - Log(options_.info_log, + Log(db_options_.info_log, "File Deletions Enable, but not really enabled. Counter: %d", disable_delete_obsolete_files_); } @@ -62,7 +65,7 @@ Status DBImpl::EnableFileDeletions(bool force) { if (should_purge_files) { PurgeObsoleteFiles(deletion_state); } - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); return Status::OK(); } @@ -95,7 +98,7 @@ Status DBImpl::GetLiveFiles(std::vector& ret, if (!status.ok()) { mutex_.Unlock(); - Log(options_.info_log, "Cannot Flush data %s\n", + Log(db_options_.info_log, "Cannot Flush data %s\n", status.ToString().c_str()); return status; } @@ -133,7 +136,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { Status s; // list wal files in main db dir. VectorLogPtr logs; - s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile); + s = GetSortedWalsOfType(db_options_.wal_dir, logs, kAliveLogFile); if (!s.ok()) { return s; } @@ -146,7 +149,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { files.clear(); // list wal files in archive dir. - std::string archivedir = ArchivalDirectory(options_.wal_dir); + std::string archivedir = ArchivalDirectory(db_options_.wal_dir); if (env_->FileExists(archivedir)) { s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile); if (!s.ok()) { @@ -157,7 +160,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { uint64_t latest_archived_log_number = 0; if (!files.empty()) { latest_archived_log_number = files.back()->LogNumber(); - Log(options_.info_log, "Latest Archived log: %" PRIu64, + Log(db_options_.info_log, "Latest Archived log: %" PRIu64, latest_archived_log_number); } @@ -170,7 +173,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { // same log in both db dir and archived dir. Simply // ignore the one in db dir. Note that, if we read // archived dir first, we would have missed the log file. - Log(options_.info_log, "%s already moved to archive", + Log(db_options_.info_log, "%s already moved to archive", log->PathName().c_str()); } } diff --git a/db/db_impl.cc b/db/db_impl.cc index 7c65e9a61d3..b83d60f5e0c 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -9,7 +9,10 @@ #include "db/db_impl.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include @@ -74,20 +77,6 @@ const std::string kDefaultColumnFamilyName("default"); void DumpLeveldbBuildVersion(Logger * log); -// Information kept for every waiting writer -struct DBImpl::Writer { - Status status; - WriteBatch* batch; - bool sync; - bool disableWAL; - bool in_batch_group; - bool done; - uint64_t timeout_hint_us; - port::CondVar cv; - - explicit Writer(port::Mutex* mu) : cv(mu) { } -}; - struct DBImpl::WriteContext { autovector superversions_to_free_; autovector logs_to_free_; @@ -332,8 +321,8 @@ CompressionType GetCompressionFlush(const Options& options) { DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) : env_(options.env), dbname_(dbname), - options_(SanitizeOptions(dbname, options)), - stats_(options_.statistics.get()), + db_options_(SanitizeOptions(dbname, options)), + stats_(db_options_.statistics.get()), db_lock_(nullptr), mutex_(options.use_adaptive_mutex), shutting_down_(nullptr), @@ -356,7 +345,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) default_interval_to_delete_obsolete_WAL_(600), flush_on_destroy_(false), delayed_writes_(0), - storage_options_(options), + env_options_(options), bg_work_gate_closed_(false), refitting_level_(false), opened_successfully_(false) { @@ -364,23 +353,23 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) // Reserve ten files or so for other uses and give the rest to TableCache. // Give a large number for setting of "infinite" open files. - const int table_cache_size = - (options_.max_open_files == -1) ? 4194304 : options_.max_open_files - 10; + const int table_cache_size = (db_options_.max_open_files == -1) ? + 4194304 : db_options_.max_open_files - 10; // Reserve ten files or so for other uses and give the rest to TableCache. table_cache_ = - NewLRUCache(table_cache_size, options_.table_cache_numshardbits, - options_.table_cache_remove_scan_count_limit); + NewLRUCache(table_cache_size, db_options_.table_cache_numshardbits, + db_options_.table_cache_remove_scan_count_limit); versions_.reset( - new VersionSet(dbname_, &options_, storage_options_, table_cache_.get())); + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get())); column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); - DumpLeveldbBuildVersion(options_.info_log.get()); - DumpDBFileSummary(options_, dbname_); - options_.Dump(options_.info_log.get()); + DumpLeveldbBuildVersion(db_options_.info_log.get()); + DumpDBFileSummary(db_options_, dbname_); + db_options_.Dump(db_options_.info_log.get()); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); } DBImpl::~DBImpl() { @@ -411,7 +400,7 @@ DBImpl::~DBImpl() { mutex_.Lock(); } - if (options_.allow_thread_local) { + if (db_options_.allow_thread_local) { // Clean up obsolete files due to SuperVersion release. // (1) Need to delete to obsolete files before closing because RepairDB() // scans all existing files in the file system and builds manifest file. @@ -440,7 +429,7 @@ DBImpl::~DBImpl() { env_->UnlockFile(db_lock_); } - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); } Status DBImpl::NewDB() { @@ -449,15 +438,15 @@ Status DBImpl::NewDB() { new_db.SetNextFile(2); new_db.SetLastSequence(0); - Log(options_.info_log, "Creating manifest 1 \n"); + Log(db_options_.info_log, "Creating manifest 1 \n"); const std::string manifest = DescriptorFileName(dbname_, 1); unique_ptr file; Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(storage_options_)); + manifest, &file, env_->OptimizeForManifestWrite(env_options_)); if (!s.ok()) { return s; } - file->SetPreallocationBlockSize(options_.manifest_preallocation_size); + file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size); { log::Writer log(std::move(file)); std::string record; @@ -474,38 +463,38 @@ Status DBImpl::NewDB() { } void DBImpl::MaybeIgnoreError(Status* s) const { - if (s->ok() || options_.paranoid_checks) { + if (s->ok() || db_options_.paranoid_checks) { // No change needed } else { - Log(options_.info_log, "Ignoring error %s", s->ToString().c_str()); + Log(db_options_.info_log, "Ignoring error %s", s->ToString().c_str()); *s = Status::OK(); } } const Status DBImpl::CreateArchivalDirectory() { - if (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0) { - std::string archivalPath = ArchivalDirectory(options_.wal_dir); + if (db_options_.WAL_ttl_seconds > 0 || db_options_.WAL_size_limit_MB > 0) { + std::string archivalPath = ArchivalDirectory(db_options_.wal_dir); return env_->CreateDirIfMissing(archivalPath); } return Status::OK(); } void DBImpl::PrintStatistics() { - auto dbstats = options_.statistics.get(); + auto dbstats = db_options_.statistics.get(); if (dbstats) { - Log(options_.info_log, + Log(db_options_.info_log, "STATISTCS:\n %s", dbstats->ToString().c_str()); } } void DBImpl::MaybeDumpStats() { - if (options_.stats_dump_period_sec == 0) return; + if (db_options_.stats_dump_period_sec == 0) return; const uint64_t now_micros = env_->NowMicros(); if (last_stats_dump_time_microsec_ + - options_.stats_dump_period_sec * 1000000 + db_options_.stats_dump_period_sec * 1000000 <= now_micros) { // Multiple threads could race in here simultaneously. // However, the last one will update last_stats_dump_time_microsec_ @@ -529,8 +518,8 @@ void DBImpl::MaybeDumpStats() { default_cf_internal_stats_->GetStringProperty(db_property_type, "rocksdb.dbstats", &stats); } - Log(options_.info_log, "------- DUMPING STATS -------"); - Log(options_.info_log, "%s", stats.c_str()); + Log(db_options_.info_log, "------- DUMPING STATS -------"); + Log(db_options_.info_log, "%s", stats.c_str()); PrintStatistics(); } @@ -540,7 +529,7 @@ void DBImpl::MaybeDumpStats() { // of all files in the filesystem in 'candidate_files'. // no_full_scan = true -- never do the full scan using GetChildren() // force = false -- don't force the full scan, except every -// options_.delete_obsolete_files_period_micros +// db_options_.delete_obsolete_files_period_micros // force = true -- force the full scan void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, bool force, @@ -557,12 +546,12 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, // logic for figurint out if we're doing the full scan if (no_full_scan) { doing_the_full_scan = false; - } else if (force || options_.delete_obsolete_files_period_micros == 0) { + } else if (force || db_options_.delete_obsolete_files_period_micros == 0) { doing_the_full_scan = true; } else { const uint64_t now_micros = env_->NowMicros(); if (delete_obsolete_files_last_run_ + - options_.delete_obsolete_files_period_micros < now_micros) { + db_options_.delete_obsolete_files_period_micros < now_micros) { doing_the_full_scan = true; delete_obsolete_files_last_run_ = now_micros; } @@ -594,11 +583,12 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, versions_->AddLiveFiles(&deletion_state.sst_live); if (doing_the_full_scan) { - for (uint32_t path_id = 0; path_id < options_.db_paths.size(); path_id++) { + for (uint32_t path_id = 0; + path_id < db_options_.db_paths.size(); path_id++) { // set of all files in the directory. We'll exclude files that are still // alive in the subsequent processings. std::vector files; - env_->GetChildren(options_.db_paths[path_id].path, + env_->GetChildren(db_options_.db_paths[path_id].path, &files); // Ignore errors for (std::string file : files) { deletion_state.candidate_files.emplace_back(file, path_id); @@ -606,17 +596,18 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, } //Add log files in wal_dir - if (options_.wal_dir != dbname_) { + if (db_options_.wal_dir != dbname_) { std::vector log_files; - env_->GetChildren(options_.wal_dir, &log_files); // Ignore errors + env_->GetChildren(db_options_.wal_dir, &log_files); // Ignore errors for (std::string log_file : log_files) { deletion_state.candidate_files.emplace_back(log_file, 0); } } // Add info log files in db_log_dir - if (!options_.db_log_dir.empty() && options_.db_log_dir != dbname_) { + if (!db_options_.db_log_dir.empty() && db_options_.db_log_dir != dbname_) { std::vector info_log_files; - env_->GetChildren(options_.db_log_dir, &info_log_files); // Ignore errors + // Ignore errors + env_->GetChildren(db_options_.db_log_dir, &info_log_files); for (std::string log_file : info_log_files) { deletion_state.candidate_files.emplace_back(log_file, 0); } @@ -687,7 +678,7 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { candidate_files.end()); std::vector old_info_log_files; - InfoLogPrefix info_log_prefix(!options_.db_log_dir.empty(), dbname_); + InfoLogPrefix info_log_prefix(!db_options_.db_log_dir.empty(), dbname_); for (const auto& candidate_file : candidate_files) { std::string to_delete = candidate_file.file_name; uint32_t path_id = candidate_file.path_id; @@ -743,51 +734,51 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { if (type == kTableFile) { // evict from cache TableCache::Evict(table_cache_.get(), number); - fname = TableFileName(options_.db_paths, number, path_id); + fname = TableFileName(db_options_.db_paths, number, path_id); } else { - fname = - ((type == kLogFile) ? options_.wal_dir : dbname_) + "/" + to_delete; + fname = ((type == kLogFile) ? + db_options_.wal_dir : dbname_) + "/" + to_delete; } if (type == kLogFile && - (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0)) { - auto archived_log_name = ArchivedLogFileName(options_.wal_dir, number); + (db_options_.WAL_ttl_seconds > 0 || + db_options_.WAL_size_limit_MB > 0)) { + auto archived_log_name = ArchivedLogFileName(db_options_.wal_dir, number); // The sync point below is used in (DBTest,TransactionLogIteratorRace) TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:1"); Status s = env_->RenameFile(fname, archived_log_name); // The sync point below is used in (DBTest,TransactionLogIteratorRace) TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:2"); - Log(options_.info_log, + Log(db_options_.info_log, "Move log file %s to %s -- %s\n", fname.c_str(), archived_log_name.c_str(), s.ToString().c_str()); } else { Status s = env_->DeleteFile(fname); - Log(options_.info_log, "Delete %s type=%d #%" PRIu64 " -- %s\n", + Log(db_options_.info_log, "Delete %s type=%d #%" PRIu64 " -- %s\n", fname.c_str(), type, number, s.ToString().c_str()); } } // Delete old info log files. size_t old_info_log_file_count = old_info_log_files.size(); - if (old_info_log_file_count >= options_.keep_log_file_num) { + if (old_info_log_file_count >= db_options_.keep_log_file_num) { std::sort(old_info_log_files.begin(), old_info_log_files.end()); - size_t end = old_info_log_file_count - options_.keep_log_file_num; + size_t end = old_info_log_file_count - db_options_.keep_log_file_num; for (unsigned int i = 0; i <= end; i++) { std::string& to_delete = old_info_log_files.at(i); - std::string full_path_to_delete = - (options_.db_log_dir.empty() ? dbname_ : options_.db_log_dir) + "/" + - to_delete; - Log(options_.info_log, "Delete info log file %s\n", + std::string full_path_to_delete = (db_options_.db_log_dir.empty() ? + dbname_ : db_options_.db_log_dir) + "/" + to_delete; + Log(db_options_.info_log, "Delete info log file %s\n", full_path_to_delete.c_str()); Status s = env_->DeleteFile(full_path_to_delete); if (!s.ok()) { - Log(options_.info_log, "Delete info log file %s FAILED -- %s\n", + Log(db_options_.info_log, "Delete info log file %s FAILED -- %s\n", to_delete.c_str(), s.ToString().c_str()); } } } PurgeObsoleteWALFiles(); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); } void DBImpl::DeleteObsoleteFiles() { @@ -809,8 +800,8 @@ void DBImpl::DeleteObsoleteFiles() { // b. get sorted non-empty archived logs // c. delete what should be deleted void DBImpl::PurgeObsoleteWALFiles() { - bool const ttl_enabled = options_.WAL_ttl_seconds > 0; - bool const size_limit_enabled = options_.WAL_size_limit_MB > 0; + bool const ttl_enabled = db_options_.WAL_ttl_seconds > 0; + bool const size_limit_enabled = db_options_.WAL_size_limit_MB > 0; if (!ttl_enabled && !size_limit_enabled) { return; } @@ -818,13 +809,14 @@ void DBImpl::PurgeObsoleteWALFiles() { int64_t current_time; Status s = env_->GetCurrentTime(¤t_time); if (!s.ok()) { - Log(options_.info_log, "Can't get current time: %s", s.ToString().c_str()); + Log(db_options_.info_log, "Can't get current time: %s", + s.ToString().c_str()); assert(false); return; } uint64_t const now_seconds = static_cast(current_time); uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) ? - options_.WAL_ttl_seconds / 2 : default_interval_to_delete_obsolete_WAL_; + db_options_.WAL_ttl_seconds / 2 : default_interval_to_delete_obsolete_WAL_; if (purge_wal_files_last_run_ + time_to_check > now_seconds) { return; @@ -832,11 +824,12 @@ void DBImpl::PurgeObsoleteWALFiles() { purge_wal_files_last_run_ = now_seconds; - std::string archival_dir = ArchivalDirectory(options_.wal_dir); + std::string archival_dir = ArchivalDirectory(db_options_.wal_dir); std::vector files; s = env_->GetChildren(archival_dir, &files); if (!s.ok()) { - Log(options_.info_log, "Can't get archive files: %s", s.ToString().c_str()); + Log(db_options_.info_log, "Can't get archive files: %s", + s.ToString().c_str()); assert(false); return; } @@ -854,14 +847,14 @@ void DBImpl::PurgeObsoleteWALFiles() { Status const s = env_->GetFileModificationTime(file_path, &file_m_time); if (!s.ok()) { - Log(options_.info_log, "Can't get file mod time: %s: %s", + Log(db_options_.info_log, "Can't get file mod time: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } - if (now_seconds - file_m_time > options_.WAL_ttl_seconds) { + if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) { Status const s = env_->DeleteFile(file_path); if (!s.ok()) { - Log(options_.info_log, "Can't delete file: %s: %s", + Log(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } else { @@ -876,7 +869,7 @@ void DBImpl::PurgeObsoleteWALFiles() { uint64_t file_size; Status const s = env_->GetFileSize(file_path, &file_size); if (!s.ok()) { - Log(options_.info_log, "Can't get file size: %s: %s", + Log(db_options_.info_log, "Can't get file size: %s: %s", file_path.c_str(), s.ToString().c_str()); return; } else { @@ -886,7 +879,7 @@ void DBImpl::PurgeObsoleteWALFiles() { } else { Status s = env_->DeleteFile(file_path); if (!s.ok()) { - Log(options_.info_log, "Can't delete file: %s: %s", + Log(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } else { @@ -903,7 +896,7 @@ void DBImpl::PurgeObsoleteWALFiles() { return; } - size_t const files_keep_num = options_.WAL_size_limit_MB * + size_t const files_keep_num = db_options_.WAL_size_limit_MB * 1024 * 1024 / log_file_size; if (log_files_num <= files_keep_num) { return; @@ -914,7 +907,7 @@ void DBImpl::PurgeObsoleteWALFiles() { GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); if (files_del_num > archived_logs.size()) { - Log(options_.info_log, "Trying to delete more archived log files than " + Log(db_options_.info_log, "Trying to delete more archived log files than " "exist. Deleting all"); files_del_num = archived_logs.size(); } @@ -923,7 +916,7 @@ void DBImpl::PurgeObsoleteWALFiles() { std::string const file_path = archived_logs[i]->PathName(); Status const s = DeleteFile(file_path); if (!s.ok()) { - Log(options_.info_log, "Can't delete file: %s: %s", + Log(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } else { @@ -1031,7 +1024,7 @@ Status DBImpl::ReadFirstRecord(const WalFileType type, const uint64_t number, } Status s; if (type == kAliveLogFile) { - std::string fname = LogFileName(options_.wal_dir, number); + std::string fname = LogFileName(db_options_.wal_dir, number); s = ReadFirstLine(fname, sequence); if (env_->FileExists(fname) && !s.ok()) { // return any error that is not caused by non-existing file @@ -1041,7 +1034,8 @@ Status DBImpl::ReadFirstRecord(const WalFileType type, const uint64_t number, if (type == kArchivedLogFile || !s.ok()) { // check if the file got moved to archive. - std::string archived_file = ArchivedLogFileName(options_.wal_dir, number); + std::string archived_file = + ArchivedLogFileName(db_options_.wal_dir, number); s = ReadFirstLine(archived_file, sequence); } @@ -1062,7 +1056,7 @@ Status DBImpl::ReadFirstLine(const std::string& fname, const char* fname; Status* status; - bool ignore_error; // true if options_.paranoid_checks==false + bool ignore_error; // true if db_options_.paranoid_checks==false virtual void Corruption(size_t bytes, const Status& s) { Log(info_log, "%s%s: dropping %d bytes; %s", (this->ignore_error ? "(ignoring error) " : ""), fname, @@ -1075,7 +1069,7 @@ Status DBImpl::ReadFirstLine(const std::string& fname, }; unique_ptr file; - Status status = env_->NewSequentialFile(fname, &file, storage_options_); + Status status = env_->NewSequentialFile(fname, &file, env_options_); if (!status.ok()) { return status; @@ -1083,17 +1077,17 @@ Status DBImpl::ReadFirstLine(const std::string& fname, LogReporter reporter; reporter.env = env_; - reporter.info_log = options_.info_log.get(); + reporter.info_log = db_options_.info_log.get(); reporter.fname = fname.c_str(); reporter.status = &status; - reporter.ignore_error = !options_.paranoid_checks; + reporter.ignore_error = !db_options_.paranoid_checks; log::Reader reader(std::move(file), &reporter, true /*checksum*/, 0 /*initial_offset*/); std::string scratch; Slice record; if (reader.ReadRecord(&record, &scratch) && - (status.ok() || !options_.paranoid_checks)) { + (status.ok() || !db_options_.paranoid_checks)) { if (record.size() < 12) { reporter.Corruption(record.size(), Status::Corruption("log record too small")); @@ -1134,7 +1128,7 @@ Status DBImpl::Recover( return s; } - for (auto& db_path : options_.db_paths) { + for (auto& db_path : db_options_.db_paths) { s = env_->CreateDirIfMissing(db_path.path); if (!s.ok()) { return s; @@ -1152,7 +1146,7 @@ Status DBImpl::Recover( } if (!env_->FileExists(CurrentFileName(dbname_))) { - if (options_.create_if_missing) { + if (db_options_.create_if_missing) { s = NewDB(); is_new_db = true; if (!s.ok()) { @@ -1163,7 +1157,7 @@ Status DBImpl::Recover( dbname_, "does not exist (create_if_missing is false)"); } } else { - if (options_.error_if_exists) { + if (db_options_.error_if_exists) { return Status::InvalidArgument( dbname_, "exists (error_if_exists is true)"); } @@ -1178,7 +1172,7 @@ Status DBImpl::Recover( } Status s = versions_->Recover(column_families, read_only); - if (options_.paranoid_checks && s.ok()) { + if (db_options_.paranoid_checks && s.ok()) { s = CheckConsistency(); } if (s.ok()) { @@ -1199,7 +1193,7 @@ Status DBImpl::Recover( const uint64_t min_log = versions_->MinLogNumber(); const uint64_t prev_log = versions_->PrevLogNumber(); std::vector filenames; - s = env_->GetChildren(options_.wal_dir, &filenames); + s = env_->GetChildren(db_options_.wal_dir, &filenames); if (!s.ok()) { return s; } @@ -1252,8 +1246,8 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, Env* env; Logger* info_log; const char* fname; - Status* status; // nullptr if options_.paranoid_checks==false or - // options_.skip_log_error_on_recovery==true + Status* status; // nullptr if db_options_.paranoid_checks==false or + // db_options_.skip_log_error_on_recovery==true virtual void Corruption(size_t bytes, const Status& s) { Log(info_log, "%s%s: dropping %d bytes; %s", (this->status == nullptr ? "(ignoring error) " : ""), @@ -1273,9 +1267,9 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, } // Open the log file - std::string fname = LogFileName(options_.wal_dir, log_number); + std::string fname = LogFileName(db_options_.wal_dir, log_number); unique_ptr file; - Status status = env_->NewSequentialFile(fname, &file, storage_options_); + Status status = env_->NewSequentialFile(fname, &file, env_options_); if (!status.ok()) { MaybeIgnoreError(&status); return status; @@ -1284,17 +1278,18 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, // Create the log reader. LogReporter reporter; reporter.env = env_; - reporter.info_log = options_.info_log.get(); + reporter.info_log = db_options_.info_log.get(); reporter.fname = fname.c_str(); - reporter.status = (options_.paranoid_checks && - !options_.skip_log_error_on_recovery ? &status : nullptr); + reporter.status = (db_options_.paranoid_checks && + !db_options_.skip_log_error_on_recovery ? &status + : nullptr); // We intentially make log::Reader do checksumming even if // paranoid_checks==false so that corruptions cause entire commits // to be skipped instead of propagating bad information (like overly // large sequence numbers). log::Reader reader(std::move(file), &reporter, true/*checksum*/, 0/*initial_offset*/); - Log(options_.info_log, "Recovering log #%" PRIu64 "", log_number); + Log(db_options_.info_log, "Recovering log #%" PRIu64 "", log_number); // Read all the records and add to a memtable std::string scratch; @@ -1415,30 +1410,32 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, pending_outputs_[meta.fd.GetNumber()] = 0; // path 0 for level 0 file. ReadOptions ro; ro.total_order_seek = true; - Iterator* iter = mem->NewIterator(ro); - const SequenceNumber newest_snapshot = snapshots_.GetNewest(); - const SequenceNumber earliest_seqno_in_memtable = - mem->GetFirstSequenceNumber(); - Log(options_.info_log, "[%s] Level-0 table #%" PRIu64 ": started", - cfd->GetName().c_str(), meta.fd.GetNumber()); - + Arena arena; Status s; { - mutex_.Unlock(); - s = BuildTable(dbname_, env_, *cfd->options(), storage_options_, - cfd->table_cache(), iter, &meta, cfd->internal_comparator(), - newest_snapshot, earliest_seqno_in_memtable, - GetCompressionFlush(*cfd->options()), Env::IO_HIGH); - LogFlush(options_.info_log); - mutex_.Lock(); - } + ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); + const SequenceNumber newest_snapshot = snapshots_.GetNewest(); + const SequenceNumber earliest_seqno_in_memtable = + mem->GetFirstSequenceNumber(); + Log(db_options_.info_log, "[%s] Level-0 table #%" PRIu64 ": started", + cfd->GetName().c_str(), meta.fd.GetNumber()); - Log(options_.info_log, - "[%s] Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", - cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), - s.ToString().c_str()); - delete iter; + { + mutex_.Unlock(); + s = BuildTable( + dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(), + iter.get(), &meta, cfd->internal_comparator(), newest_snapshot, + earliest_seqno_in_memtable, GetCompressionFlush(*cfd->options()), + cfd->options()->compression_opts, Env::IO_HIGH); + LogFlush(db_options_.info_log); + mutex_.Lock(); + } + Log(db_options_.info_log, + "[%s] Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", + cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), + s.ToString().c_str()); + } pending_outputs_.erase(meta.fd.GetNumber()); // Note that if file_size is zero, the file has been deleted and @@ -1484,29 +1481,34 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, std::vector memtables; ReadOptions ro; ro.total_order_seek = true; + Arena arena; for (MemTable* m : mems) { - Log(options_.info_log, + Log(db_options_.info_log, "[%s] Flushing memtable with next log file: %" PRIu64 "\n", cfd->GetName().c_str(), m->GetNextLogNumber()); - memtables.push_back(m->NewIterator(ro)); + memtables.push_back(m->NewIterator(ro, &arena)); } - Iterator* iter = NewMergingIterator(&cfd->internal_comparator(), - &memtables[0], memtables.size()); - Log(options_.info_log, "[%s] Level-0 flush table #%" PRIu64 ": started", - cfd->GetName().c_str(), meta.fd.GetNumber()); - - s = BuildTable(dbname_, env_, *cfd->options(), storage_options_, - cfd->table_cache(), iter, &meta, cfd->internal_comparator(), - newest_snapshot, earliest_seqno_in_memtable, - GetCompressionFlush(*cfd->options()), Env::IO_HIGH); - LogFlush(options_.info_log); - delete iter; - Log(options_.info_log, + { + ScopedArenaIterator iter(NewMergingIterator(&cfd->internal_comparator(), + &memtables[0], + memtables.size(), &arena)); + Log(db_options_.info_log, + "[%s] Level-0 flush table #%" PRIu64 ": started", + cfd->GetName().c_str(), meta.fd.GetNumber()); + + s = BuildTable( + dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(), + iter.get(), &meta, cfd->internal_comparator(), newest_snapshot, + earliest_seqno_in_memtable, GetCompressionFlush(*cfd->options()), + cfd->options()->compression_opts, Env::IO_HIGH); + LogFlush(db_options_.info_log); + } + Log(db_options_.info_log, "[%s] Level-0 flush table #%" PRIu64 ": %" PRIu64 " bytes %s", cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), s.ToString().c_str()); - if (!options_.disableDataSync) { + if (!db_options_.disableDataSync) { db_directory_->Fsync(); } mutex_.Lock(); @@ -1535,7 +1537,7 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, // insert files directly into higher levels because some other // threads could be concurrently producing compacted files for // that key range. - if (base != nullptr && options_.max_background_compactions <= 1 && + if (base != nullptr && db_options_.max_background_compactions <= 1 && cfd->options()->compaction_style == kCompactionStyleLevel) { level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); } @@ -1597,7 +1599,7 @@ Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd, } else { // Replace immutable memtable with the generated Table s = cfd->imm()->InstallMemtableFlushResults( - cfd, mems, versions_.get(), &mutex_, options_.info_log.get(), + cfd, mems, versions_.get(), &mutex_, db_options_.info_log.get(), file_number, &pending_outputs_, &deletion_state.memtables_to_free, db_directory_.get(), log_buffer); } @@ -1623,7 +1625,7 @@ Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd, } } - if (!s.ok() && !s.IsShutdownInProgress() && options_.paranoid_checks && + if (!s.ok() && !s.IsShutdownInProgress() && db_options_.paranoid_checks && bg_error_.ok()) { // if a bad error happened (not ShutdownInProgress) and paranoid_checks is // true, mark DB read-only @@ -1637,7 +1639,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end, bool reduce_level, int target_level, uint32_t target_path_id) { - if (target_path_id >= options_.db_paths.size()) { + if (target_path_id >= db_options_.db_paths.size()) { return Status::InvalidArgument("Invalid target path ID"); } @@ -1646,7 +1648,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, Status s = FlushMemTable(cfd, FlushOptions()); if (!s.ok()) { - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); return s; } @@ -1674,7 +1676,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, end); } if (!s.ok()) { - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); return s; } } @@ -1682,7 +1684,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, if (reduce_level) { s = ReFitLevel(cfd, max_level_with_files, target_level); } - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); { MutexLock l(&mutex_); @@ -1724,7 +1726,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { // only allow one thread refitting if (refitting_level_) { mutex_.Unlock(); - Log(options_.info_log, "ReFitLevel: another thread is refitting"); + Log(db_options_.info_log, "ReFitLevel: another thread is refitting"); delete new_superversion; return Status::NotSupported("another thread is refitting"); } @@ -1733,7 +1735,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { // wait for all background threads to stop bg_work_gate_closed_ = true; while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_) { - Log(options_.info_log, + Log(db_options_.info_log, "RefitLevel: waiting for background threads to stop: %d %d", bg_compaction_scheduled_, bg_flush_scheduled_); bg_cv_.Wait(); @@ -1749,8 +1751,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { Status status; if (to_level < level) { - Log(options_.info_log, "[%s] Before refitting:\n%s", cfd->GetName().c_str(), - cfd->current()->DebugString().data()); + Log(db_options_.info_log, "[%s] Before refitting:\n%s", + cfd->GetName().c_str(), cfd->current()->DebugString().data()); VersionEdit edit; edit.SetColumnFamily(cfd->GetID()); @@ -1760,18 +1762,18 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { f->fd.GetFileSize(), f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); } - Log(options_.info_log, "[%s] Apply version edit:\n%s", + Log(db_options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), edit.DebugString().data()); status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get()); superversion_to_free = cfd->InstallSuperVersion(new_superversion, &mutex_); new_superversion = nullptr; - Log(options_.info_log, "[%s] LogAndApply: %s\n", cfd->GetName().c_str(), + Log(db_options_.info_log, "[%s] LogAndApply: %s\n", cfd->GetName().c_str(), status.ToString().data()); if (status.ok()) { - Log(options_.info_log, "[%s] After refitting:\n%s", + Log(db_options_.info_log, "[%s] After refitting:\n%s", cfd->GetName().c_str(), cfd->current()->DebugString().data()); } } @@ -1861,14 +1863,14 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, ++bg_manual_only_; while (bg_compaction_scheduled_ > 0) { - Log(options_.info_log, + Log(db_options_.info_log, "[%s] Manual compaction waiting for all other scheduled background " "compactions to finish", cfd->GetName().c_str()); bg_cv_.Wait(); } - Log(options_.info_log, "[%s] Manual compaction starting", + Log(db_options_.info_log, "[%s] Manual compaction starting", cfd->GetName().c_str()); while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) { @@ -1956,10 +1958,10 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { } if (is_flush_pending) { // memtable flush needed - if (bg_flush_scheduled_ < options_.max_background_flushes) { + if (bg_flush_scheduled_ < db_options_.max_background_flushes) { bg_flush_scheduled_++; env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH); - } else if (options_.max_background_flushes > 0) { + } else if (db_options_.max_background_flushes > 0) { bg_schedule_needed_ = true; } } @@ -1978,8 +1980,8 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { // bg_manual_only_ == 0 if (!bg_manual_only_ && (is_compaction_needed || - (is_flush_pending && options_.max_background_flushes == 0))) { - if (bg_compaction_scheduled_ < options_.max_background_compactions) { + (is_flush_pending && db_options_.max_background_flushes == 0))) { + if (bg_compaction_scheduled_ < db_options_.max_background_compactions) { bg_compaction_scheduled_++; env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); } else { @@ -2029,7 +2031,7 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, "BackgroundCallFlush doing FlushMemTableToOutputFile with column " "family [%s], flush slots available %d", cfd->GetName().c_str(), - options_.max_background_flushes - bg_flush_scheduled_); + db_options_.max_background_flushes - bg_flush_scheduled_); flush_status = FlushMemTableToOutputFile(cfd, madeProgress, deletion_state, log_buffer); } @@ -2047,7 +2049,7 @@ void DBImpl::BackgroundCallFlush() { DeletionState deletion_state(true); assert(bg_flush_scheduled_); - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_.info_log.get()); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); { MutexLock l(&mutex_); @@ -2063,12 +2065,12 @@ void DBImpl::BackgroundCallFlush() { default_cf_internal_stats_->BumpAndGetBackgroundErrorCount(); bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); - Log(options_.info_log, + Log(db_options_.info_log, "Waiting after background flush error: %s" "Accumulated background error counts: %" PRIu64, s.ToString().c_str(), error_cnt); log_buffer.FlushBufferToLog(); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); env_->SleepForMicroseconds(1000000); mutex_.Lock(); } @@ -2114,7 +2116,7 @@ void DBImpl::BackgroundCallCompaction() { DeletionState deletion_state(true); MaybeDumpStats(); - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_.info_log.get()); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); { MutexLock l(&mutex_); assert(bg_compaction_scheduled_); @@ -2131,11 +2133,11 @@ void DBImpl::BackgroundCallCompaction() { bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); log_buffer.FlushBufferToLog(); - Log(options_.info_log, + Log(db_options_.info_log, "Waiting after background compaction error: %s, " "Accumulated background error counts: %" PRIu64, s.ToString().c_str(), error_cnt); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); env_->SleepForMicroseconds(1000000); mutex_.Lock(); } @@ -2219,7 +2221,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, log_buffer, "BackgroundCompaction doing FlushMemTableToOutputFile, " "compaction slots available %d", - options_.max_background_compactions - bg_compaction_scheduled_); + db_options_.max_background_compactions - bg_compaction_scheduled_); cfd->Ref(); flush_stat = FlushMemTableToOutputFile(cfd, madeProgress, deletion_state, log_buffer); @@ -2331,9 +2333,9 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, } else if (status.IsShutdownInProgress()) { // Ignore compaction errors found during shutting down } else { - Log(InfoLogLevel::WARN_LEVEL, options_.info_log, "Compaction error: %s", + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "Compaction error: %s", status.ToString().c_str()); - if (options_.paranoid_checks && bg_error_.ok()) { + if (db_options_.paranoid_checks && bg_error_.ok()) { bg_error_ = status; } } @@ -2445,9 +2447,9 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { compact->outputs.push_back(out); // Make the output file - std::string fname = TableFileName(options_.db_paths, file_number, + std::string fname = TableFileName(db_options_.db_paths, file_number, compact->compaction->GetOutputPathId()); - Status s = env_->NewWritableFile(fname, &compact->outfile, storage_options_); + Status s = env_->NewWritableFile(fname, &compact->outfile, env_options_); if (s.ok()) { compact->outfile->SetIOPriority(Env::IO_LOW); @@ -2456,10 +2458,11 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { ColumnFamilyData* cfd = compact->compaction->column_family_data(); compact->builder.reset(NewTableBuilder( - *cfd->options(), cfd->internal_comparator(), compact->outfile.get(), - compact->compaction->OutputCompressionType())); + *cfd->ioptions(), cfd->internal_comparator(), compact->outfile.get(), + compact->compaction->OutputCompressionType(), + cfd->options()->compression_opts)); } - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); return s; } @@ -2487,8 +2490,8 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, compact->builder.reset(); // Finish and check for file errors - if (s.ok() && !options_.disableDataSync) { - if (options_.use_fsync) { + if (s.ok() && !db_options_.disableDataSync) { + if (db_options_.use_fsync) { StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); s = compact->outfile->Fsync(); } else { @@ -2506,11 +2509,11 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, ColumnFamilyData* cfd = compact->compaction->column_family_data(); FileDescriptor fd(output_number, output_path_id, current_bytes); Iterator* iter = cfd->table_cache()->NewIterator( - ReadOptions(), storage_options_, cfd->internal_comparator(), fd); + ReadOptions(), env_options_, cfd->internal_comparator(), fd); s = iter->status(); delete iter; if (s.ok()) { - Log(options_.info_log, "[%s] Generated table #%" PRIu64 ": %" PRIu64 + Log(db_options_.info_log, "[%s] Generated table #%" PRIu64 ": %" PRIu64 " keys, %" PRIu64 " bytes", cfd->GetName().c_str(), output_number, current_entries, current_bytes); @@ -2529,7 +2532,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact, // This ensures that a concurrent compaction did not erroneously // pick the same files to compact. if (!versions_->VerifyCompactionFileConsistency(compact->compaction)) { - Log(options_.info_log, "[%s] Compaction %d@%d + %d@%d files aborted", + Log(db_options_.info_log, "[%s] Compaction %d@%d + %d@%d files aborted", compact->compaction->column_family_data()->GetName().c_str(), compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), @@ -2578,7 +2581,7 @@ inline SequenceNumber DBImpl::findEarliestVisibleSnapshot( prev = cur; // assignment assert(prev); } - Log(options_.info_log, + Log(db_options_.info_log, "Looking for seqid %" PRIu64 " but maxseqid is %" PRIu64 "", in, snapshots[snapshots.size() - 1]); assert(0); @@ -2588,7 +2591,7 @@ inline SequenceNumber DBImpl::findEarliestVisibleSnapshot( uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, DeletionState& deletion_state, LogBuffer* log_buffer) { - if (options_.max_background_flushes > 0) { + if (db_options_.max_background_flushes > 0) { // flush thread will take care of this return 0; } @@ -2633,7 +2636,7 @@ Status DBImpl::ProcessKeyValueCompaction( ColumnFamilyData* cfd = compact->compaction->column_family_data(); MergeHelper merge( cfd->user_comparator(), cfd->options()->merge_operator.get(), - options_.info_log.get(), cfd->options()->min_partial_merge_operands, + db_options_.info_log.get(), cfd->options()->min_partial_merge_operands, false /* internal key corruption is expected */); auto compaction_filter = cfd->options()->compaction_filter; std::unique_ptr compaction_filter_from_factory = nullptr; @@ -2800,7 +2803,7 @@ Status DBImpl::ProcessKeyValueCompaction( // optimization in BuildTable. int steps = 0; merge.MergeUntil(input, prev_snapshot, bottommost_level, - options_.statistics.get(), &steps); + db_options_.statistics.get(), &steps); // Skip the Merge ops combined_idx = combined_idx - 1 + steps; @@ -3027,7 +3030,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, cfd->GetName().c_str(), compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), compact->compaction->output_level(), compact->compaction->score(), - options_.max_background_compactions - bg_compaction_scheduled_); + db_options_.max_background_compactions - bg_compaction_scheduled_); char scratch[2345]; compact->compaction->Summary(scratch, sizeof(scratch)); LogToBuffer(log_buffer, "[%s] Compaction start summary: %s\n", @@ -3103,7 +3106,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, if (!ParseInternalKey(key, &ikey)) { // log error - Log(options_.info_log, "[%s] Failed to parse key: %s", + Log(db_options_.info_log, "[%s] Failed to parse key: %s", cfd->GetName().c_str(), key.ToString().c_str()); continue; } else { @@ -3244,7 +3247,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, } input.reset(); - if (!options_.disableDataSync) { + if (!db_options_.disableDataSync) { db_directory_->Fsync(); } @@ -3276,7 +3279,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, RecordCompactionIOStats(); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); mutex_.Lock(); cfd->internal_stats()->AddCompactionStats( compact->compaction->output_level(), stats); @@ -3346,31 +3349,18 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, SuperVersion* super_version, Arena* arena) { Iterator* internal_iter; - if (arena != nullptr) { - // Need to create internal iterator from the arena. - MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena); - // Collect iterator for mutable mem - merge_iter_builder.AddIterator( - super_version->mem->NewIterator(options, arena)); - // Collect all needed child iterators for immutable memtables - super_version->imm->AddIterators(options, &merge_iter_builder); - // Collect iterators for files in L0 - Ln - super_version->current->AddIterators(options, storage_options_, - &merge_iter_builder); - internal_iter = merge_iter_builder.Finish(); - } else { - // Need to create internal iterator using malloc. - std::vector iterator_list; - // Collect iterator for mutable mem - iterator_list.push_back(super_version->mem->NewIterator(options)); - // Collect all needed child iterators for immutable memtables - super_version->imm->AddIterators(options, &iterator_list); - // Collect iterators for files in L0 - Ln - super_version->current->AddIterators(options, storage_options_, - &iterator_list); - internal_iter = NewMergingIterator(&cfd->internal_comparator(), - &iterator_list[0], iterator_list.size()); - } + assert(arena != nullptr); + // Need to create internal iterator from the arena. + MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena); + // Collect iterator for mutable mem + merge_iter_builder.AddIterator( + super_version->mem->NewIterator(options, arena)); + // Collect all needed child iterators for immutable memtables + super_version->imm->AddIterators(options, &merge_iter_builder); + // Collect iterators for files in L0 - Ln + super_version->current->AddIterators(options, env_options_, + &merge_iter_builder); + internal_iter = merge_iter_builder.Finish(); IterState* cleanup = new IterState(this, &mutex_, super_version); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); @@ -3601,12 +3591,12 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options, assert(cfd != nullptr); delete cfd->InstallSuperVersion(new SuperVersion(), &mutex_); *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); - Log(options_.info_log, "Created column family [%s] (ID %u)", + Log(db_options_.info_log, "Created column family [%s] (ID %u)", column_family_name.c_str(), (unsigned)cfd->GetID()); max_total_in_memory_state_ += cfd->options()->write_buffer_size * cfd->options()->max_write_buffer_number; } else { - Log(options_.info_log, "Creating column family [%s] FAILED -- %s", + Log(db_options_.info_log, "Creating column family [%s] FAILED -- %s", column_family_name.c_str(), s.ToString().c_str()); } return s; @@ -3623,6 +3613,14 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { edit.DropColumnFamily(); edit.SetColumnFamily(cfd->GetID()); + Writer w(&mutex_); + w.batch = nullptr; + w.sync = false; + w.disableWAL = false; + w.in_batch_group = false; + w.done = false; + w.timeout_hint_us = kNoTimeOut; + Status s; { MutexLock l(&mutex_); @@ -3630,7 +3628,11 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { s = Status::InvalidArgument("Column family already dropped!\n"); } if (s.ok()) { + // we drop column family from a single write thread + s = BeginWrite(&w, 0); + assert(s.ok() && !w.done); // No timeout and nobody should do our job s = versions_->LogAndApply(cfd, &edit, &mutex_); + EndWrite(&w, &w, s); } } @@ -3638,9 +3640,11 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { assert(cfd->IsDropped()); max_total_in_memory_state_ -= cfd->options()->write_buffer_size * cfd->options()->max_write_buffer_number; - Log(options_.info_log, "Dropped column family with id %u\n", cfd->GetID()); + Log(db_options_.info_log, "Dropped column family with id %u\n", + cfd->GetID()); } else { - Log(options_.info_log, "Dropping column family with id %u FAILED -- %s\n", + Log(db_options_.info_log, + "Dropping column family with id %u FAILED -- %s\n", cfd->GetID(), s.ToString().c_str()); } @@ -3677,7 +3681,7 @@ Iterator* DBImpl::NewIterator(const ReadOptions& options, // TODO(ljin): remove tailing iterator auto iter = new ForwardIterator(this, options, cfd); return NewDBIterator(env_, *cfd->options(), cfd->user_comparator(), iter, - kMaxSequenceNumber); + kMaxSequenceNumber, options.iterate_upper_bound); // return new TailingIterator(env_, this, options, cfd); #endif } else { @@ -3733,7 +3737,9 @@ Iterator* DBImpl::NewIterator(const ReadOptions& options, // likely that any iterator pointer is close to the iterator it points to so // that they are likely to be in the same cache line and/or page. ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, *cfd->options(), cfd->user_comparator(), snapshot); + env_, *cfd->options(), cfd->user_comparator(), + snapshot, options.iterate_upper_bound); + Iterator* internal_iter = NewInternalIterator(options, cfd, sv, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); @@ -3785,10 +3791,12 @@ Status DBImpl::NewIterators( ? reinterpret_cast(options.snapshot)->number_ : latest_snapshot; - auto iter = NewInternalIterator(options, cfd, super_versions[i]); - iter = NewDBIterator(env_, *cfd->options(), - cfd->user_comparator(), iter, snapshot); - iterators->push_back(iter); + ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( + env_, *cfd->options(), cfd->user_comparator(), snapshot); + Iterator* internal_iter = NewInternalIterator( + options, cfd, super_versions[i], db_iter->GetArena()); + db_iter->SetIterUnderDBIter(internal_iter); + iterators->push_back(db_iter); } } @@ -3966,15 +3974,15 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1); uint64_t flush_column_family_if_log_file = 0; - uint64_t max_total_wal_size = (options_.max_total_wal_size == 0) + uint64_t max_total_wal_size = (db_options_.max_total_wal_size == 0) ? 4 * max_total_in_memory_state_ - : options_.max_total_wal_size; + : db_options_.max_total_wal_size; if (UNLIKELY(!single_column_family_mode_) && alive_log_files_.begin()->getting_flushed == false && total_log_size_ > max_total_wal_size) { flush_column_family_if_log_file = alive_log_files_.begin()->number; alive_log_files_.begin()->getting_flushed = true; - Log(options_.info_log, + Log(db_options_.info_log, "Flushing all column families with data in WAL number %" PRIu64 ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64, flush_column_family_if_log_file, total_log_size_, max_total_wal_size); @@ -4060,7 +4068,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { RecordTick(stats_, WAL_FILE_SYNCED); RecordTick(stats_, WAL_FILE_BYTES, log_size); if (status.ok() && options.sync) { - if (options_.use_fsync) { + if (db_options_.use_fsync) { StopWatch(env_, stats_, WAL_FILE_SYNC_MICROS); status = log_->file()->Fsync(); } else { @@ -4104,7 +4112,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { } } } - if (options_.paranoid_checks && !status.ok() && + if (db_options_.paranoid_checks && !status.ok() && !status.IsTimedOut() && bg_error_.ok()) { bg_error_ = status; // stop compaction & fail any further writes } @@ -4163,15 +4171,19 @@ void DBImpl::BuildBatchGroup(Writer** last_writer, break; } - if (w->batch != nullptr) { - size += WriteBatchInternal::ByteSize(w->batch); - if (size > max_size) { - // Do not make batch too big - break; - } + if (w->batch == nullptr) { + // Do not include those writes with nullptr batch. Those are not writes, + // those are something else. They want to be alone + break; + } - write_batch_group->push_back(w->batch); + size += WriteBatchInternal::ByteSize(w->batch); + if (size > max_size) { + // Do not make batch too big + break; } + + write_batch_group->push_back(w->batch); w->in_batch_group = true; *last_writer = w; } @@ -4269,7 +4281,7 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, // We have filled up the current memtable, but the previous // ones are still being flushed, so we wait. DelayLoggingAndReset(); - Log(options_.info_log, "[%s] wait for memtable flush...\n", + Log(db_options_.info_log, "[%s] wait for memtable flush...\n", cfd->GetName().c_str()); if (schedule_background_work) { MaybeScheduleFlushOrCompaction(); @@ -4289,7 +4301,7 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, InternalStats::MEMTABLE_COMPACTION, stall); } else if (cfd->NeedWaitForNumLevel0Files()) { DelayLoggingAndReset(); - Log(options_.info_log, "[%s] wait for fewer level0 files...\n", + Log(db_options_.info_log, "[%s] wait for fewer level0 files...\n", cfd->GetName().c_str()); uint64_t stall; { @@ -4373,9 +4385,9 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, { DelayLoggingAndReset(); if (creating_new_log) { - s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number), - &lfile, - env_->OptimizeForLogWrite(storage_options_)); + s = env_->NewWritableFile( + LogFileName(db_options_.wal_dir, new_log_number), + &lfile, env_->OptimizeForLogWrite(env_options_)); if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. @@ -4422,7 +4434,7 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, cfd->imm()->Add(cfd->mem()); new_mem->Ref(); cfd->SetMemtable(new_mem); - Log(options_.info_log, + Log(db_options_.info_log, "[%s] New memtable created with log file: #%" PRIu64 "\n", cfd->GetName().c_str(), logfile_number_); context->superversions_to_free_.push_back( @@ -4527,7 +4539,7 @@ bool DBImpl::GetIntPropertyInternal(ColumnFamilyHandle* column_family, SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) { // TODO(ljin): consider using GetReferencedSuperVersion() directly - if (LIKELY(options_.allow_thread_local)) { + if (LIKELY(db_options_.allow_thread_local)) { return cfd->GetThreadLocalSuperVersion(&mutex_); } else { MutexLock l(&mutex_); @@ -4538,7 +4550,7 @@ SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) { void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv) { bool unref_sv = true; - if (LIKELY(options_.allow_thread_local)) { + if (LIKELY(db_options_.allow_thread_local)) { unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv); } @@ -4585,7 +4597,7 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, inline void DBImpl::DelayLoggingAndReset() { if (delayed_writes_ > 0) { - Log(options_.info_log, "delayed %d write...\n", delayed_writes_ ); + Log(db_options_.info_log, "delayed %d write...\n", delayed_writes_); delayed_writes_ = 0; } } @@ -4612,8 +4624,8 @@ Status DBImpl::GetUpdatesSince( if (!s.ok()) { return s; } - iter->reset(new TransactionLogIteratorImpl(options_.wal_dir, &options_, - read_options, storage_options_, + iter->reset(new TransactionLogIteratorImpl(db_options_.wal_dir, &db_options_, + read_options, env_options_, seq, std::move(wal_files), this)); return (*iter)->status(); } @@ -4624,7 +4636,7 @@ Status DBImpl::DeleteFile(std::string name) { WalFileType log_type; if (!ParseFileName(name, &number, &type, &log_type) || (type != kTableFile && type != kLogFile)) { - Log(options_.info_log, "DeleteFile %s failed.\n", name.c_str()); + Log(db_options_.info_log, "DeleteFile %s failed.\n", name.c_str()); return Status::InvalidArgument("Invalid file name"); } @@ -4632,13 +4644,13 @@ Status DBImpl::DeleteFile(std::string name) { if (type == kLogFile) { // Only allow deleting archived log files if (log_type != kArchivedLogFile) { - Log(options_.info_log, "DeleteFile %s failed - not archived log.\n", + Log(db_options_.info_log, "DeleteFile %s failed - not archived log.\n", name.c_str()); return Status::NotSupported("Delete only supported for archived logs"); } - status = env_->DeleteFile(options_.wal_dir + "/" + name.c_str()); + status = env_->DeleteFile(db_options_.wal_dir + "/" + name.c_str()); if (!status.ok()) { - Log(options_.info_log, "DeleteFile %s failed -- %s.\n", + Log(db_options_.info_log, "DeleteFile %s failed -- %s.\n", name.c_str(), status.ToString().c_str()); } return status; @@ -4653,7 +4665,7 @@ Status DBImpl::DeleteFile(std::string name) { MutexLock l(&mutex_); status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd); if (!status.ok()) { - Log(options_.info_log, "DeleteFile %s failed. File not found\n", + Log(db_options_.info_log, "DeleteFile %s failed. File not found\n", name.c_str()); return Status::InvalidArgument("File not found"); } @@ -4661,7 +4673,7 @@ Status DBImpl::DeleteFile(std::string name) { // If the file is being compacted no need to delete. if (metadata->being_compacted) { - Log(options_.info_log, + Log(db_options_.info_log, "DeleteFile %s Skipped. File about to be compacted\n", name.c_str()); return Status::OK(); } @@ -4671,7 +4683,7 @@ Status DBImpl::DeleteFile(std::string name) { // lost. Check that the level passed is the last level. for (int i = level + 1; i < cfd->NumberLevels(); i++) { if (cfd->current()->NumLevelFiles(i) != 0) { - Log(options_.info_log, + Log(db_options_.info_log, "DeleteFile %s FAILED. File not in last level\n", name.c_str()); return Status::InvalidArgument("File not in last level"); } @@ -4683,7 +4695,7 @@ Status DBImpl::DeleteFile(std::string name) { } FindObsoleteFiles(deletion_state, false); } // lock released here - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); // remove files outside the db-lock if (deletion_state.HaveSomethingToDelete()) { PurgeObsoleteFiles(deletion_state); @@ -4845,9 +4857,9 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } DBImpl* impl = new DBImpl(db_options, dbname); - s = impl->env_->CreateDirIfMissing(impl->options_.wal_dir); + s = impl->env_->CreateDirIfMissing(impl->db_options_.wal_dir); if (s.ok()) { - for (auto db_path : impl->options_.db_paths) { + for (auto db_path : impl->db_options_.db_paths) { s = impl->env_->CreateDirIfMissing(db_path.path); if (!s.ok()) { break; @@ -4872,9 +4884,9 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, uint64_t new_log_number = impl->versions_->NewFileNumber(); unique_ptr lfile; EnvOptions soptions(db_options); - s = impl->options_.env->NewWritableFile( - LogFileName(impl->options_.wal_dir, new_log_number), &lfile, - impl->options_.env->OptimizeForLogWrite(soptions)); + s = impl->db_options_.env->NewWritableFile( + LogFileName(impl->db_options_.wal_dir, new_log_number), &lfile, + impl->db_options_.env->OptimizeForLogWrite(soptions)); if (s.ok()) { lfile->SetPreallocationBlockSize(1.1 * max_write_buffer_size); impl->logfile_number_ = new_log_number; diff --git a/db/db_impl.h b/db/db_impl.h index 086ac9fd4e4..69fe2eaac6b 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -30,6 +30,7 @@ #include "util/autovector.h" #include "util/stop_watch.h" #include "util/thread_local.h" +#include "util/scoped_arena_iterator.h" #include "db/internal_stats.h" namespace rocksdb { @@ -173,8 +174,8 @@ class DBImpl : public DB { // Return an internal iterator over the current state of the database. // The keys of this iterator are internal keys (see format.h). // The returned iterator should be deleted when no longer needed. - Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family = - nullptr); + Iterator* TEST_NewInternalIterator( + Arena* arena, ColumnFamilyHandle* column_family = nullptr); // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. @@ -202,6 +203,17 @@ class DBImpl : public DB { SequenceNumber* sequence); Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence); + + void TEST_LockMutex(); + + void TEST_UnlockMutex(); + + // REQUIRES: mutex locked + void* TEST_BeginWrite(); + + // REQUIRES: mutex locked + // pass the pointer that you got from TEST_BeginWrite() + void TEST_EndWrite(void* w); #endif // NDEBUG // Structure to store information for candidate files to delete. @@ -275,7 +287,7 @@ class DBImpl : public DB { // Returns the list of live files in 'live' and the list // of all files in the filesystem in 'candidate_files'. // If force == false and the last call was less than - // options_.delete_obsolete_files_period_micros microseconds ago, + // db_options_.delete_obsolete_files_period_micros microseconds ago, // it will not fill up the deletion_state void FindObsoleteFiles(DeletionState& deletion_state, bool force, @@ -293,12 +305,11 @@ class DBImpl : public DB { Env* const env_; const std::string dbname_; unique_ptr versions_; - const DBOptions options_; + const DBOptions db_options_; Statistics* stats_; Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd, - SuperVersion* super_version, - Arena* arena = nullptr); + SuperVersion* super_version, Arena* arena); private: friend class DB; @@ -309,7 +320,7 @@ class DBImpl : public DB { #endif friend struct SuperVersion; struct CompactionState; - struct Writer; + struct WriteContext; Status NewDB(); @@ -349,6 +360,20 @@ class DBImpl : public DB { uint64_t SlowdownAmount(int n, double bottom, double top); + // Information kept for every waiting writer + struct Writer { + Status status; + WriteBatch* batch; + bool sync; + bool disableWAL; + bool in_batch_group; + bool done; + uint64_t timeout_hint_us; + port::CondVar cv; + + explicit Writer(port::Mutex* mu) : cv(mu) {} + }; + // Before applying write operation (such as DBImpl::Write, DBImpl::Flush) // thread should grab the mutex_ and be the first on writers queue. // BeginWrite is used for it. @@ -607,7 +632,7 @@ class DBImpl : public DB { int delayed_writes_; // The options to access storage files - const EnvOptions storage_options_; + const EnvOptions env_options_; // A value of true temporarily disables scheduling of background work bool bg_work_gate_closed_; diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index 8df66f6c6d9..5f7a4818d09 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -20,7 +20,8 @@ uint64_t DBImpl::TEST_GetLevel0TotalSize() { return default_cf_handle_->cfd()->current()->NumLevelBytes(0); } -Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) { +Iterator* DBImpl::TEST_NewInternalIterator(Arena* arena, + ColumnFamilyHandle* column_family) { ColumnFamilyData* cfd; if (column_family == nullptr) { cfd = default_cf_handle_->cfd(); @@ -33,7 +34,7 @@ Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) { SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); mutex_.Unlock(); ReadOptions roptions; - return NewInternalIterator(roptions, cfd, super_version); + return NewInternalIterator(roptions, cfd, super_version, arena); } int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes( @@ -129,5 +130,32 @@ Status DBImpl::TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence) { return ReadFirstLine(fname, sequence); } + +void DBImpl::TEST_LockMutex() { + mutex_.Lock(); +} + +void DBImpl::TEST_UnlockMutex() { + mutex_.Unlock(); +} + +void* DBImpl::TEST_BeginWrite() { + auto w = new Writer(&mutex_); + w->batch = nullptr; + w->sync = false; + w->disableWAL = false; + w->in_batch_group = false; + w->done = false; + w->timeout_hint_us = kNoTimeOut; + Status s = BeginWrite(w, 0); + assert(s.ok() && !w->done); // No timeout and nobody should do our job + return reinterpret_cast(w); +} + +void DBImpl::TEST_EndWrite(void* w) { + auto writer = reinterpret_cast(w); + EndWrite(writer, writer, Status::OK()); +} + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 6c864aefd80..db0718bd1f8 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -44,7 +44,7 @@ namespace rocksdb { DBImplReadOnly::DBImplReadOnly(const DBOptions& options, const std::string& dbname) : DBImpl(options, dbname) { - Log(options_.info_log, "Opening the db in read only mode"); + Log(db_options_.info_log, "Opening the db in read only mode"); } DBImplReadOnly::~DBImplReadOnly() { diff --git a/db/db_iter.cc b/db/db_iter.cc index 599a56a99c0..bfdcd4edb8c 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -59,7 +59,8 @@ class DBIter: public Iterator { }; DBIter(Env* env, const Options& options, const Comparator* cmp, - Iterator* iter, SequenceNumber s, bool arena_mode) + Iterator* iter, SequenceNumber s, bool arena_mode, + const Slice* iterate_upper_bound = nullptr) : arena_mode_(arena_mode), env_(env), logger_(options.info_log.get()), @@ -70,9 +71,10 @@ class DBIter: public Iterator { direction_(kForward), valid_(false), current_entry_is_merged_(false), - statistics_(options.statistics.get()) { + statistics_(options.statistics.get()), + iterate_upper_bound_(iterate_upper_bound) { RecordTick(statistics_, NO_ITERATORS); - has_prefix_extractor_ = (options.prefix_extractor.get() != nullptr); + prefix_extractor_ = options.prefix_extractor.get(); max_skip_ = options.max_sequential_skip_in_iterations; } virtual ~DBIter() { @@ -132,7 +134,7 @@ class DBIter: public Iterator { } } - bool has_prefix_extractor_; + const SliceTransform* prefix_extractor_; bool arena_mode_; Env* const env_; Logger* logger_; @@ -149,6 +151,7 @@ class DBIter: public Iterator { bool current_entry_is_merged_; Statistics* statistics_; uint64_t max_skip_; + const Slice* iterate_upper_bound_; // No copying allowed DBIter(const DBIter&); @@ -207,36 +210,44 @@ void DBIter::FindNextUserEntryInternal(bool skipping) { uint64_t num_skipped = 0; do { ParsedInternalKey ikey; - if (ParseKey(&ikey) && ikey.sequence <= sequence_) { - if (skipping && - user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) { - num_skipped++; // skip this entry - PERF_COUNTER_ADD(internal_key_skipped_count, 1); - } else { - skipping = false; - switch (ikey.type) { - case kTypeDeletion: - // Arrange to skip all upcoming entries for this key since - // they are hidden by this deletion. - saved_key_.SetKey(ikey.user_key); - skipping = true; - num_skipped = 0; - PERF_COUNTER_ADD(internal_delete_skipped_count, 1); - break; - case kTypeValue: - valid_ = true; - saved_key_.SetKey(ikey.user_key); - return; - case kTypeMerge: - // By now, we are sure the current ikey is going to yield a value - saved_key_.SetKey(ikey.user_key); - current_entry_is_merged_ = true; - valid_ = true; - MergeValuesNewToOld(); // Go to a different state machine - return; - default: - assert(false); - break; + + if (ParseKey(&ikey)) { + if (iterate_upper_bound_ != nullptr && + ikey.user_key.compare(*iterate_upper_bound_) >= 0) { + break; + } + + if (ikey.sequence <= sequence_) { + if (skipping && + user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) { + num_skipped++; // skip this entry + PERF_COUNTER_ADD(internal_key_skipped_count, 1); + } else { + skipping = false; + switch (ikey.type) { + case kTypeDeletion: + // Arrange to skip all upcoming entries for this key since + // they are hidden by this deletion. + saved_key_.SetKey(ikey.user_key); + skipping = true; + num_skipped = 0; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + break; + case kTypeValue: + valid_ = true; + saved_key_.SetKey(ikey.user_key); + return; + case kTypeMerge: + // By now, we are sure the current ikey is going to yield a value + saved_key_.SetKey(ikey.user_key); + current_entry_is_merged_ = true; + valid_ = true; + MergeValuesNewToOld(); // Go to a different state machine + return; + default: + assert(false); + break; + } } } } @@ -398,6 +409,7 @@ bool DBIter::FindValueForCurrentKey() { case kTypeDeletion: operands.clear(); last_not_merge_type = kTypeDeletion; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); break; case kTypeMerge: assert(user_merge_operator_ != nullptr); @@ -407,6 +419,7 @@ bool DBIter::FindValueForCurrentKey() { assert(false); } + PERF_COUNTER_ADD(internal_key_skipped_count, 1); assert(user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0); iter_->Prev(); ++num_skipped; @@ -553,6 +566,20 @@ void DBIter::FindParseableKey(ParsedInternalKey* ikey, Direction direction) { void DBIter::Seek(const Slice& target) { StopWatch sw(env_, statistics_, DB_SEEK); + // total ordering is not guaranteed if prefix_extractor is set + // hence prefix based seeks will not give correct results + if (iterate_upper_bound_ != nullptr && prefix_extractor_ != nullptr) { + if (!prefix_extractor_->InDomain(*iterate_upper_bound_) || + !prefix_extractor_->InDomain(target) || + prefix_extractor_->Transform(*iterate_upper_bound_).compare( + prefix_extractor_->Transform(target)) != 0) { + status_ = Status::InvalidArgument("read_options.iterate_*_bound " + " and seek target need to have the same prefix."); + valid_ = false; + return; + } + } + saved_key_.Clear(); // now savved_key is used to store internal key. saved_key_.SetInternalKey(target, sequence_); @@ -574,7 +601,7 @@ void DBIter::Seek(const Slice& target) { void DBIter::SeekToFirst() { // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek wiil be used. - if (has_prefix_extractor_) { + if (prefix_extractor_ != nullptr) { max_skip_ = std::numeric_limits::max(); } direction_ = kForward; @@ -595,7 +622,7 @@ void DBIter::SeekToFirst() { void DBIter::SeekToLast() { // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek wiil be used. - if (has_prefix_extractor_) { + if (prefix_extractor_ != nullptr) { max_skip_ = std::numeric_limits::max(); } direction_ = kReverse; @@ -612,9 +639,10 @@ void DBIter::SeekToLast() { Iterator* NewDBIterator(Env* env, const Options& options, const Comparator* user_key_comparator, Iterator* internal_iter, - const SequenceNumber& sequence) { + const SequenceNumber& sequence, + const Slice* iterate_upper_bound) { return new DBIter(env, options, user_key_comparator, internal_iter, sequence, - false); + false, iterate_upper_bound); } ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); } @@ -643,13 +671,16 @@ void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1, ArenaWrappedDBIter* NewArenaWrappedDbIterator( Env* env, const Options& options, const Comparator* user_key_comparator, - const SequenceNumber& sequence) { + const SequenceNumber& sequence, + const Slice* iterate_upper_bound) { ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); Arena* arena = iter->GetArena(); auto mem = arena->AllocateAligned(sizeof(DBIter)); - DBIter* db_iter = new (mem) - DBIter(env, options, user_key_comparator, nullptr, sequence, true); + DBIter* db_iter = new (mem) DBIter(env, options, user_key_comparator, + nullptr, sequence, true, iterate_upper_bound); + iter->SetDBIter(db_iter); + return iter; } diff --git a/db/db_iter.h b/db/db_iter.h index cb9840324ff..ffea34fa951 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -27,7 +27,8 @@ extern Iterator* NewDBIterator( const Options& options, const Comparator *user_key_comparator, Iterator* internal_iter, - const SequenceNumber& sequence); + const SequenceNumber& sequence, + const Slice* iterate_upper_bound = nullptr); // A wrapper iterator which wraps DB Iterator and the arena, with which the DB // iterator is supposed be allocated. This class is used as an entry point of @@ -68,6 +69,6 @@ class ArenaWrappedDBIter : public Iterator { // Generate the arena wrapped iterator class. extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( Env* env, const Options& options, const Comparator* user_key_comparator, - const SequenceNumber& sequence); + const SequenceNumber& sequence, const Slice* iterate_upper_bound = nullptr); } // namespace rocksdb diff --git a/db/db_test.cc b/db/db_test.cc index 6295f5921ee..5b913f43c12 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,7 @@ #include "util/rate_limiter.h" #include "util/statistics.h" #include "util/testharness.h" +#include "util/scoped_arena_iterator.h" #include "util/sync_point.h" #include "util/testutil.h" @@ -755,11 +757,12 @@ class DBTest { } std::string AllEntriesFor(const Slice& user_key, int cf = 0) { - Iterator* iter; + ScopedArenaIterator iter; + Arena arena; if (cf == 0) { - iter = dbfull()->TEST_NewInternalIterator(); + iter.set(dbfull()->TEST_NewInternalIterator(&arena)); } else { - iter = dbfull()->TEST_NewInternalIterator(handles_[cf]); + iter.set(dbfull()->TEST_NewInternalIterator(&arena, handles_[cf])); } InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); iter->Seek(target.Encode()); @@ -804,7 +807,6 @@ class DBTest { } result += "]"; } - delete iter; return result; } @@ -1042,11 +1044,12 @@ class DBTest { // Utility method to test InplaceUpdate void validateNumberOfEntries(int numValues, int cf = 0) { - Iterator* iter; + ScopedArenaIterator iter; + Arena arena; if (cf != 0) { - iter = dbfull()->TEST_NewInternalIterator(handles_[cf]); + iter.set(dbfull()->TEST_NewInternalIterator(&arena, handles_[cf])); } else { - iter = dbfull()->TEST_NewInternalIterator(); + iter.set(dbfull()->TEST_NewInternalIterator(&arena)); } iter->SeekToFirst(); ASSERT_EQ(iter->status().ok(), true); @@ -1060,7 +1063,6 @@ class DBTest { ASSERT_EQ(ikey.sequence, (unsigned)seq--); iter->Next(); } - delete iter; ASSERT_EQ(0, seq); } @@ -4210,22 +4212,25 @@ TEST(DBTest, CompactionFilter) { // TODO: figure out sequence number squashtoo int count = 0; int total = 0; - Iterator* iter = dbfull()->TEST_NewInternalIterator(handles_[1]); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - total++; - if (ikey.sequence != 0) { - count++; + Arena arena; + { + ScopedArenaIterator iter( + dbfull()->TEST_NewInternalIterator(&arena, handles_[1])); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; + } + iter->Next(); } - iter->Next(); } ASSERT_EQ(total, 100000); ASSERT_EQ(count, 1); - delete iter; // overwrite all the 100K keys once again. for (int i = 0; i < 100000; i++) { @@ -4280,7 +4285,7 @@ TEST(DBTest, CompactionFilter) { ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); // Scan the entire database to ensure that nothing is left - iter = db_->NewIterator(ReadOptions(), handles_[1]); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); count = 0; while (iter->Valid()) { @@ -4296,18 +4301,20 @@ TEST(DBTest, CompactionFilter) { // TODO: remove the following or design a different // test count = 0; - iter = dbfull()->TEST_NewInternalIterator(handles_[1]); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - ASSERT_NE(ikey.sequence, (unsigned)0); - count++; - iter->Next(); + { + ScopedArenaIterator iter( + dbfull()->TEST_NewInternalIterator(&arena, handles_[1])); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + ASSERT_NE(ikey.sequence, (unsigned)0); + count++; + iter->Next(); + } + ASSERT_EQ(count, 0); } - ASSERT_EQ(count, 0); - delete iter; } // Tests the edge case where compaction does not produce any output -- all @@ -4429,22 +4436,24 @@ TEST(DBTest, CompactionFilterContextManual) { // Verify total number of keys is correct after manual compaction. int count = 0; int total = 0; - Iterator* iter = dbfull()->TEST_NewInternalIterator(); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - total++; - if (ikey.sequence != 0) { - count++; + { + Arena arena; + ScopedArenaIterator iter(dbfull()->TEST_NewInternalIterator(&arena)); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; + } + iter->Next(); } - iter->Next(); + ASSERT_EQ(total, 700); + ASSERT_EQ(count, 1); } - ASSERT_EQ(total, 700); - ASSERT_EQ(count, 1); - delete iter; } class KeepFilterV2 : public CompactionFilterV2 { @@ -4601,25 +4610,27 @@ TEST(DBTest, CompactionFilterV2) { // All the files are in the lowest level. int count = 0; int total = 0; - Iterator* iter = dbfull()->TEST_NewInternalIterator(); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - total++; - if (ikey.sequence != 0) { - count++; + { + Arena arena; + ScopedArenaIterator iter(dbfull()->TEST_NewInternalIterator(&arena)); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; + } + iter->Next(); } - iter->Next(); } ASSERT_EQ(total, 100000); // 1 snapshot only. Since we are using universal compacton, // the sequence no is cleared for better compression ASSERT_EQ(count, 1); - delete iter; // create a new database with the compaction // filter in such a way that it deletes all keys @@ -4643,7 +4654,7 @@ TEST(DBTest, CompactionFilterV2) { ASSERT_EQ(NumTableFilesAtLevel(1), 0); // Scan the entire database to ensure that nothing is left - iter = db_->NewIterator(ReadOptions()); + Iterator* iter = db_->NewIterator(ReadOptions()); iter->SeekToFirst(); count = 0; while (iter->Valid()) { @@ -7743,6 +7754,167 @@ TEST(DBTest, TableOptionsSanitizeTest) { ASSERT_TRUE(TryReopen(&options).IsNotSupported()); } +TEST(DBTest, DBIteratorBoundTest) { + Options options; + options.env = env_; + options.create_if_missing = true; + + options.prefix_extractor = nullptr; + DestroyAndReopen(&options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("g1", "0")); + + // testing basic case with no iterate_upper_bound and no prefix_extractor + { + ReadOptions ro; + ro.iterate_upper_bound = nullptr; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("foo"); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo1")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("g1")), 0); + } + + // testing iterate_upper_bound and forward iterator + // to make sure it stops at bound + { + ReadOptions ro; + // iterate_upper_bound points beyond the last expected entry + Slice prefix("foo2"); + ro.iterate_upper_bound = &prefix; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("foo"); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(("foo1")), 0); + + iter->Next(); + // should stop here... + ASSERT_TRUE(!iter->Valid()); + } + + // prefix is the first letter of the key + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + + DestroyAndReopen(&options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("g1", "0")); + + // testing with iterate_upper_bound and prefix_extractor + // Seek target and iterate_upper_bound are not is same prefix + // This should be an error + { + ReadOptions ro; + Slice prefix("g1"); + ro.iterate_upper_bound = &prefix; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("foo"); + + ASSERT_TRUE(!iter->Valid()); + ASSERT_TRUE(iter->status().IsInvalidArgument()); + } + + // testing that iterate_upper_bound prevents iterating over deleted items + // if the bound has already reached + { + options.prefix_extractor = nullptr; + DestroyAndReopen(&options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("b", "0")); + ASSERT_OK(Put("b1", "0")); + ASSERT_OK(Put("c", "0")); + ASSERT_OK(Put("d", "0")); + ASSERT_OK(Put("e", "0")); + ASSERT_OK(Delete("c")); + ASSERT_OK(Delete("d")); + + // base case with no bound + ReadOptions ro; + ro.iterate_upper_bound = nullptr; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("b")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(("b1")), 0); + + perf_context.Reset(); + iter->Next(); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 2); + + // now testing with iterate_bound + Slice prefix("c"); + ro.iterate_upper_bound = &prefix; + + iter.reset(db_->NewIterator(ro)); + + perf_context.Reset(); + + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("b")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(("b1")), 0); + + iter->Next(); + // the iteration should stop as soon as the the bound key is reached + // even though the key is deleted + // hence internal_delete_skipped_count should be 0 + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 0); + } +} + +TEST(DBTest, WriteSingleThreadEntry) { + std::vector threads; + dbfull()->TEST_LockMutex(); + auto w = dbfull()->TEST_BeginWrite(); + threads.emplace_back([&] { Put("a", "b"); }); + env_->SleepForMicroseconds(10000); + threads.emplace_back([&] { Flush(); }); + env_->SleepForMicroseconds(10000); + dbfull()->TEST_UnlockMutex(); + dbfull()->TEST_LockMutex(); + dbfull()->TEST_EndWrite(w); + dbfull()->TEST_UnlockMutex(); + + for (auto& t : threads) { + t.join(); + } +} + + + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/dbformat.h b/db/dbformat.h index eb5d8ed534b..516a4693b33 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -244,7 +244,7 @@ class IterKey { Slice GetKey() const { return Slice(key_, key_size_); } - const size_t Size() { return key_size_; } + size_t Size() { return key_size_; } void Clear() { key_size_ = 0; } diff --git a/db/filename.cc b/db/filename.cc index 42c7efb781b..a8f6852968c 100644 --- a/db/filename.cc +++ b/db/filename.cc @@ -6,7 +6,10 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include "db/filename.h" #include diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 74e6dd24924..6b78c4037bb 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -132,9 +132,11 @@ ForwardIterator::~ForwardIterator() { } void ForwardIterator::Cleanup() { - delete mutable_iter_; + if (mutable_iter_ != nullptr) { + mutable_iter_->~Iterator(); + } for (auto* m : imm_iters_) { - delete m; + m->~Iterator(); } imm_iters_.clear(); for (auto* f : l0_iters_) { @@ -401,8 +403,8 @@ void ForwardIterator::RebuildIterators() { Cleanup(); // New sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_)); - mutable_iter_ = sv_->mem->NewIterator(read_options_); - sv_->imm->AddIterators(read_options_, &imm_iters_); + mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_); + sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_); const auto& l0_files = sv_->current->files_[0]; l0_iters_.reserve(l0_files.size()); for (const auto* l0 : l0_files) { diff --git a/db/forward_iterator.h b/db/forward_iterator.h index bbf423a507b..653a0ac0cae 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -14,6 +14,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "db/dbformat.h" +#include "util/arena.h" namespace rocksdb { @@ -100,6 +101,7 @@ class ForwardIterator : public Iterator { IterKey prev_key_; bool is_prev_set_; + Arena arena_; }; } // namespace rocksdb diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 3142d13b301..c9f9306e296 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -7,7 +7,11 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/internal_stats.h" + +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include "db/column_family.h" diff --git a/db/memtable.cc b/db/memtable.cc index e9e7051c75f..e102575a4f9 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -249,13 +249,9 @@ class MemTableIterator: public Iterator { }; Iterator* MemTable::NewIterator(const ReadOptions& options, Arena* arena) { - if (arena == nullptr) { - return new MemTableIterator(*this, options, nullptr); - } else { - auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); - return new (mem) - MemTableIterator(*this, options, arena); - } + assert(arena != nullptr); + auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); + return new (mem) MemTableIterator(*this, options, arena); } port::RWMutex* MemTable::GetLock(const Slice& key) { diff --git a/db/memtable.h b/db/memtable.h index 8bc281c6cf9..2723f30d815 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -81,8 +81,7 @@ class MemTable { // arena: If not null, the arena needs to be used to allocate the Iterator. // Calling ~Iterator of the iterator will destroy all the states but // those allocated in arena. - Iterator* NewIterator(const ReadOptions& options, - Arena* arena = nullptr); + Iterator* NewIterator(const ReadOptions& options, Arena* arena); // Add an entry into memtable that maps key to value at the // specified sequence number and with the specified type. diff --git a/db/memtable_list.cc b/db/memtable_list.cc index d3fc1356b27..418aae23091 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -73,9 +73,10 @@ bool MemTableListVersion::Get(const LookupKey& key, std::string* value, } void MemTableListVersion::AddIterators(const ReadOptions& options, - std::vector* iterator_list) { + std::vector* iterator_list, + Arena* arena) { for (auto& m : memlist_) { - iterator_list->push_back(m->NewIterator(options)); + iterator_list->push_back(m->NewIterator(options, arena)); } } diff --git a/db/memtable_list.h b/db/memtable_list.h index f4923e831a4..997834e7869 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -49,7 +49,7 @@ class MemTableListVersion { MergeContext& merge_context, const Options& options); void AddIterators(const ReadOptions& options, - std::vector* iterator_list); + std::vector* iterator_list, Arena* arena); void AddIterators(const ReadOptions& options, MergeIteratorBuilder* merge_iter_builder); diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index bb0f96f1584..1750d265c29 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -192,16 +192,17 @@ extern const uint64_t kPlainTableMagicNumber; class TestPlainTableReader : public PlainTableReader { public: - TestPlainTableReader(const EnvOptions& storage_options, + TestPlainTableReader(const EnvOptions& env_options, const InternalKeyComparator& icomparator, EncodingType encoding_type, uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, const TableProperties* table_properties, unique_ptr&& file, - const Options& options, bool* expect_bloom_not_match, + const ImmutableCFOptions& ioptions, + bool* expect_bloom_not_match, bool store_index_in_file) - : PlainTableReader(options, std::move(file), storage_options, icomparator, + : PlainTableReader(ioptions, std::move(file), env_options, icomparator, encoding_type, file_size, table_properties), expect_bloom_not_match_(expect_bloom_not_match) { Status s = MmapDataFile(); @@ -218,7 +219,7 @@ class TestPlainTableReader : public PlainTableReader { PlainTablePropertyNames::kBloomVersion); ASSERT_TRUE(bloom_version_ptr != props->user_collected_properties.end()); ASSERT_EQ(bloom_version_ptr->second, std::string("1")); - if (options.bloom_locality > 0) { + if (ioptions.bloom_locality > 0) { auto num_blocks_ptr = props->user_collected_properties.find( PlainTablePropertyNames::kNumBloomBlocks); ASSERT_TRUE(num_blocks_ptr != props->user_collected_properties.end()); @@ -253,25 +254,26 @@ class TestPlainTableFactory : public PlainTableFactory { store_index_in_file_(options.store_index_in_file), expect_bloom_not_match_(expect_bloom_not_match) {} - Status NewTableReader(const Options& options, const EnvOptions& soptions, + Status NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const override { TableProperties* props = nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - options.env, options.info_log.get(), &props); + ioptions.env, ioptions.info_log, &props); ASSERT_TRUE(s.ok()); if (store_index_in_file_) { BlockHandle bloom_block_handle; s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber, - options.env, BloomBlockBuilder::kBloomBlock, + ioptions.env, BloomBlockBuilder::kBloomBlock, &bloom_block_handle); ASSERT_TRUE(s.ok()); BlockHandle index_block_handle; s = FindMetaBlock( - file.get(), file_size, kPlainTableMagicNumber, options.env, + file.get(), file_size, kPlainTableMagicNumber, ioptions.env, PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle); ASSERT_TRUE(s.ok()); } @@ -284,9 +286,9 @@ class TestPlainTableFactory : public PlainTableFactory { DecodeFixed32(encoding_type_prop->second.c_str())); std::unique_ptr new_reader(new TestPlainTableReader( - soptions, internal_comparator, encoding_type, file_size, + env_options, internal_comparator, encoding_type, file_size, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props, - std::move(file), options, expect_bloom_not_match_, + std::move(file), ioptions, expect_bloom_not_match_, store_index_in_file_)); *table = std::move(new_reader); diff --git a/db/repair.cc b/db/repair.cc index 820cc19243f..ea6cdd6420e 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -31,7 +31,10 @@ #ifndef ROCKSDB_LITE +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include "db/builder.h" #include "db/db_impl.h" @@ -46,6 +49,9 @@ #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" +#include "util/scoped_arena_iterator.h" namespace rocksdb { @@ -58,6 +64,7 @@ class Repairer { env_(options.env), icmp_(options.comparator), options_(SanitizeOptions(dbname, &icmp_, options)), + ioptions_(options_), raw_table_cache_( // TableCache can be small since we expect each table to be opened // once. @@ -65,7 +72,7 @@ class Repairer { options_.table_cache_remove_scan_count_limit)), next_file_number_(1) { table_cache_ = - new TableCache(&options_, storage_options_, raw_table_cache_.get()); + new TableCache(ioptions_, env_options_, raw_table_cache_.get()); edit_ = new VersionEdit(); } @@ -107,8 +114,9 @@ class Repairer { std::string const dbname_; Env* const env_; - InternalKeyComparator const icmp_; - Options const options_; + const InternalKeyComparator icmp_; + const Options options_; + const ImmutableCFOptions ioptions_; std::shared_ptr raw_table_cache_; TableCache* table_cache_; VersionEdit* edit_; @@ -118,7 +126,7 @@ class Repairer { std::vector logs_; std::vector tables_; uint64_t next_file_number_; - const EnvOptions storage_options_; + const EnvOptions env_options_; Status FindFiles() { std::vector filenames; @@ -190,7 +198,7 @@ class Repairer { // Open the log file std::string logname = LogFileName(dbname_, log); unique_ptr lfile; - Status status = env_->NewSequentialFile(logname, &lfile, storage_options_); + Status status = env_->NewSequentialFile(logname, &lfile, env_options_); if (!status.ok()) { return status; } @@ -236,12 +244,15 @@ class Repairer { // since ExtractMetaData() will also generate edits. FileMetaData meta; meta.fd = FileDescriptor(next_file_number_++, 0, 0); - ReadOptions ro; - ro.total_order_seek = true; - Iterator* iter = mem->NewIterator(ro); - status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_, - iter, &meta, icmp_, 0, 0, kNoCompression); - delete iter; + { + ReadOptions ro; + ro.total_order_seek = true; + Arena arena; + ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); + status = BuildTable(dbname_, env_, ioptions_, env_options_, table_cache_, + iter.get(), &meta, icmp_, 0, 0, kNoCompression, + CompressionOptions()); + } delete mem->Unref(); delete cf_mems_default; mem = nullptr; @@ -286,7 +297,7 @@ class Repairer { file_size); if (status.ok()) { Iterator* iter = table_cache_->NewIterator( - ReadOptions(), storage_options_, icmp_, t->meta.fd); + ReadOptions(), env_options_, icmp_, t->meta.fd); bool empty = true; ParsedInternalKey parsed; t->min_sequence = 0; @@ -326,7 +337,7 @@ class Repairer { std::string tmp = TempFileName(dbname_, 1); unique_ptr file; Status status = env_->NewWritableFile( - tmp, &file, env_->OptimizeForManifestWrite(storage_options_)); + tmp, &file, env_->OptimizeForManifestWrite(env_options_)); if (!status.ok()) { return status; } diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc index e88485070e5..0a0ecf06485 100644 --- a/db/simple_table_db_test.cc +++ b/db/simple_table_db_test.cc @@ -79,7 +79,8 @@ class SimpleTableReader: public TableReader { // for the duration of the returned table's lifetime. // // *file must remain live while this Table is in use. - static Status Open(const Options& options, const EnvOptions& soptions, + static Status Open(const ImmutableCFOptions& options, + const EnvOptions& env_options, unique_ptr && file, uint64_t file_size, unique_ptr* table_reader); @@ -160,14 +161,14 @@ class SimpleTableIterator: public Iterator { struct SimpleTableReader::Rep { ~Rep() { } - Rep(const EnvOptions& storage_options, uint64_t index_start_offset, - int num_entries) : - soptions(storage_options), index_start_offset(index_start_offset), - num_entries(num_entries) { + Rep(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + uint64_t index_start_offset, int num_entries) : + ioptions(ioptions), env_options(env_options), + index_start_offset(index_start_offset), num_entries(num_entries) { } - Options options; - const EnvOptions& soptions; + const ImmutableCFOptions& ioptions; + const EnvOptions& env_options; Status status; unique_ptr file; uint64_t index_start_offset; @@ -187,8 +188,8 @@ SimpleTableReader::~SimpleTableReader() { delete rep_; } -Status SimpleTableReader::Open(const Options& options, - const EnvOptions& soptions, +Status SimpleTableReader::Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, unique_ptr && file, uint64_t size, unique_ptr* table_reader) { @@ -201,12 +202,10 @@ Status SimpleTableReader::Open(const Options& options, int num_entries = (size - Rep::offset_length - index_start_offset) / (Rep::GetInternalKeyLength() + Rep::offset_length); - SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions, - index_start_offset, - num_entries); + SimpleTableReader::Rep* rep = new SimpleTableReader::Rep( + ioptions, env_options, index_start_offset, num_entries); rep->file = std::move(file); - rep->options = options; table_reader->reset(new SimpleTableReader(rep)); } return s; @@ -248,7 +247,7 @@ Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) { return s; } - InternalKeyComparator ikc(rep_->options.comparator); + InternalKeyComparator ikc(rep_->ioptions.comparator); int compare_result = ikc.Compare(tmp_slice, target); if (compare_result < 0) { @@ -382,7 +381,7 @@ void SimpleTableIterator::Prev() { } Slice SimpleTableIterator::key() const { - Log(table_->rep_->options.info_log, "key!!!!"); + Log(table_->rep_->ioptions.info_log, "key!!!!"); return key_; } @@ -401,7 +400,7 @@ class SimpleTableBuilder: public TableBuilder { // caller to close the file after calling Finish(). The output file // will be part of level specified by 'level'. A value of -1 means // that the caller does not know which level the output file will reside. - SimpleTableBuilder(const Options& options, WritableFile* file, + SimpleTableBuilder(const ImmutableCFOptions& ioptions, WritableFile* file, CompressionType compression_type); // REQUIRES: Either Finish() or Abandon() has been called. @@ -444,7 +443,7 @@ class SimpleTableBuilder: public TableBuilder { }; struct SimpleTableBuilder::Rep { - Options options; + const ImmutableCFOptions& ioptions; WritableFile* file; uint64_t offset = 0; Status status; @@ -463,17 +462,17 @@ struct SimpleTableBuilder::Rep { std::string index; - Rep(const Options& opt, WritableFile* f) : - options(opt), file(f) { + Rep(const ImmutableCFOptions& iopt, WritableFile* f) : + ioptions(iopt), file(f) { } ~Rep() { } }; -SimpleTableBuilder::SimpleTableBuilder(const Options& options, +SimpleTableBuilder::SimpleTableBuilder(const ImmutableCFOptions& ioptions, WritableFile* file, CompressionType compression_type) : - rep_(new SimpleTableBuilder::Rep(options, file)) { + rep_(new SimpleTableBuilder::Rep(ioptions, file)) { } SimpleTableBuilder::~SimpleTableBuilder() { @@ -546,15 +545,18 @@ class SimpleTableFactory: public TableFactory { const char* Name() const override { return "SimpleTable"; } - Status NewTableReader(const Options& options, const EnvOptions& soptions, + Status NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_key, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const; - TableBuilder* NewTableBuilder(const Options& options, - const InternalKeyComparator& internal_key, - WritableFile* file, - CompressionType compression_type) const; + TableBuilder* NewTableBuilder( + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_key, + WritableFile* file, + const CompressionType compression_type, + const CompressionOptions& compression_opts) const; virtual Status SanitizeDBOptions(const DBOptions* db_opts) const override { return Status::OK(); @@ -566,19 +568,22 @@ class SimpleTableFactory: public TableFactory { }; Status SimpleTableFactory::NewTableReader( - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_key, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { - return SimpleTableReader::Open(options, soptions, std::move(file), file_size, - table_reader); + return SimpleTableReader::Open(ioptions, env_options, std::move(file), + file_size, table_reader); } TableBuilder* SimpleTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_key, - WritableFile* file, CompressionType compression_type) const { - return new SimpleTableBuilder(options, file, compression_type); + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_key, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts) const { + return new SimpleTableBuilder(ioptions, file, compression_type); } class SimpleTableDBTest { diff --git a/db/snapshot.h b/db/snapshot.h index 2c2e3eac803..51fa556c825 100644 --- a/db/snapshot.h +++ b/db/snapshot.h @@ -71,7 +71,7 @@ class SnapshotList { } // get the sequence number of the most recent snapshot - const SequenceNumber GetNewest() { + SequenceNumber GetNewest() { if (empty()) { return 0; } diff --git a/db/table_cache.cc b/db/table_cache.cc index c362499a685..5cb96f8bf52 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -36,12 +36,10 @@ static Slice GetSliceForFileNumber(const uint64_t* file_number) { sizeof(*file_number)); } -TableCache::TableCache(const Options* options, - const EnvOptions& storage_options, Cache* const cache) - : env_(options->env), - db_paths_(options->db_paths), - options_(options), - storage_options_(storage_options), +TableCache::TableCache(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, Cache* const cache) + : ioptions_(ioptions), + env_options_(env_options), cache_(cache) {} TableCache::~TableCache() { @@ -55,7 +53,7 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) { cache_->Release(handle); } -Status TableCache::FindTable(const EnvOptions& toptions, +Status TableCache::FindTable(const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, Cache::Handle** handle, const bool no_io) { @@ -68,24 +66,24 @@ Status TableCache::FindTable(const EnvOptions& toptions, return Status::Incomplete("Table not found in table_cache, no_io is set"); } std::string fname = - TableFileName(db_paths_, fd.GetNumber(), fd.GetPathId()); + TableFileName(ioptions_.db_paths, fd.GetNumber(), fd.GetPathId()); unique_ptr file; unique_ptr table_reader; - s = env_->NewRandomAccessFile(fname, &file, toptions); - RecordTick(options_->statistics.get(), NO_FILE_OPENS); + s = ioptions_.env->NewRandomAccessFile(fname, &file, env_options); + RecordTick(ioptions_.statistics, NO_FILE_OPENS); if (s.ok()) { - if (options_->advise_random_on_open) { + if (ioptions_.advise_random_on_open) { file->Hint(RandomAccessFile::RANDOM); } - StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS); - s = options_->table_factory->NewTableReader( - *options_, toptions, internal_comparator, std::move(file), + StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS); + s = ioptions_.table_factory->NewTableReader( + ioptions_, env_options, internal_comparator, std::move(file), fd.GetFileSize(), &table_reader); } if (!s.ok()) { assert(table_reader == nullptr); - RecordTick(options_->statistics.get(), NO_FILE_ERRORS); + RecordTick(ioptions_.statistics, NO_FILE_ERRORS); // We do not cache error results so that if the error is transient, // or somebody repairs the file, we recover automatically. } else { @@ -97,7 +95,7 @@ Status TableCache::FindTable(const EnvOptions& toptions, } Iterator* TableCache::NewIterator(const ReadOptions& options, - const EnvOptions& toptions, + const EnvOptions& env_options, const InternalKeyComparator& icomparator, const FileDescriptor& fd, TableReader** table_reader_ptr, @@ -109,7 +107,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, Cache::Handle* handle = nullptr; Status s; if (table_reader == nullptr) { - s = FindTable(toptions, icomparator, fd, &handle, + s = FindTable(env_options, icomparator, fd, &handle, options.read_tier == kBlockCacheTier); if (!s.ok()) { return NewErrorIterator(s, arena); @@ -142,7 +140,7 @@ Status TableCache::Get(const ReadOptions& options, Status s; Cache::Handle* handle = nullptr; if (!t) { - s = FindTable(storage_options_, internal_comparator, fd, &handle, + s = FindTable(env_options_, internal_comparator, fd, &handle, options.read_tier == kBlockCacheTier); if (s.ok()) { t = GetTableReaderFromHandle(handle); @@ -160,8 +158,9 @@ Status TableCache::Get(const ReadOptions& options, } return s; } + Status TableCache::GetTableProperties( - const EnvOptions& toptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, std::shared_ptr* properties, bool no_io) { Status s; @@ -174,7 +173,7 @@ Status TableCache::GetTableProperties( } Cache::Handle* table_handle = nullptr; - s = FindTable(toptions, internal_comparator, fd, &table_handle, no_io); + s = FindTable(env_options, internal_comparator, fd, &table_handle, no_io); if (!s.ok()) { return s; } @@ -186,7 +185,7 @@ Status TableCache::GetTableProperties( } size_t TableCache::GetMemoryUsageByTableReader( - const EnvOptions& toptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd) { Status s; @@ -197,7 +196,7 @@ size_t TableCache::GetMemoryUsageByTableReader( } Cache::Handle* table_handle = nullptr; - s = FindTable(toptions, internal_comparator, fd, &table_handle, true); + s = FindTable(env_options, internal_comparator, fd, &table_handle, true); if (!s.ok()) { return 0; } diff --git a/db/table_cache.h b/db/table_cache.h index 79090e0649b..2f6740d9f11 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -19,6 +19,7 @@ #include "rocksdb/cache.h" #include "rocksdb/env.h" #include "rocksdb/table.h" +#include "rocksdb/options.h" #include "table/table_reader.h" namespace rocksdb { @@ -29,8 +30,8 @@ struct FileDescriptor; class TableCache { public: - TableCache(const Options* options, const EnvOptions& storage_options, - Cache* cache); + TableCache(const ImmutableCFOptions& ioptions, + const EnvOptions& storage_options, Cache* cache); ~TableCache(); // Return an iterator for the specified file number (the corresponding @@ -91,10 +92,8 @@ class TableCache { void ReleaseHandle(Cache::Handle* handle); private: - Env* const env_; - const std::vector db_paths_; - const Options* options_; - const EnvOptions& storage_options_; + const ImmutableCFOptions& ioptions_; + const EnvOptions& env_options_; Cache* const cache_; }; diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 638b259f2a5..74abf867098 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -11,6 +11,7 @@ #include "db/dbformat.h" #include "db/table_properties_collector.h" #include "rocksdb/table.h" +#include "rocksdb/immutable_options.h" #include "table/block_based_table_factory.h" #include "table/meta_blocks.h" #include "table/plain_table_factory.h" @@ -85,12 +86,14 @@ class DumbLogger : public Logger { // Utilities test functions namespace { void MakeBuilder(const Options& options, + const ImmutableCFOptions& ioptions, const InternalKeyComparator& internal_comparator, std::unique_ptr* writable, std::unique_ptr* builder) { writable->reset(new FakeWritableFile); - builder->reset(options.table_factory->NewTableBuilder( - options, internal_comparator, writable->get(), options.compression)); + builder->reset(ioptions.table_factory->NewTableBuilder( + ioptions, internal_comparator, writable->get(), + options.compression, options.compression_opts)); } } // namespace @@ -153,7 +156,8 @@ void TestCustomizedTablePropertiesCollector( // -- Step 1: build table std::unique_ptr builder; std::unique_ptr writable; - MakeBuilder(options, internal_comparator, &writable, &builder); + const ImmutableCFOptions ioptions(options); + MakeBuilder(options, ioptions, internal_comparator, &writable, &builder); for (const auto& kv : kvs) { if (encode_as_internal) { @@ -264,9 +268,10 @@ void TestInternalKeyPropertiesCollector( options.table_properties_collector_factories = { std::make_shared()}; } + const ImmutableCFOptions ioptions(options); for (int iter = 0; iter < 2; ++iter) { - MakeBuilder(options, pikc, &writable, &builder); + MakeBuilder(options, ioptions, pikc, &writable, &builder); for (const auto& k : keys) { builder->Add(k.Encode(), "val"); } diff --git a/db/version_edit.h b/db/version_edit.h index 58edfed4515..db133402c95 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -163,13 +163,13 @@ class VersionEdit { // Add the specified file at the specified number. // REQUIRES: This version has not been saved (see VersionSet::SaveTo) // REQUIRES: "smallest" and "largest" are smallest and largest keys in file - void AddFile(int level, uint64_t file, uint64_t file_size, - uint64_t file_path_id, const InternalKey& smallest, + void AddFile(int level, uint64_t file, uint64_t file_path_id, + uint64_t file_size, const InternalKey& smallest, const InternalKey& largest, const SequenceNumber& smallest_seqno, const SequenceNumber& largest_seqno) { assert(smallest_seqno <= largest_seqno); FileMetaData f; - f.fd = FileDescriptor(file, file_size, file_path_id); + f.fd = FileDescriptor(file, file_path_id, file_size); f.smallest = smallest; f.largest = largest; f.smallest_seqno = smallest_seqno; diff --git a/db/version_set.cc b/db/version_set.cc index 3a15458532e..82183a982c3 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -9,7 +9,10 @@ #include "db/version_set.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include @@ -596,31 +599,6 @@ uint64_t Version::GetEstimatedActiveKeys() { return num_non_deletions_ - num_deletions_; } -void Version::AddIterators(const ReadOptions& read_options, - const EnvOptions& soptions, - std::vector* iters) { - // Merge all level zero files together since they may overlap - for (size_t i = 0; i < file_levels_[0].num_files; i++) { - const auto& file = file_levels_[0].files[i]; - iters->push_back(cfd_->table_cache()->NewIterator( - read_options, soptions, cfd_->internal_comparator(), file.fd)); - } - - // For levels > 0, we can use a concatenating iterator that sequentially - // walks through the non-overlapping files in the level, opening them - // lazily. - for (int level = 1; level < num_levels_; level++) { - if (file_levels_[level].num_files != 0) { - iters->push_back(NewTwoLevelIterator(new LevelFileIteratorState( - cfd_->table_cache(), read_options, soptions, - cfd_->internal_comparator(), false /* for_compaction */, - cfd_->options()->prefix_extractor != nullptr), - new LevelFileNumIterator(cfd_->internal_comparator(), - &file_levels_[level]))); - } - } -} - void Version::AddIterators(const ReadOptions& read_options, const EnvOptions& soptions, MergeIteratorBuilder* merge_iter_builder) { diff --git a/db/version_set.h b/db/version_set.h index 2f6d477a1db..e9747f83923 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -86,8 +86,6 @@ class Version { // Append to *iters a sequence of iterators that will // yield the contents of this Version when merged together. // REQUIRES: This version has been saved (see VersionSet::SaveTo) - void AddIterators(const ReadOptions&, const EnvOptions& soptions, - std::vector* iters); void AddIterators(const ReadOptions&, const EnvOptions& soptions, MergeIteratorBuilder* merger_iter_builder); diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 1d30552b31b..aefb01e79ae 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -18,6 +18,7 @@ #include "rocksdb/utilities/write_batch_with_index.h" #include "util/logging.h" #include "util/testharness.h" +#include "util/scoped_arena_iterator.h" namespace rocksdb { @@ -32,7 +33,8 @@ static std::string PrintContents(WriteBatch* b) { ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default); int count = 0; - Iterator* iter = mem->NewIterator(ReadOptions()); + Arena arena; + ScopedArenaIterator iter(mem->NewIterator(ReadOptions(), &arena)); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey ikey; memset((void *)&ikey, 0, sizeof(ikey)); @@ -67,7 +69,6 @@ static std::string PrintContents(WriteBatch* b) { state.append("@"); state.append(NumberToString(ikey.sequence)); } - delete iter; if (!s.ok()) { state.append(s.ToString()); } else if (count != WriteBatchInternal::Count(b)) { diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index c54e6707f05..e4b1bb75365 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -698,6 +698,10 @@ extern void rocksdb_readoptions_set_fill_cache( extern void rocksdb_readoptions_set_snapshot( rocksdb_readoptions_t*, const rocksdb_snapshot_t*); +extern void rocksdb_readoptions_set_iterate_upper_bound( + rocksdb_readoptions_t*, + const char* key, + size_t keylen); extern void rocksdb_readoptions_set_read_tier( rocksdb_readoptions_t*, int); extern void rocksdb_readoptions_set_tailing( diff --git a/include/rocksdb/immutable_options.h b/include/rocksdb/immutable_options.h new file mode 100644 index 00000000000..22084f6f0bb --- /dev/null +++ b/include/rocksdb/immutable_options.h @@ -0,0 +1,62 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include "rocksdb/options.h" + +namespace rocksdb { + +// ImmutableCFOptions is a data struct used by RocksDB internal. It contains a +// subset of Options that should not be changed during the entire lifetime +// of DB. You shouldn't need to access this data structure unless you are +// implementing a new TableFactory. +struct ImmutableCFOptions { + explicit ImmutableCFOptions(const Options& options); + + const SliceTransform* prefix_extractor; + + const Comparator* comparator; + + MergeOperator* merge_operator; + + Logger* info_log; + + Statistics* statistics; + + InfoLogLevel info_log_level; + + Env* env; + + // Allow the OS to mmap file for reading sst tables. Default: false + bool allow_mmap_reads; + + // Allow the OS to mmap file for writing. Default: false + bool allow_mmap_writes; + + std::vector db_paths; + + TableFactory* table_factory; + + Options::TablePropertiesCollectorFactories + table_properties_collector_factories; + + bool advise_random_on_open; + + // This options is required by PlainTableReader. May need to move it + // to PlainTalbeOptions just like bloom_bits_per_key + uint32_t bloom_locality; + + bool purge_redundant_kvs_while_flush; + + uint32_t min_partial_merge_operands; + + bool disable_data_sync; + + bool use_fsync; +}; + +} // namespace rocksdb diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 0ca30334410..fbb3b6ddb4a 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -409,10 +409,24 @@ struct ColumnFamilyOptions { std::shared_ptr memtable_factory; // This is a factory that provides TableFactory objects. - // Default: a factory that provides a default implementation of - // Table and TableBuilder. + // Default: a block-based table factory that provides a default + // implementation of TableBuilder and TableReader with default + // BlockBasedTableOptions. std::shared_ptr table_factory; + // Block-based table related options are moved to BlockBasedTableOptions. + // Related options that were originally here but now moved include: + // no_block_cache + // block_cache + // block_cache_compressed + // block_size + // block_size_deviation + // block_restart_interval + // filter_policy + // whole_key_filtering + // If you'd like to customize some of these options, you will need to + // use NewBlockBasedTableFactory() to construct a new table factory. + // This option allows user to to collect their own interested statistics of // the tables. // Default: empty vector -- no user-defined statistics collection will be @@ -889,6 +903,18 @@ struct ReadOptions { // ! DEPRECATED // const Slice* prefix; + // "iterate_upper_bound" defines the extent upto which the forward iterator + // can returns entries. Once the bound is reached, Valid() will be false. + // "iterate_upper_bound" is exclusive ie the bound value is + // not a valid entry. If iterator_extractor is not null, the Seek target + // and iterator_upper_bound need to have the same prefix. + // This is because ordering is not guaranteed outside of prefix domain. + // There is no lower bound on the iterator. If needed, that can be easily + // implemented + // + // Default: nullptr + const Slice* iterate_upper_bound; + // Specify if this read request should process data that ALREADY // resides on a particular cache. If the required data is not // found at the specified cache, then Status::Incomplete is returned. @@ -912,6 +938,7 @@ struct ReadOptions { : verify_checksums(true), fill_cache(true), snapshot(nullptr), + iterate_upper_bound(nullptr), read_tier(kReadAllTier), tailing(false), total_order_seek(false) {} @@ -919,6 +946,7 @@ struct ReadOptions { : verify_checksums(cksum), fill_cache(cache), snapshot(nullptr), + iterate_upper_bound(nullptr), read_tier(kReadAllTier), tailing(false), total_order_seek(false) {} diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 0f8b41074d7..2fb4f50ddbd 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -23,6 +23,7 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" #include "rocksdb/status.h" namespace rocksdb { @@ -293,14 +294,15 @@ class TableFactory { // and cache the table object returned. // (1) SstFileReader (for SST Dump) opens the table and dump the table // contents using the interator of the table. - // options and soptions are options. options is the general options. + // ImmutableCFOptions is a subset of Options that can not be altered. + // EnvOptions is a subset of Options that will be used by Env. // Multiple configured can be accessed from there, including and not // limited to block cache and key comparators. // file is a file handler to handle the file for the table // file_size is the physical file size of the file // table_reader is the output table reader virtual Status NewTableReader( - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const = 0; @@ -318,14 +320,17 @@ class TableFactory { // (4) When running Repairer, it creates a table builder to convert logs to // SST files (In Repairer::ConvertLogToTable() by calling BuildTable()) // - // options is the general options. Multiple configured can be acceseed from - // there, including and not limited to compression options. - // file is a handle of a writable file. It is the caller's responsibility to - // keep the file open and close the file after closing the table builder. - // compression_type is the compression type to use in this table. + // ImmutableCFOptions is a subset of Options that can not be altered. + // Multiple configured can be acceseed from there, including and not limited + // to compression options. file is a handle of a writable file. + // It is the caller's responsibility to keep the file open and close the file + // after closing the table builder. compression_type is the compression type + // to use in this table. virtual TableBuilder* NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const = 0; + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts) const = 0; // Sanitizes the specified DB Options. // diff --git a/include/rocksdb/utilities/backupable_db.h b/include/rocksdb/utilities/backupable_db.h index 78365769d2e..bf3f919ae0a 100644 --- a/include/rocksdb/utilities/backupable_db.h +++ b/include/rocksdb/utilities/backupable_db.h @@ -10,7 +10,10 @@ #pragma once #ifndef ROCKSDB_LITE +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index e8b2456eeed..9a4eb70fd0e 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -18,6 +18,7 @@ #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "util/logging.h" +#include "util/scoped_arena_iterator.h" #include "util/testharness.h" /* @@ -209,7 +210,9 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( rocksdb::Status s = rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default); int count = 0; - rocksdb::Iterator* iter = mem->NewIterator(rocksdb::ReadOptions()); + Arena arena; + ScopedArenaIterator iter(mem->NewIterator( + rocksdb::ReadOptions(), false /*don't enforce total order*/, &arena)); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { rocksdb::ParsedInternalKey ikey; memset(reinterpret_cast(&ikey), 0, sizeof(ikey)); @@ -244,7 +247,6 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( state.append("@"); state.append(rocksdb::NumberToString(ikey.sequence)); } - delete iter; if (!s.ok()) { state.append(s.ToString()); } else if (count != rocksdb::WriteBatchInternal::Count(b)) { diff --git a/table/adaptive_table_factory.cc b/table/adaptive_table_factory.cc index a259e79d8a5..c693064af9e 100644 --- a/table/adaptive_table_factory.cc +++ b/table/adaptive_table_factory.cc @@ -39,7 +39,7 @@ extern const uint64_t kLegacyBlockBasedTableMagicNumber; extern const uint64_t kCuckooTableMagicNumber; Status AdaptiveTableFactory::NewTableReader( - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& icomp, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const { Footer footer; @@ -50,24 +50,26 @@ Status AdaptiveTableFactory::NewTableReader( if (footer.table_magic_number() == kPlainTableMagicNumber || footer.table_magic_number() == kLegacyPlainTableMagicNumber) { return plain_table_factory_->NewTableReader( - options, soptions, icomp, std::move(file), file_size, table); + ioptions, env_options, icomp, std::move(file), file_size, table); } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber || footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) { return block_based_table_factory_->NewTableReader( - options, soptions, icomp, std::move(file), file_size, table); + ioptions, env_options, icomp, std::move(file), file_size, table); } else if (footer.table_magic_number() == kCuckooTableMagicNumber) { return cuckoo_table_factory_->NewTableReader( - options, soptions, icomp, std::move(file), file_size, table); + ioptions, env_options, icomp, std::move(file), file_size, table); } else { return Status::NotSupported("Unidentified table format"); } } TableBuilder* AdaptiveTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const { - return table_factory_to_write_->NewTableBuilder(options, internal_comparator, - file, compression_type); + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts) const { + return table_factory_to_write_->NewTableBuilder( + ioptions, internal_comparator, file, compression_type, compression_opts); } std::string AdaptiveTableFactory::GetPrintableTableOptions() const { diff --git a/table/adaptive_table_factory.h b/table/adaptive_table_factory.h index f119d97b1cd..f0920db97f0 100644 --- a/table/adaptive_table_factory.h +++ b/table/adaptive_table_factory.h @@ -12,7 +12,6 @@ namespace rocksdb { -struct Options; struct EnvOptions; using std::unique_ptr; @@ -31,16 +30,21 @@ class AdaptiveTableFactory : public TableFactory { std::shared_ptr block_based_table_factory, std::shared_ptr plain_table_factory, std::shared_ptr cuckoo_table_factory); + const char* Name() const override { return "AdaptiveTableFactory"; } - Status NewTableReader(const Options& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table) const override; - TableBuilder* NewTableBuilder(const Options& options, - const InternalKeyComparator& icomparator, - WritableFile* file, - CompressionType compression_type) const - override; + + Status NewTableReader( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const override; + + TableBuilder* NewTableBuilder( + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& icomparator, + WritableFile* file, + const CompressionType compression_type, + const CompressionOptions& compression_opts) const override; // Sanitizes the specified DB Options. Status SanitizeDBOptions(const DBOptions* db_opts) const override { diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index ddfbe74a608..fde3637601c 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -25,7 +25,6 @@ #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" #include "rocksdb/flush_block_policy.h" -#include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/block.h" @@ -385,7 +384,7 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector }; struct BlockBasedTableBuilder::Rep { - const Options options; + const ImmutableCFOptions ioptions; const BlockBasedTableOptions table_options; const InternalKeyComparator& internal_comparator; WritableFile* file; @@ -397,7 +396,8 @@ struct BlockBasedTableBuilder::Rep { std::unique_ptr index_builder; std::string last_key; - CompressionType compression_type; + const CompressionType compression_type; + const CompressionOptions compression_opts; TableProperties props; bool closed = false; // Either Finish() or Abandon() has been called. @@ -413,27 +413,31 @@ struct BlockBasedTableBuilder::Rep { std::vector> table_properties_collectors; - Rep(const Options& opt, const BlockBasedTableOptions& table_opt, + Rep(const ImmutableCFOptions& ioptions, + const BlockBasedTableOptions& table_opt, const InternalKeyComparator& icomparator, - WritableFile* f, CompressionType compression_type) - : options(opt), + WritableFile* f, const CompressionType compression_type, + const CompressionOptions& compression_opts) + : ioptions(ioptions), table_options(table_opt), internal_comparator(icomparator), file(f), data_block(table_options.block_restart_interval), - internal_prefix_transform(options.prefix_extractor.get()), + internal_prefix_transform(ioptions.prefix_extractor), index_builder(CreateIndexBuilder( table_options.index_type, &internal_comparator, &this->internal_prefix_transform)), compression_type(compression_type), + compression_opts(compression_opts), filter_block(table_options.filter_policy == nullptr ? nullptr : - new FilterBlockBuilder(opt, table_options, &internal_comparator)), + new FilterBlockBuilder(ioptions.prefix_extractor, + table_options, &internal_comparator)), flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( table_options, data_block)) { for (auto& collector_factories : - options.table_properties_collector_factories) { + ioptions.table_properties_collector_factories) { table_properties_collectors.emplace_back( collector_factories->CreateTablePropertiesCollector()); } @@ -443,11 +447,13 @@ struct BlockBasedTableBuilder::Rep { }; BlockBasedTableBuilder::BlockBasedTableBuilder( - const Options& options, const BlockBasedTableOptions& table_options, + const ImmutableCFOptions& ioptions, + const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, WritableFile* file, - CompressionType compression_type) - : rep_(new Rep(options, table_options, internal_comparator, - file, compression_type)) { + const CompressionType compression_type, + const CompressionOptions& compression_opts) + : rep_(new Rep(ioptions, table_options, internal_comparator, + file, compression_type, compression_opts)) { if (rep_->filter_block != nullptr) { rep_->filter_block->StartBlock(0); } @@ -502,7 +508,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->index_builder->OnKeyAdded(key); NotifyCollectTableCollectorsOnAdd(key, value, r->table_properties_collectors, - r->options.info_log.get()); + r->ioptions.info_log); } void BlockBasedTableBuilder::Flush() { @@ -540,10 +546,10 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, Slice block_contents; if (raw_block_contents.size() < kCompressionSizeLimit) { block_contents = - CompressBlock(raw_block_contents, r->options.compression_opts, &type, + CompressBlock(raw_block_contents, r->compression_opts, &type, &r->compressed_output); } else { - RecordTick(r->options.statistics.get(), NUMBER_BLOCK_NOT_COMPRESSED); + RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); type = kNoCompression; block_contents = raw_block_contents; } @@ -555,8 +561,7 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, CompressionType type, BlockHandle* handle) { Rep* r = rep_; - StopWatch sw(r->options.env, r->options.statistics.get(), - WRITE_RAW_BLOCK_MICROS); + StopWatch sw(r->ioptions.env, r->ioptions.statistics, WRITE_RAW_BLOCK_MICROS); handle->set_offset(r->offset); handle->set_size(block_contents.size()); r->status = r->file->Append(block_contents); @@ -717,7 +722,7 @@ Status BlockBasedTableBuilder::Finish() { // Add use collected properties NotifyCollectTableCollectorsOnFinish(r->table_properties_collectors, - r->options.info_log.get(), + r->ioptions.info_log, &property_block_builder); BlockHandle properties_block_handle; @@ -776,14 +781,12 @@ Status BlockBasedTableBuilder::Finish() { } } - Log( - r->options.info_log, + Log(r->ioptions.info_log, "Table was constructed:\n" " [basic properties]: %s\n" " [user collected properties]: %s", r->props.ToString().c_str(), - user_collected.c_str() - ); + user_collected.c_str()); } return r->status; diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h index 72a2f207a66..6fde329199f 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based_table_builder.h @@ -28,10 +28,12 @@ class BlockBasedTableBuilder : public TableBuilder { // Create a builder that will store the contents of the table it is // building in *file. Does not close the file. It is up to the // caller to close the file after calling Finish(). - BlockBasedTableBuilder(const Options& options, + BlockBasedTableBuilder(const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type); + WritableFile* file, + const CompressionType compression_type, + const CompressionOptions& compression_opts); // REQUIRES: Either Finish() or Abandon() has been called. ~BlockBasedTableBuilder(); diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index de30fb383eb..b4e2e7d1fec 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -41,21 +41,24 @@ BlockBasedTableFactory::BlockBasedTableFactory( } Status BlockBasedTableFactory::NewTableReader( - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& ioptions, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { - return BlockBasedTable::Open(options, soptions, table_options_, + return BlockBasedTable::Open(ioptions, soptions, table_options_, internal_comparator, std::move(file), file_size, table_reader); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const { + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts) const { auto table_builder = new BlockBasedTableBuilder( - options, table_options_, internal_comparator, file, compression_type); + ioptions, table_options_, internal_comparator, file, + compression_type, compression_opts); return table_builder; } diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index d7045346af0..2dcfda6d484 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -14,13 +14,11 @@ #include #include "rocksdb/flush_block_policy.h" -#include "rocksdb/options.h" #include "rocksdb/table.h" #include "db/dbformat.h" namespace rocksdb { -struct Options; struct EnvOptions; using std::unique_ptr; @@ -35,14 +33,17 @@ class BlockBasedTableFactory : public TableFactory { const char* Name() const override { return "BlockBasedTable"; } - Status NewTableReader(const Options& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader) const override; + Status NewTableReader( + const ImmutableCFOptions& ioptions, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const override; TableBuilder* NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const override; + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts) const override; // Sanitizes the specified DB Options. Status SanitizeDBOptions(const DBOptions* db_opts) const override { diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 0be38a1dc33..cf915e10532 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -336,15 +336,16 @@ class HashIndexReader : public IndexReader { struct BlockBasedTable::Rep { - Rep(const EnvOptions& storage_options, + Rep(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const BlockBasedTableOptions& table_opt, const InternalKeyComparator& internal_comparator) - : soptions(storage_options), table_options(table_opt), + : ioptions(ioptions), env_options(env_options), table_options(table_opt), filter_policy(table_opt.filter_policy.get()), internal_comparator(internal_comparator) {} - Options options; - const EnvOptions& soptions; + const ImmutableCFOptions& ioptions; + const EnvOptions& env_options; const BlockBasedTableOptions& table_options; const FilterPolicy* const filter_policy; const InternalKeyComparator& internal_comparator; @@ -446,7 +447,8 @@ void BlockBasedTable::GenerateCachePrefix(Cache* cc, } } -Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, +Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, @@ -461,8 +463,7 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, // We've successfully read the footer and the index block: we're // ready to serve requests. Rep* rep = new BlockBasedTable::Rep( - soptions, table_options, internal_comparator); - rep->options = options; + ioptions, env_options, table_options, internal_comparator); rep->file = std::move(file); rep->footer = footer; rep->index_type = table_options.index_type; @@ -484,7 +485,7 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, TableProperties* table_properties = nullptr; if (s.ok()) { s = ReadProperties(meta_iter->value(), rep->file.get(), rep->footer, - rep->options.env, rep->options.info_log.get(), + rep->ioptions.env, rep->ioptions.info_log, &table_properties); } @@ -492,12 +493,12 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, auto err_msg = "[Warning] Encountered error while reading data from properties " "block " + s.ToString(); - Log(rep->options.info_log, "%s", err_msg.c_str()); + Log(rep->ioptions.info_log, "%s", err_msg.c_str()); } else { rep->table_properties.reset(table_properties); } } else { - Log(WARN_LEVEL, rep->options.info_log, + Log(WARN_LEVEL, rep->ioptions.info_log, "Cannot find Properties block from file."); } @@ -546,7 +547,8 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, } void BlockBasedTable::SetupForCompaction() { - switch (rep_->options.access_hint_on_compaction_start) { + /* + switch (.access_hint_on_compaction_start) { case Options::NONE: break; case Options::NORMAL: @@ -562,6 +564,7 @@ void BlockBasedTable::SetupForCompaction() { assert(false); } compaction_optimized_ = true; + */ } std::shared_ptr BlockBasedTable::GetTableProperties() @@ -596,13 +599,13 @@ Status BlockBasedTable::ReadMetaBlock( ReadOptions(), rep->footer.metaindex_handle(), &meta, - rep->options.env); + rep->ioptions.env); if (!s.ok()) { auto err_msg = "[Warning] Encountered error while reading data from properties" "block " + s.ToString(); - Log(rep->options.info_log, "%s", err_msg.c_str()); + Log(rep->ioptions.info_log, "%s", err_msg.c_str()); } if (!s.ok()) { delete meta; @@ -746,7 +749,7 @@ FilterBlockReader* BlockBasedTable::ReadFilter(const BlockHandle& filter_handle, ReadOptions opt; BlockContents block; if (!ReadBlockContents(rep->file.get(), rep->footer, opt, filter_handle, - &block, rep->options.env, false).ok()) { + &block, rep->ioptions.env, false).ok()) { return nullptr; } @@ -755,7 +758,8 @@ FilterBlockReader* BlockBasedTable::ReadFilter(const BlockHandle& filter_handle, } return new FilterBlockReader( - rep->options, rep->table_options, block.data, block.heap_allocated); + rep->ioptions.prefix_extractor, rep->table_options, + block.data, block.heap_allocated); } BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( @@ -780,7 +784,7 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( cache_key ); - Statistics* statistics = rep_->options.statistics.get(); + Statistics* statistics = rep_->ioptions.statistics; auto cache_handle = GetEntryFromCache(block_cache, key, BLOCK_CACHE_FILTER_MISS, BLOCK_CACHE_FILTER_HIT, statistics); @@ -830,7 +834,7 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options, char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, rep_->footer.index_handle(), cache_key); - Statistics* statistics = rep_->options.statistics.get(); + Statistics* statistics = rep_->ioptions.statistics; auto cache_handle = GetEntryFromCache(block_cache, key, BLOCK_CACHE_INDEX_MISS, BLOCK_CACHE_INDEX_HIT, statistics); @@ -906,7 +910,7 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, // If either block cache is enabled, we'll try to read from it. if (block_cache != nullptr || block_cache_compressed != nullptr) { - Statistics* statistics = rep->options.statistics.get(); + Statistics* statistics = rep->ioptions.statistics; char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; Slice key, /* key to the block cache */ @@ -930,9 +934,9 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, if (block.value == nullptr && !no_io && ro.fill_cache) { Block* raw_block = nullptr; { - StopWatch sw(rep->options.env, statistics, READ_BLOCK_GET_MICROS); + StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS); s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle, - &raw_block, rep->options.env, + &raw_block, rep->ioptions.env, block_cache_compressed == nullptr); } @@ -955,7 +959,7 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, } } s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle, - &block.value, rep->options.env); + &block.value, rep->ioptions.env); } Iterator* iter; @@ -982,7 +986,8 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { public: BlockEntryIteratorState(BlockBasedTable* table, const ReadOptions& read_options) - : TwoLevelIteratorState(table->rep_->options.prefix_extractor != nullptr), + : TwoLevelIteratorState( + table->rep_->ioptions.prefix_extractor != nullptr), table_(table), read_options_(read_options) {} @@ -1020,8 +1025,8 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { return true; } - assert(rep_->options.prefix_extractor != nullptr); - auto prefix = rep_->options.prefix_extractor->Transform( + assert(rep_->ioptions.prefix_extractor != nullptr); + auto prefix = rep_->ioptions.prefix_extractor->Transform( ExtractUserKey(internal_key)); InternalKey internal_key_prefix(prefix, 0, kTypeValue); auto internal_prefix = internal_key_prefix.Encode(); @@ -1072,7 +1077,7 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { filter_entry.Release(rep_->table_options.block_cache.get()); } - Statistics* statistics = rep_->options.statistics.get(); + Statistics* statistics = rep_->ioptions.statistics; RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); if (!may_match) { RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); @@ -1111,7 +1116,7 @@ Status BlockBasedTable::Get( // Not found // TODO: think about interaction with Merge. If a user key cannot // cross one data block, we should be fine. - RecordTick(rep_->options.statistics.get(), BLOOM_FILTER_USEFUL); + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); break; } else { BlockIter biter; @@ -1205,13 +1210,13 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, } auto file = rep_->file.get(); - auto env = rep_->options.env; + auto env = rep_->ioptions.env; auto comparator = &rep_->internal_comparator; const Footer& footer = rep_->footer; if (index_type_on_file == BlockBasedTableOptions::kHashSearch && - rep_->options.prefix_extractor == nullptr) { - Log(rep_->options.info_log, + rep_->ioptions.prefix_extractor == nullptr) { + Log(rep_->ioptions.info_log, "BlockBasedTableOptions::kHashSearch requires " "options.prefix_extractor to be set." " Fall back to binary seach index."); @@ -1232,7 +1237,7 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, if (!s.ok()) { // we simply fall back to binary search in case there is any // problem with prefix hash index loading. - Log(rep_->options.info_log, + Log(rep_->ioptions.info_log, "Unable to read the metaindex block." " Fall back to binary seach index."); return BinarySearchIndexReader::Create( @@ -1244,7 +1249,7 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, // We need to wrap data with internal_prefix_transform to make sure it can // handle prefix correctly. rep_->internal_prefix_transform.reset( - new InternalKeySliceTransform(rep_->options.prefix_extractor.get())); + new InternalKeySliceTransform(rep_->ioptions.prefix_extractor)); return HashIndexReader::Create( rep_->internal_prefix_transform.get(), footer, file, env, comparator, footer.index_handle(), meta_index_iter, index_reader, diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 3ff97dda68f..b5686d2654f 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -14,6 +14,7 @@ #include #include +#include "rocksdb/options.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" @@ -36,7 +37,6 @@ class TableReader; class WritableFile; struct BlockBasedTableOptions; struct EnvOptions; -struct Options; struct ReadOptions; using std::unique_ptr; @@ -58,7 +58,8 @@ class BlockBasedTable : public TableReader { // to nullptr and returns a non-ok status. // // *file must remain live while this Table is in use. - static Status Open(const Options& db_options, const EnvOptions& env_options, + static Status Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_key_comparator, unique_ptr&& file, uint64_t file_size, diff --git a/table/block_prefix_index.cc b/table/block_prefix_index.cc index f06dcd9fe76..d64b73b9849 100644 --- a/table/block_prefix_index.cc +++ b/table/block_prefix_index.cc @@ -210,8 +210,8 @@ Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor, return s; } -const uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, - uint32_t** blocks) { +uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, + uint32_t** blocks) { Slice prefix = internal_prefix_extractor_->Transform(key); uint32_t bucket = PrefixToBucket(prefix, num_buckets_); diff --git a/table/block_prefix_index.h b/table/block_prefix_index.h index 2afecadd268..662bc09aaeb 100644 --- a/table/block_prefix_index.h +++ b/table/block_prefix_index.h @@ -23,7 +23,7 @@ class BlockPrefixIndex { // the key, based on the prefix. // Returns the total number of relevant blocks, 0 means the key does // not exist. - const uint32_t GetBlocks(const Slice& key, uint32_t** blocks); + uint32_t GetBlocks(const Slice& key, uint32_t** blocks); size_t ApproximateMemoryUsage() const { return sizeof(BlockPrefixIndex) + diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index 6326d37876b..e107071f2ab 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -56,7 +56,6 @@ CuckooTableBuilder::CuckooTableBuilder( ucomp_(user_comparator), get_slice_hash_(get_slice_hash), closed_(false) { - properties_.num_entries = 0; // Data is in a huge block. properties_.num_data_blocks = 1; properties_.index_size = 0; @@ -64,7 +63,7 @@ CuckooTableBuilder::CuckooTableBuilder( } void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { - if (properties_.num_entries >= kMaxVectorIdx - 1) { + if (kvs_.size() >= kMaxVectorIdx - 1) { status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1"); return; } @@ -311,7 +310,7 @@ uint64_t CuckooTableBuilder::NumEntries() const { uint64_t CuckooTableBuilder::FileSize() const { if (closed_) { return file_->GetFileSize(); - } else if (properties_.num_entries == 0) { + } else if (kvs_.size() == 0) { return 0; } diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo_table_builder_test.cc index 69647d410ec..be13dc9a36f 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo_table_builder_test.cc @@ -135,6 +135,7 @@ TEST(CuckooBuilderTest, SuccessWithEmptyFile) { CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, 4, 100, BytewiseComparator(), 1, GetSliceHash); ASSERT_OK(builder.status()); + ASSERT_EQ(0UL, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); CheckFileContents({}, {}, {}, "", 0, 2, false); @@ -155,6 +156,7 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/NoCollisionFullKey"; @@ -167,10 +169,12 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -192,6 +196,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionFullKey"; @@ -204,10 +209,12 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -229,6 +236,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; uint32_t cuckoo_block_size = 2; @@ -242,10 +250,12 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -272,6 +282,7 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathFullKey"; @@ -284,10 +295,12 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKey) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -311,6 +324,7 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathFullKeyAndCuckooBlock"; @@ -323,10 +337,12 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -344,6 +360,7 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { {user_keys[3], {3, 4, 5, 6}} }; std::vector expected_locations = {0, 1, 2, 3}; + uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/NoCollisionUserKey"; @@ -356,10 +373,12 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = user_keys[0].size() + values[0].size(); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); std::string expected_unused_bucket = "key00"; expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(user_keys, values, expected_locations, @@ -377,6 +396,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { {user_keys[3], {0, 1, 2, 3}}, }; std::vector expected_locations = {0, 1, 2, 3}; + uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionUserKey"; @@ -389,10 +409,12 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = user_keys[0].size() + values[0].size(); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); std::string expected_unused_bucket = "key00"; expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(user_keys, values, expected_locations, @@ -412,6 +434,7 @@ TEST(CuckooBuilderTest, WithCollisionPathUserKey) { {user_keys[4], {0, 2}}, }; std::vector expected_locations = {0, 1, 3, 4, 2}; + uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathUserKey"; @@ -424,10 +447,12 @@ TEST(CuckooBuilderTest, WithCollisionPathUserKey) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = user_keys[0].size() + values[0].size(); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); std::string expected_unused_bucket = "key00"; expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(user_keys, values, expected_locations, diff --git a/table/cuckoo_table_factory.cc b/table/cuckoo_table_factory.cc index e2cc6fd891c..5727a91c00c 100644 --- a/table/cuckoo_table_factory.cc +++ b/table/cuckoo_table_factory.cc @@ -11,11 +11,12 @@ #include "table/cuckoo_table_reader.h" namespace rocksdb { -Status CuckooTableFactory::NewTableReader(const Options& options, - const EnvOptions& soptions, const InternalKeyComparator& icomp, + +Status CuckooTableFactory::NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& icomp, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table) const { - std::unique_ptr new_reader(new CuckooTableReader(options, + std::unique_ptr new_reader(new CuckooTableReader(ioptions, std::move(file), file_size, icomp.user_comparator(), nullptr)); Status s = new_reader->status(); if (s.ok()) { @@ -25,10 +26,13 @@ Status CuckooTableFactory::NewTableReader(const Options& options, } TableBuilder* CuckooTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const { - return new CuckooTableBuilder(file, hash_table_ratio_, 64, max_search_depth_, - internal_comparator.user_comparator(), cuckoo_block_size_, nullptr); + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType, + const CompressionOptions&) const { + return new CuckooTableBuilder(file, hash_table_ratio_, 64, + max_search_depth_, internal_comparator.user_comparator(), + cuckoo_block_size_, nullptr); } std::string CuckooTableFactory::GetPrintableTableOptions() const { diff --git a/table/cuckoo_table_factory.h b/table/cuckoo_table_factory.h index 5799a7f23fe..2b575dc4522 100644 --- a/table/cuckoo_table_factory.h +++ b/table/cuckoo_table_factory.h @@ -9,6 +9,7 @@ #include #include "rocksdb/table.h" #include "util/murmurhash.h" +#include "rocksdb/options.h" namespace rocksdb { @@ -45,14 +46,14 @@ class CuckooTableFactory : public TableFactory { const char* Name() const override { return "CuckooTable"; } Status NewTableReader( - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const override; - TableBuilder* NewTableBuilder(const Options& options, + TableBuilder* NewTableBuilder(const ImmutableCFOptions& options, const InternalKeyComparator& icomparator, WritableFile* file, - CompressionType compression_type) const override; + const CompressionType, const CompressionOptions&) const override; // Sanitizes the specified DB Options. Status SanitizeDBOptions(const DBOptions* db_opts) const override { diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index f1dcbc3bb63..1fdbc447556 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -29,7 +29,7 @@ namespace { extern const uint64_t kCuckooTableMagicNumber; CuckooTableReader::CuckooTableReader( - const Options& options, + const ImmutableCFOptions& ioptions, std::unique_ptr&& file, uint64_t file_size, const Comparator* comparator, @@ -37,12 +37,12 @@ CuckooTableReader::CuckooTableReader( : file_(std::move(file)), ucomp_(comparator), get_slice_hash_(get_slice_hash) { - if (!options.allow_mmap_reads) { + if (!ioptions.allow_mmap_reads) { status_ = Status::InvalidArgument("File is not mmaped"); } TableProperties* props = nullptr; status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, - options.env, options.info_log.get(), &props); + ioptions.env, ioptions.info_log, &props); if (!status_.ok()) { return; } diff --git a/table/cuckoo_table_reader.h b/table/cuckoo_table_reader.h index 05d5c339787..61e048eb617 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo_table_reader.h @@ -16,6 +16,7 @@ #include "db/dbformat.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" #include "table/table_reader.h" namespace rocksdb { @@ -26,7 +27,7 @@ class TableReader; class CuckooTableReader: public TableReader { public: CuckooTableReader( - const Options& options, + const ImmutableCFOptions& ioptions, std::unique_ptr&& file, uint64_t file_size, const Comparator* user_comparator, @@ -40,7 +41,7 @@ class CuckooTableReader: public TableReader { Status status() const { return status_; } Status Get( - const ReadOptions& readOptions, const Slice& key, void* handle_context, + const ReadOptions& read_options, const Slice& key, void* handle_context, bool (*result_handler)(void* arg, const ParsedInternalKey& k, const Slice& v), void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 63fe0ae5ba1..3138fb9ef93 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -11,7 +11,10 @@ int main() { } #else +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include @@ -121,8 +124,9 @@ class CuckooReaderTest { // Check reader now. std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, + ioptions, std::move(read_file), file_size, ucomp, @@ -147,8 +151,9 @@ class CuckooReaderTest { void CheckIterator(const Comparator* ucomp = BytewiseComparator()) { std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, + ioptions, std::move(read_file), file_size, ucomp, @@ -325,8 +330,9 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { CreateCuckooFileAndCheckReader(); std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, + ioptions, std::move(read_file), file_size, BytewiseComparator(), @@ -433,8 +439,9 @@ void WriteFile(const std::vector& keys, std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, std::move(read_file), file_size, + ioptions, std::move(read_file), file_size, test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); ReadOptions r_options; @@ -460,8 +467,9 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, std::move(read_file), file_size, test::Uint64Comparator(), + ioptions, std::move(read_file), file_size, test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); const UserCollectedProperties user_props = diff --git a/table/filter_block.cc b/table/filter_block.cc index 6b4ff1c10da..30284017b9f 100644 --- a/table/filter_block.cc +++ b/table/filter_block.cc @@ -21,11 +21,11 @@ namespace rocksdb { static const size_t kFilterBaseLg = 11; static const size_t kFilterBase = 1 << kFilterBaseLg; -FilterBlockBuilder::FilterBlockBuilder(const Options& opt, +FilterBlockBuilder::FilterBlockBuilder(const SliceTransform* prefix_extractor, const BlockBasedTableOptions& table_opt, const Comparator* internal_comparator) : policy_(table_opt.filter_policy.get()), - prefix_extractor_(opt.prefix_extractor.get()), + prefix_extractor_(prefix_extractor), whole_key_filtering_(table_opt.whole_key_filtering), comparator_(internal_comparator) {} @@ -126,10 +126,11 @@ void FilterBlockBuilder::GenerateFilter() { } FilterBlockReader::FilterBlockReader( - const Options& opt, const BlockBasedTableOptions& table_opt, + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, const Slice& contents, bool delete_contents_after_use) : policy_(table_opt.filter_policy.get()), - prefix_extractor_(opt.prefix_extractor.get()), + prefix_extractor_(prefix_extractor), whole_key_filtering_(table_opt.whole_key_filtering), data_(nullptr), offset_(nullptr), diff --git a/table/filter_block.h b/table/filter_block.h index 5041393f6b8..efee5ac710f 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -18,7 +18,6 @@ #include #include #include -#include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" @@ -36,7 +35,7 @@ class FilterPolicy; // (StartBlock AddKey*)* Finish class FilterBlockBuilder { public: - explicit FilterBlockBuilder(const Options& opt, + explicit FilterBlockBuilder(const SliceTransform* prefix_extractor, const BlockBasedTableOptions& table_opt, const Comparator* internal_comparator); @@ -71,7 +70,7 @@ class FilterBlockReader { public: // REQUIRES: "contents" and *policy must stay live while *this is live. FilterBlockReader( - const Options& opt, + const SliceTransform* prefix_extractor, const BlockBasedTableOptions& table_opt, const Slice& contents, bool delete_contents_after_use = false); diff --git a/table/filter_block_test.cc b/table/filter_block_test.cc index 95496a82c20..903247e80c2 100644 --- a/table/filter_block_test.cc +++ b/table/filter_block_test.cc @@ -45,26 +45,26 @@ class TestHashFilter : public FilterPolicy { class FilterBlockTest { public: - Options options_; + const Comparator* comparator_; BlockBasedTableOptions table_options_; - FilterBlockTest() { - options_ = Options(); + FilterBlockTest() + : comparator_(BytewiseComparator()) { table_options_.filter_policy.reset(new TestHashFilter()); } }; TEST(FilterBlockTest, EmptyBuilder) { - FilterBlockBuilder builder(options_, table_options_, options_.comparator); + FilterBlockBuilder builder(nullptr, table_options_, comparator_); Slice block = builder.Finish(); ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); - FilterBlockReader reader(options_, table_options_, block); + FilterBlockReader reader(nullptr, table_options_, block); ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); ASSERT_TRUE(reader.KeyMayMatch(100000, "foo")); } TEST(FilterBlockTest, SingleChunk) { - FilterBlockBuilder builder(options_, table_options_, options_.comparator); + FilterBlockBuilder builder(nullptr, table_options_, comparator_); builder.StartBlock(100); builder.AddKey("foo"); builder.AddKey("bar"); @@ -74,7 +74,7 @@ TEST(FilterBlockTest, SingleChunk) { builder.StartBlock(300); builder.AddKey("hello"); Slice block = builder.Finish(); - FilterBlockReader reader(options_, table_options_, block); + FilterBlockReader reader(nullptr, table_options_, block); ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); ASSERT_TRUE(reader.KeyMayMatch(100, "bar")); ASSERT_TRUE(reader.KeyMayMatch(100, "box")); @@ -85,7 +85,7 @@ TEST(FilterBlockTest, SingleChunk) { } TEST(FilterBlockTest, MultiChunk) { - FilterBlockBuilder builder(options_, table_options_, options_.comparator); + FilterBlockBuilder builder(nullptr, table_options_, comparator_); // First filter builder.StartBlock(0); @@ -105,7 +105,7 @@ TEST(FilterBlockTest, MultiChunk) { builder.AddKey("hello"); Slice block = builder.Finish(); - FilterBlockReader reader(options_, table_options_, block); + FilterBlockReader reader(nullptr, table_options_, block); // Check first filter ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index 4f3b62ad4eb..49489ed64a9 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -58,24 +58,24 @@ extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull; extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; PlainTableBuilder::PlainTableBuilder( - const Options& options, WritableFile* file, uint32_t user_key_len, - EncodingType encoding_type, size_t index_sparseness, + const ImmutableCFOptions& ioptions, WritableFile* file, + uint32_t user_key_len, EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, uint32_t num_probes, size_t huge_page_tlb_size, double hash_table_ratio, bool store_index_in_file) - : options_(options), + : ioptions_(ioptions), bloom_block_(num_probes), file_(file), bloom_bits_per_key_(bloom_bits_per_key), huge_page_tlb_size_(huge_page_tlb_size), - encoder_(encoding_type, user_key_len, options.prefix_extractor.get(), + encoder_(encoding_type, user_key_len, ioptions.prefix_extractor, index_sparseness), store_index_in_file_(store_index_in_file), - prefix_extractor_(options.prefix_extractor.get()) { + prefix_extractor_(ioptions.prefix_extractor) { // Build index block and save it in the file if hash_table_ratio > 0 if (store_index_in_file_) { assert(hash_table_ratio > 0 || IsTotalOrderMode()); index_builder_.reset( - new PlainTableIndexBuilder(&arena_, options, index_sparseness, + new PlainTableIndexBuilder(&arena_, ioptions, index_sparseness, hash_table_ratio, huge_page_tlb_size_)); assert(bloom_bits_per_key_ > 0); properties_.user_collected_properties @@ -93,10 +93,10 @@ PlainTableBuilder::PlainTableBuilder( // plain encoding. properties_.format_version = (encoding_type == kPlain) ? 0 : 1; - if (options_.prefix_extractor) { + if (ioptions_.prefix_extractor) { properties_.user_collected_properties [PlainTablePropertyNames::kPrefixExtractorName] = - options_.prefix_extractor->Name(); + ioptions_.prefix_extractor->Name(); } std::string val; @@ -105,7 +105,7 @@ PlainTableBuilder::PlainTableBuilder( [PlainTablePropertyNames::kEncodingType] = val; for (auto& collector_factories : - options.table_properties_collector_factories) { + ioptions.table_properties_collector_factories) { table_properties_collectors_.emplace_back( collector_factories->CreateTablePropertiesCollector()); } @@ -124,11 +124,11 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { // Store key hash if (store_index_in_file_) { - if (options_.prefix_extractor.get() == nullptr) { + if (ioptions_.prefix_extractor == nullptr) { keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key)); } else { Slice prefix = - options_.prefix_extractor->Transform(internal_key.user_key); + ioptions_.prefix_extractor->Transform(internal_key.user_key); keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix)); } } @@ -160,7 +160,7 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { // notify property collectors NotifyCollectTableCollectorsOnAdd(key, value, table_properties_collectors_, - options_.info_log.get()); + ioptions_.info_log); } Status PlainTableBuilder::status() const { return status_; } @@ -183,7 +183,8 @@ Status PlainTableBuilder::Finish() { if (store_index_in_file_ && (properties_.num_entries > 0)) { bloom_block_.SetTotalBits( &arena_, properties_.num_entries * bloom_bits_per_key_, - options_.bloom_locality, huge_page_tlb_size_, options_.info_log.get()); + ioptions_.bloom_locality, huge_page_tlb_size_, + ioptions_.info_log); PutVarint32(&properties_.user_collected_properties [PlainTablePropertyNames::kNumBloomBlocks], @@ -224,7 +225,7 @@ Status PlainTableBuilder::Finish() { // -- Add user collected properties NotifyCollectTableCollectorsOnFinish(table_properties_collectors_, - options_.info_log.get(), + ioptions_.info_log, &property_block_builder); // -- Write property block diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h index 2871d887e82..c3af0807279 100644 --- a/table/plain_table_builder.h +++ b/table/plain_table_builder.h @@ -30,7 +30,7 @@ class PlainTableBuilder: public TableBuilder { // caller to close the file after calling Finish(). The output file // will be part of level specified by 'level'. A value of -1 means // that the caller does not know which level the output file will reside. - PlainTableBuilder(const Options& options, WritableFile* file, + PlainTableBuilder(const ImmutableCFOptions& ioptions, WritableFile* file, uint32_t user_key_size, EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, uint32_t num_probes = 6, size_t huge_page_tlb_size = 0, @@ -71,7 +71,7 @@ class PlainTableBuilder: public TableBuilder { private: Arena arena_; - Options options_; + const ImmutableCFOptions& ioptions_; std::vector> table_properties_collectors_; diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index 145179bae1c..de23cc902b0 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -14,22 +14,24 @@ namespace rocksdb { -Status PlainTableFactory::NewTableReader(const Options& options, - const EnvOptions& soptions, +Status PlainTableFactory::NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& icomp, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const { - return PlainTableReader::Open(options, soptions, icomp, std::move(file), + return PlainTableReader::Open(ioptions, env_options, icomp, std::move(file), file_size, table, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, huge_page_tlb_size_, full_scan_mode_); } TableBuilder* PlainTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const { - return new PlainTableBuilder(options, file, user_key_len_, encoding_type_, + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType, + const CompressionOptions&) const { + return new PlainTableBuilder(ioptions, file, user_key_len_, encoding_type_, index_sparseness_, bloom_bits_per_key_, 6, huge_page_tlb_size_, hash_table_ratio_, store_index_in_file_); diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index d1cf0cae61a..54c628c1529 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -14,7 +14,6 @@ namespace rocksdb { -struct Options; struct EnvOptions; using std::unique_ptr; @@ -154,15 +153,17 @@ class PlainTableFactory : public TableFactory { full_scan_mode_(options.full_scan_mode), store_index_in_file_(options.store_index_in_file) {} const char* Name() const override { return "PlainTable"; } - Status NewTableReader(const Options& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table) const override; - TableBuilder* NewTableBuilder(const Options& options, - const InternalKeyComparator& icomparator, - WritableFile* file, - CompressionType compression_type) const - override; + Status NewTableReader( + const ImmutableCFOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const override; + TableBuilder* NewTableBuilder( + const ImmutableCFOptions& options, + const InternalKeyComparator& icomparator, + WritableFile* file, + const CompressionType, + const CompressionOptions&) const override; std::string GetPrintableTableOptions() const override; diff --git a/table/plain_table_index.cc b/table/plain_table_index.cc index efba9b71dac..61f9e335b4d 100644 --- a/table/plain_table_index.cc +++ b/table/plain_table_index.cc @@ -93,7 +93,7 @@ Slice PlainTableIndexBuilder::Finish() { BucketizeIndexes(&hash_to_offsets, &entries_per_bucket); keys_per_prefix_hist_.Add(num_keys_per_prefix_); - Log(options_.info_log, "Number of Keys per prefix Histogram: %s", + Log(ioptions_.info_log, "Number of Keys per prefix Histogram: %s", keys_per_prefix_hist_.ToString().c_str()); // From the temp data structure, populate indexes. @@ -147,11 +147,11 @@ void PlainTableIndexBuilder::BucketizeIndexes( Slice PlainTableIndexBuilder::FillIndexes( const std::vector& hash_to_offsets, const std::vector& entries_per_bucket) { - Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index", + Log(ioptions_.info_log, "Reserving %zu bytes for plain table's sub_index", sub_index_size_); auto total_allocate_size = GetTotalSize(); char* allocated = arena_->AllocateAligned( - total_allocate_size, huge_page_tlb_size_, options_.info_log.get()); + total_allocate_size, huge_page_tlb_size_, ioptions_.info_log); auto temp_ptr = EncodeVarint32(allocated, index_size_); uint32_t* index = @@ -191,7 +191,7 @@ Slice PlainTableIndexBuilder::FillIndexes( } assert(sub_index_offset == sub_index_size_); - Log(options_.info_log, "hash table size: %d, suffix_map length %zu", + Log(ioptions_.info_log, "hash table size: %d, suffix_map length %zu", index_size_, sub_index_size_); return Slice(allocated, GetTotalSize()); } diff --git a/table/plain_table_index.h b/table/plain_table_index.h index f63bbd0d522..0b26ecd0d0e 100644 --- a/table/plain_table_index.h +++ b/table/plain_table_index.h @@ -108,11 +108,11 @@ class PlainTableIndex { // #wiki-in-memory-index-format class PlainTableIndexBuilder { public: - PlainTableIndexBuilder(Arena* arena, const Options& options, + PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, uint32_t index_sparseness, double hash_table_ratio, double huge_page_tlb_size) : arena_(arena), - options_(options), + ioptions_(ioptions), record_list_(kRecordsPerGroup), is_first_record_(true), due_index_(false), @@ -120,7 +120,7 @@ class PlainTableIndexBuilder { num_keys_per_prefix_(0), prev_key_prefix_hash_(0), index_sparseness_(index_sparseness), - prefix_extractor_(options.prefix_extractor.get()), + prefix_extractor_(ioptions.prefix_extractor), hash_table_ratio_(hash_table_ratio), huge_page_tlb_size_(huge_page_tlb_size) {} @@ -196,7 +196,7 @@ class PlainTableIndexBuilder { const std::vector& entries_per_bucket); Arena* arena_; - Options options_; + const ImmutableCFOptions ioptions_; HistogramImpl keys_per_prefix_hist_; IndexRecordList record_list_; bool is_first_record_; diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index b5eccd310c5..3a6d48be82e 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -87,7 +87,7 @@ class PlainTableIterator : public Iterator { }; extern const uint64_t kPlainTableMagicNumber; -PlainTableReader::PlainTableReader(const Options& options, +PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions, unique_ptr&& file, const EnvOptions& storage_options, const InternalKeyComparator& icomparator, @@ -99,10 +99,10 @@ PlainTableReader::PlainTableReader(const Options& options, full_scan_mode_(false), data_end_offset_(table_properties->data_size), user_key_len_(table_properties->fixed_key_len), - prefix_extractor_(options.prefix_extractor.get()), + prefix_extractor_(ioptions.prefix_extractor), enable_bloom_(false), bloom_(6, nullptr), - options_(options), + ioptions_(ioptions), file_(std::move(file)), file_size_(file_size), table_properties_(nullptr) {} @@ -110,8 +110,8 @@ PlainTableReader::PlainTableReader(const Options& options, PlainTableReader::~PlainTableReader() { } -Status PlainTableReader::Open(const Options& options, - const EnvOptions& soptions, +Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, @@ -119,14 +119,14 @@ Status PlainTableReader::Open(const Options& options, const int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, bool full_scan_mode) { - assert(options.allow_mmap_reads); + assert(ioptions.allow_mmap_reads); if (file_size > PlainTableIndex::kMaxFileSize) { return Status::NotSupported("File is too large for PlainTableReader!"); } TableProperties* props = nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - options.env, options.info_log.get(), &props); + ioptions.env, ioptions.info_log, &props); if (!s.ok()) { return s; } @@ -137,12 +137,12 @@ Status PlainTableReader::Open(const Options& options, user_props.find(PlainTablePropertyNames::kPrefixExtractorName); if (!full_scan_mode && prefix_extractor_in_file != user_props.end()) { - if (!options.prefix_extractor) { + if (!ioptions.prefix_extractor) { return Status::InvalidArgument( "Prefix extractor is missing when opening a PlainTable built " "using a prefix extractor"); } else if (prefix_extractor_in_file->second.compare( - options.prefix_extractor->Name()) != 0) { + ioptions.prefix_extractor->Name()) != 0) { return Status::InvalidArgument( "Prefix extractor given doesn't match the one used to build " "PlainTable"); @@ -158,8 +158,8 @@ Status PlainTableReader::Open(const Options& options, } std::unique_ptr new_reader(new PlainTableReader( - options, std::move(file), soptions, internal_comparator, encoding_type, - file_size, props)); + ioptions, std::move(file), env_options, internal_comparator, + encoding_type, file_size, props)); s = new_reader->MmapDataFile(); if (!s.ok()) { @@ -207,7 +207,7 @@ Status PlainTableReader::PopulateIndexRecordList( bool is_first_record = true; Slice key_prefix_slice; PlainTableKeyDecoder decoder(encoding_type_, user_key_len_, - options_.prefix_extractor.get()); + ioptions_.prefix_extractor); while (pos < data_end_offset_) { uint32_t key_offset = pos; ParsedInternalKey key; @@ -252,8 +252,8 @@ void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key, uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key; if (bloom_total_bits > 0) { enable_bloom_ = true; - bloom_.SetTotalBits(&arena_, bloom_total_bits, options_.bloom_locality, - huge_page_tlb_size, options_.info_log.get()); + bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, + huge_page_tlb_size, ioptions_.info_log); FillBloom(prefix_hashes); } } @@ -281,14 +281,14 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, BlockContents bloom_block_contents; auto s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber, - options_.env, BloomBlockBuilder::kBloomBlock, + ioptions_.env, BloomBlockBuilder::kBloomBlock, &bloom_block_contents); bool index_in_file = s.ok(); BlockContents index_block_contents; s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber, - options_.env, PlainTableIndexBuilder::kPlainTableIndexBlock, - &index_block_contents); + ioptions_.env, PlainTableIndexBuilder::kPlainTableIndexBlock, + &index_block_contents); index_in_file &= s.ok(); @@ -310,8 +310,9 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, index_block = nullptr; } - if ((options_.prefix_extractor.get() == nullptr) && (hash_table_ratio != 0)) { - // options.prefix_extractor is requried for a hash-based look-up. + if ((ioptions_.prefix_extractor == nullptr) && + (hash_table_ratio != 0)) { + // ioptions.prefix_extractor is requried for a hash-based look-up. return Status::NotSupported( "PlainTable requires a prefix extractor enable prefix hash mode."); } @@ -328,8 +329,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, table_properties_->num_entries * bloom_bits_per_key; if (num_bloom_bits > 0) { enable_bloom_ = true; - bloom_.SetTotalBits(&arena_, num_bloom_bits, options_.bloom_locality, - huge_page_tlb_size, options_.info_log.get()); + bloom_.SetTotalBits(&arena_, num_bloom_bits, ioptions_.bloom_locality, + huge_page_tlb_size, ioptions_.info_log); } } } else { @@ -351,7 +352,7 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, bloom_block->size() * 8, num_blocks); } - PlainTableIndexBuilder index_builder(&arena_, options_, index_sparseness, + PlainTableIndexBuilder index_builder(&arena_, ioptions_, index_sparseness, hash_table_ratio, huge_page_tlb_size); std::vector prefix_hashes; @@ -422,7 +423,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, uint32_t file_offset = GetFixed32Element(base_ptr, mid); size_t tmp; Status s = PlainTableKeyDecoder(encoding_type_, user_key_len_, - options_.prefix_extractor.get()) + ioptions_.prefix_extractor) .NextKey(file_data_.data() + file_offset, file_data_.data() + data_end_offset_, &mid_key, nullptr, &tmp); @@ -451,7 +452,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, size_t tmp; uint32_t low_key_offset = GetFixed32Element(base_ptr, low); Status s = PlainTableKeyDecoder(encoding_type_, user_key_len_, - options_.prefix_extractor.get()) + ioptions_.prefix_extractor) .NextKey(file_data_.data() + low_key_offset, file_data_.data() + data_end_offset_, &low_key, nullptr, &tmp); @@ -565,7 +566,7 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, } Slice found_value; PlainTableKeyDecoder decoder(encoding_type_, user_key_len_, - options_.prefix_extractor.get()); + ioptions_.prefix_extractor); while (offset < data_end_offset_) { Status s = Next(&decoder, &offset, &found_key, nullptr, &found_value); if (!s.ok()) { diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index 4a626979a83..fcc94a53e15 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -52,7 +52,8 @@ extern const uint32_t kPlainTableVariableLength; // The implementation of IndexedTableReader requires output file is mmaped class PlainTableReader: public TableReader { public: - static Status Open(const Options& options, const EnvOptions& soptions, + static Status Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table, @@ -82,8 +83,9 @@ class PlainTableReader: public TableReader { return arena_.MemoryAllocatedBytes(); } - PlainTableReader(const Options& options, unique_ptr&& file, - const EnvOptions& storage_options, + PlainTableReader(const ImmutableCFOptions& ioptions, + unique_ptr&& file, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, EncodingType encoding_type, uint64_t file_size, const TableProperties* table_properties); @@ -132,7 +134,7 @@ class PlainTableReader: public TableReader { DynamicBloom bloom_; Arena arena_; - const Options& options_; + const ImmutableCFOptions& ioptions_; unique_ptr file_; uint32_t file_size_; std::shared_ptr table_properties_; diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index ed2c7c52dc8..58493758729 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -88,10 +88,12 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, TableBuilder* tb = nullptr; DB* db = nullptr; Status s; + const ImmutableCFOptions ioptions(opts); if (!through_db) { env->NewWritableFile(file_name, &file, env_options); - tb = opts.table_factory->NewTableBuilder(opts, ikc, file.get(), - CompressionType::kNoCompression); + tb = opts.table_factory->NewTableBuilder(ioptions, ikc, file.get(), + CompressionType::kNoCompression, + CompressionOptions()); } else { s = DB::Open(opts, dbname, &db); ASSERT_OK(s); @@ -122,7 +124,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, uint64_t file_size; env->GetFileSize(file_name, &file_size); s = opts.table_factory->NewTableReader( - opts, env_options, ikc, std::move(raf), file_size, &table_reader); + ioptions, env_options, ikc, std::move(raf), file_size, &table_reader); } Random rnd(301); diff --git a/table/table_test.cc b/table/table_test.cc index 500abf48f24..a0f844014fd 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -42,6 +42,7 @@ #include "util/statistics.h" #include "util/testharness.h" #include "util/testutil.h" +#include "util/scoped_arena_iterator.h" namespace rocksdb { @@ -194,6 +195,7 @@ class Constructor { // been added so far. Returns the keys in sorted order in "*keys" // and stores the key/value pairs in "*kvmap" void Finish(const Options& options, + const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, std::vector* keys, KVMap* kvmap) { @@ -206,12 +208,14 @@ class Constructor { keys->push_back(it->first); } data_.clear(); - Status s = FinishImpl(options, table_options, internal_comparator, *kvmap); + Status s = FinishImpl(options, ioptions, table_options, + internal_comparator, *kvmap); ASSERT_TRUE(s.ok()) << s.ToString(); } // Construct the data structure from the data in "data" virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) = 0; @@ -220,8 +224,12 @@ class Constructor { virtual const KVMap& data() { return data_; } + virtual bool IsArenaMode() const { return false; } + virtual DB* db() const { return nullptr; } // Overridden in DBConstructor + virtual bool AnywayDeleteIterator() const { return false; } + protected: const InternalKeyComparator* last_internal_key_; @@ -239,6 +247,7 @@ class BlockConstructor: public Constructor { delete block_; } virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) { @@ -275,8 +284,15 @@ class BlockConstructor: public Constructor { // A helper class that converts internal format keys into user keys class KeyConvertingIterator: public Iterator { public: - explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } - virtual ~KeyConvertingIterator() { delete iter_; } + KeyConvertingIterator(Iterator* iter, bool arena_mode = false) + : iter_(iter), arena_mode_(arena_mode) {} + virtual ~KeyConvertingIterator() { + if (arena_mode_) { + iter_->~Iterator(); + } else { + delete iter_; + } + } virtual bool Valid() const { return iter_->Valid(); } virtual void Seek(const Slice& target) { ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); @@ -307,6 +323,7 @@ class KeyConvertingIterator: public Iterator { private: mutable Status status_; Iterator* iter_; + bool arena_mode_; // No copying allowed KeyConvertingIterator(const KeyConvertingIterator&); @@ -322,14 +339,16 @@ class TableConstructor: public Constructor { ~TableConstructor() { Reset(); } virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) { Reset(); sink_.reset(new StringSink()); unique_ptr builder; - builder.reset(options.table_factory->NewTableBuilder( - options, internal_comparator, sink_.get(), options.compression)); + builder.reset(ioptions.table_factory->NewTableBuilder( + ioptions, internal_comparator, sink_.get(), options.compression, + CompressionOptions())); for (KVMap::const_iterator it = data.begin(); it != data.end(); @@ -352,9 +371,9 @@ class TableConstructor: public Constructor { // Open the table uniq_id_ = cur_uniq_id_++; source_.reset(new StringSource(sink_->contents(), uniq_id_, - options.allow_mmap_reads)); - return options.table_factory->NewTableReader( - options, soptions, internal_comparator, std::move(source_), + ioptions.allow_mmap_reads)); + return ioptions.table_factory->NewTableReader( + ioptions, soptions, internal_comparator, std::move(source_), sink_->contents().size(), &table_reader_); } @@ -372,12 +391,12 @@ class TableConstructor: public Constructor { return table_reader_->ApproximateOffsetOf(key); } - virtual Status Reopen(const Options& options) { + virtual Status Reopen(const ImmutableCFOptions& ioptions) { source_.reset( new StringSource(sink_->contents(), uniq_id_, - options.allow_mmap_reads)); - return options.table_factory->NewTableReader( - options, soptions, *last_internal_key_, std::move(source_), + ioptions.allow_mmap_reads)); + return ioptions.table_factory->NewTableReader( + ioptions, soptions, *last_internal_key_, std::move(source_), sink_->contents().size(), &table_reader_); } @@ -385,6 +404,10 @@ class TableConstructor: public Constructor { return table_reader_.get(); } + virtual bool AnywayDeleteIterator() const override { + return convert_to_internal_key_; + } + private: void Reset() { uniq_id_ = 0; @@ -392,12 +415,12 @@ class TableConstructor: public Constructor { sink_.reset(); source_.reset(); } - bool convert_to_internal_key_; uint64_t uniq_id_; unique_ptr sink_; unique_ptr source_; unique_ptr table_reader_; + bool convert_to_internal_key_; TableConstructor(); @@ -421,6 +444,7 @@ class MemTableConstructor: public Constructor { delete memtable_->Unref(); } virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) { @@ -439,10 +463,16 @@ class MemTableConstructor: public Constructor { return Status::OK(); } virtual Iterator* NewIterator() const { - return new KeyConvertingIterator(memtable_->NewIterator(ReadOptions())); + return new KeyConvertingIterator( + memtable_->NewIterator(ReadOptions(), &arena_), true); } + virtual bool AnywayDeleteIterator() const override { return true; } + + virtual bool IsArenaMode() const override { return true; } + private: + mutable Arena arena_; InternalKeyComparator internal_comparator_; MemTable* memtable_; std::shared_ptr table_factory_; @@ -460,6 +490,7 @@ class DBConstructor: public Constructor { delete db_; } virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) { @@ -670,7 +701,7 @@ class FixedOrLessPrefixTransform : public SliceTransform { class Harness { public: - Harness() : constructor_(nullptr) { } + Harness() : ioptions_(options_), constructor_(nullptr) {} void Init(const TestArgs& args) { delete constructor_; @@ -756,6 +787,7 @@ class Harness { constructor_ = new DBConstructor(options_.comparator); break; } + ioptions_ = ImmutableCFOptions(options_); } ~Harness() { @@ -769,8 +801,8 @@ class Harness { void Test(Random* rnd) { std::vector keys; KVMap data; - constructor_->Finish(options_, table_options_, *internal_comparator_, - &keys, &data); + constructor_->Finish(options_, ioptions_, table_options_, + *internal_comparator_, &keys, &data); TestForwardScan(keys, data); if (support_prev_) { @@ -791,7 +823,11 @@ class Harness { iter->Next(); } ASSERT_TRUE(!iter->Valid()); - delete iter; + if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { + iter->~Iterator(); + } else { + delete iter; + } } void TestBackwardScan(const std::vector& keys, @@ -806,7 +842,11 @@ class Harness { iter->Prev(); } ASSERT_TRUE(!iter->Valid()); - delete iter; + if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { + iter->~Iterator(); + } else { + delete iter; + } } void TestRandomAccess(Random* rnd, @@ -876,7 +916,11 @@ class Harness { } } } - delete iter; + if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { + iter->~Iterator(); + } else { + delete iter; + } } std::string ToString(const KVMap& data, const KVMap::const_iterator& it) { @@ -939,6 +983,7 @@ class Harness { private: Options options_ = Options(); + ImmutableCFOptions ioptions_; BlockBasedTableOptions table_options_ = BlockBasedTableOptions(); Constructor* constructor_; bool support_prev_; @@ -1038,7 +1083,8 @@ TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) { table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto& props = *c.GetTableReader()->GetTableProperties(); @@ -1071,7 +1117,8 @@ TEST(BlockBasedTableTest, FilterPolicyNameProperties) { Options options; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto& props = *c.GetTableReader()->GetTableProperties(); ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name); @@ -1122,7 +1169,8 @@ TEST(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { c.Add("cccc2", std::string('a', 56)); std::vector keys; KVMap kvmap; - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto props = c.GetTableReader()->GetTableProperties(); ASSERT_EQ(7u, props->num_data_blocks); @@ -1206,7 +1254,8 @@ TEST(TableTest, HashIndexTest) { std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - c.Finish(options, table_options, *comparator, &keys, &kvmap); + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, *comparator, &keys, &kvmap); auto reader = c.GetTableReader(); auto props = reader->GetTableProperties(); @@ -1314,7 +1363,8 @@ TEST(BlockBasedTableTest, IndexSizeStat) { table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); auto index_size = c.GetTableReader()->GetTableProperties()->index_size; ASSERT_GT(index_size, last_index_size); @@ -1340,7 +1390,8 @@ TEST(BlockBasedTableTest, NumBlockStat) { std::vector ks; KVMap kvmap; - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); ASSERT_EQ(kvmap.size(), c.GetTableReader()->GetTableProperties()->num_data_blocks); @@ -1416,7 +1467,8 @@ TEST(BlockBasedTableTest, BlockCacheDisabledTest) { TableConstructor c(BytewiseComparator(), true); c.Add("key", "value"); - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is enabled. @@ -1458,7 +1510,8 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { TableConstructor c(BytewiseComparator()); c.Add("key", "value"); - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is prohibited. auto reader = dynamic_cast(c.GetTableReader()); @@ -1512,7 +1565,8 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { table_options.block_cache.reset(); options.table_factory.reset(new BlockBasedTableFactory(table_options)); options.statistics = CreateDBStatistics(); // reset the stats - c.Reopen(options); + const ImmutableCFOptions ioptions1(options); + c.Reopen(ioptions1); table_options.no_block_cache = false; { @@ -1529,7 +1583,8 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { // too small to fit even one entry. table_options.block_cache = NewLRUCache(1); options.table_factory.reset(new BlockBasedTableFactory(table_options)); - c.Reopen(options); + const ImmutableCFOptions ioptions2(options); + c.Reopen(ioptions2); { BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertEqual(1, // index block miss @@ -1583,7 +1638,8 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { c.Add("k07", std::string(100000, 'x')); std::vector keys; KVMap kvmap; - c.Finish(opt, table_options, *ikc, &keys, &kvmap); + const ImmutableCFOptions ioptions(opt); + c.Finish(opt, ioptions, table_options, *ikc, &keys, &kvmap); unique_ptr iter(c.NewIterator()); iter->SeekToFirst(); @@ -1594,7 +1650,8 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { } ASSERT_OK(iter->status()); - ASSERT_OK(c.Reopen(opt)); + const ImmutableCFOptions ioptions1(opt); + ASSERT_OK(c.Reopen(ioptions1)); auto table_reader = dynamic_cast(c.GetTableReader()); for (const std::string& key : keys) { ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); @@ -1603,7 +1660,8 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { // rerun with different block cache table_options.block_cache = NewLRUCache(16 * 1024 * 1024); opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ASSERT_OK(c.Reopen(opt)); + const ImmutableCFOptions ioptions2(opt); + ASSERT_OK(c.Reopen(ioptions2)); table_reader = dynamic_cast(c.GetTableReader()); for (const std::string& key : keys) { ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key)); @@ -1619,9 +1677,11 @@ TEST(PlainTableTest, BasicPlainTableProperties) { PlainTableFactory factory(plain_table_options); StringSink sink; Options options; + const ImmutableCFOptions ioptions(options); InternalKeyComparator ikc(options.comparator); std::unique_ptr builder( - factory.NewTableBuilder(options, ikc, &sink, kNoCompression)); + factory.NewTableBuilder(ioptions, ikc, &sink, kNoCompression, + CompressionOptions())); for (char c = 'a'; c <= 'z'; ++c) { std::string key(8, c); @@ -1664,7 +1724,9 @@ TEST(GeneralTableTest, ApproximateOffsetOfPlain) { options.compression = kNoCompression; BlockBasedTableOptions table_options; table_options.block_size = 1024; - c.Finish(options, table_options, internal_comparator, &keys, &kvmap); + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, internal_comparator, + &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); @@ -1694,7 +1756,8 @@ static void DoCompressionTest(CompressionType comp) { options.compression = comp; BlockBasedTableOptions table_options; table_options.block_size = 1024; - c.Finish(options, table_options, ikc, &keys, &kvmap); + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, ikc, &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); @@ -1807,7 +1870,8 @@ TEST(MemTableTest, Simple) { ColumnFamilyMemTablesDefault cf_mems_default(memtable, &options); ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &cf_mems_default).ok()); - Iterator* iter = memtable->NewIterator(ReadOptions()); + Arena arena; + ScopedArenaIterator iter(memtable->NewIterator(ReadOptions(), &arena)); iter->SeekToFirst(); while (iter->Valid()) { fprintf(stderr, "key: '%s' -> '%s'\n", @@ -1816,7 +1880,6 @@ TEST(MemTableTest, Simple) { iter->Next(); } - delete iter; delete memtable->Unref(); } diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index 9b130c7c63d..6c496e8ddb4 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -68,6 +68,7 @@ class SstFileReader { // options_ and internal_comparator_ will also be used in // ReadSequential internally (specifically, seek-related operations) Options options_; + const ImmutableCFOptions ioptions_; InternalKeyComparator internal_comparator_; unique_ptr table_properties_; }; @@ -76,7 +77,8 @@ SstFileReader::SstFileReader(const std::string& file_path, bool verify_checksum, bool output_hex) :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum), - output_hex_(output_hex), internal_comparator_(BytewiseComparator()) { + output_hex_(output_hex), ioptions_(options_), + internal_comparator_(BytewiseComparator()) { fprintf(stdout, "Process %s\n", file_path.c_str()); init_result_ = NewTableReader(file_name_); @@ -123,7 +125,7 @@ Status SstFileReader::NewTableReader(const std::string& file_path) { if (s.ok()) { s = options_.table_factory->NewTableReader( - options_, soptions_, internal_comparator_, std::move(file_), file_size, + ioptions_, soptions_, internal_comparator_, std::move(file_), file_size, &table_reader_); } return s; diff --git a/util/cache_bench.cc b/util/cache_bench.cc new file mode 100644 index 00000000000..ccaf5ce5baf --- /dev/null +++ b/util/cache_bench.cc @@ -0,0 +1,257 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else + +#include +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "port/port.h" +#include "util/mutexlock.h" +#include "util/random.h" + +using GFLAGS::ParseCommandLineFlags; + +static const uint32_t KB = 1024; + +DEFINE_int32(threads, 10, "Number of concurrent threads to run."); +DEFINE_int64(cache_size, 2 * KB * KB * KB, + "Number of bytes to use as a cache of uncompressed data."); +DEFINE_int32(num_shard_bits, 4, "shard_bits."); + +DEFINE_int64(max_key, 1 * KB* KB, "Max number of key to place in cache"); +DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread."); + +DEFINE_int32(insert_percent, 40, + "Ratio of insert to total workload (expressed as a percentage)"); +DEFINE_int32(lookup_percent, 50, + "Ratio of lookup to total workload (expressed as a percentage)"); +DEFINE_int32(erase_percent, 10, + "Ratio of erase to total workload (expressed as a percentage)"); + +namespace rocksdb { + +class CacheBench; +namespace { +void deleter(const Slice& key, void* value) { + delete reinterpret_cast(value); +} + +// State shared by all concurrent executions of the same benchmark. +class SharedState { + public: + explicit SharedState(CacheBench* cache_bench) + : cv_(&mu_), + num_threads_(FLAGS_threads), + num_initialized_(0), + start_(false), + num_done_(0), + cache_bench_(cache_bench) { + } + + ~SharedState() {} + + port::Mutex* GetMutex() { + return &mu_; + } + + port::CondVar* GetCondVar() { + return &cv_; + } + + CacheBench* GetCacheBench() const { + return cache_bench_; + } + + void IncInitialized() { + num_initialized_++; + } + + void IncDone() { + num_done_++; + } + + bool AllInitialized() const { + return num_initialized_ >= num_threads_; + } + + bool AllDone() const { + return num_done_ >= num_threads_; + } + + void SetStart() { + start_ = true; + } + + bool Started() const { + return start_; + } + + private: + port::Mutex mu_; + port::CondVar cv_; + + const uint64_t num_threads_; + uint64_t num_initialized_; + bool start_; + uint64_t num_done_; + + CacheBench* cache_bench_; +}; + +// Per-thread state for concurrent executions of the same benchmark. +struct ThreadState { + uint32_t tid; + Random rnd; + SharedState* shared; + + ThreadState(uint32_t index, SharedState *shared) + : tid(index), + rnd(1000 + index), + shared(shared) {} +}; +} // namespace + +class CacheBench { + public: + CacheBench() : + cache_(NewLRUCache(FLAGS_cache_size, FLAGS_num_shard_bits)), + num_threads_(FLAGS_threads) {} + + ~CacheBench() {} + + bool Run() { + rocksdb::Env* env = rocksdb::Env::Default(); + + PrintEnv(); + SharedState shared(this); + std::vector threads(num_threads_); + for (uint32_t i = 0; i < num_threads_; i++) { + threads[i] = new ThreadState(i, &shared); + env->StartThread(ThreadBody, threads[i]); + } + { + MutexLock l(shared.GetMutex()); + while (!shared.AllInitialized()) { + shared.GetCondVar()->Wait(); + } + // Record start time + uint64_t start_time = env->NowMicros(); + + // Start all threads + shared.SetStart(); + shared.GetCondVar()->SignalAll(); + + // Wait threads to complete + while (!shared.AllDone()) { + shared.GetCondVar()->Wait(); + } + + // Record end time + uint64_t end_time = env->NowMicros(); + fprintf(stdout, "Complete in %" PRIu64 "ms\n", end_time - start_time); + } + return true; + } + + private: + std::shared_ptr cache_; + uint32_t num_threads_; + + static void ThreadBody(void* v) { + ThreadState* thread = reinterpret_cast(v); + SharedState* shared = thread->shared; + + { + MutexLock l(shared->GetMutex()); + shared->IncInitialized(); + if (shared->AllInitialized()) { + shared->GetCondVar()->SignalAll(); + } + while (!shared->Started()) { + shared->GetCondVar()->Wait(); + } + } + thread->shared->GetCacheBench()->OperateCache(thread); + + { + MutexLock l(shared->GetMutex()); + shared->IncDone(); + if (shared->AllDone()) { + shared->GetCondVar()->SignalAll(); + } + } + } + + void OperateCache(ThreadState* thread) { + for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { + uint64_t rand_key = thread->rnd.Next() % FLAGS_max_key; + // Cast uint64* to be char*, data would be copied to cache + Slice key(reinterpret_cast(&rand_key), 8); + int32_t prob_op = thread->rnd.Uniform(100); + if (prob_op >= 0 && prob_op < FLAGS_insert_percent) { + // do insert + auto handle = cache_->Insert(key, new char[10], 1, &deleter); + cache_->Release(handle); + } else if (prob_op -= FLAGS_insert_percent && + prob_op < FLAGS_lookup_percent) { + // do lookup + auto handle = cache_->Lookup(key); + if (handle) { + cache_->Release(handle); + } + } else if (prob_op -= FLAGS_lookup_percent && + prob_op < FLAGS_erase_percent) { + // do erase + cache_->Erase(key); + } + } + } + + void PrintEnv() const { + printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion); + printf("Number of threads : %d\n", FLAGS_threads); + printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread); + printf("Cache size : %" PRIu64 "\n", FLAGS_cache_size); + printf("Num shard bits : %d\n", FLAGS_num_shard_bits); + printf("Max key : %" PRIu64 "\n", FLAGS_max_key); + printf("Insert percentage : %d%%\n", FLAGS_insert_percent); + printf("Lookup percentage : %d%%\n", FLAGS_lookup_percent); + printf("Erase percentage : %d%%\n", FLAGS_erase_percent); + printf("----------------------------\n"); + } +}; +} // namespace rocksdb + +int main(int argc, char** argv) { + ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_threads <= 0) { + fprintf(stderr, "threads number <= 0\n"); + exit(1); + } + + rocksdb::CacheBench bench; + if (bench.Run()) { + return 0; + } else { + return 1; + } +} + +#endif // GFLAGS diff --git a/util/db_info_dummper.cc b/util/db_info_dummper.cc index d5dd97ad2c1..2e0d3448166 100644 --- a/util/db_info_dummper.cc +++ b/util/db_info_dummper.cc @@ -6,7 +6,10 @@ // Must not be included from any .h files to avoid polluting the namespace // with macros. +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index 3e55488f223..6d228e81df7 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -11,7 +11,10 @@ int main() { } #else +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include diff --git a/util/env_test.cc b/util/env_test.cc index c0d00ce94dc..1c4d0bba0ad 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -392,6 +392,9 @@ TEST(EnvPosixTest, DecreaseNumBgThreads) { } #ifdef OS_LINUX +// Travis doesn't support fallocate or getting unique ID from files for whatever +// reason. +#ifndef TRAVIS // To make sure the Env::GetUniqueId() related tests work correctly, The files // should be stored in regular storage like "hard disk" or "flash device". // Otherwise we cannot get the correct id. @@ -507,7 +510,7 @@ TEST(EnvPosixTest, AllocateTest) { // verify that preallocated blocks were deallocated on file close ASSERT_GT(st_blocks, f_stat.st_blocks); } -#endif +#endif // ROCKSDB_FALLOCATE_PRESENT // Returns true if any of the strings in ss are the prefix of another string. bool HasPrefix(const std::unordered_set& ss) { @@ -638,7 +641,8 @@ TEST(EnvPosixTest, InvalidateCache) { // Delete the file ASSERT_OK(env_->DeleteFile(fname)); } -#endif +#endif // not TRAVIS +#endif // OS_LINUX TEST(EnvPosixTest, PosixRandomRWFileTest) { EnvOptions soptions; diff --git a/util/histogram.cc b/util/histogram.cc index 968769cef59..0dbfba7d629 100644 --- a/util/histogram.cc +++ b/util/histogram.cc @@ -53,7 +53,7 @@ HistogramBucketMapper::HistogramBucketMapper() } } -const size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { +size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { if (value >= maxBucketValue_) { return bucketValues_.size() - 1; } else if ( value >= minBucketValue_ ) { diff --git a/util/histogram.h b/util/histogram.h index d95588dc2d5..af3a019d803 100644 --- a/util/histogram.h +++ b/util/histogram.h @@ -23,10 +23,10 @@ class HistogramBucketMapper { HistogramBucketMapper(); // converts a value to the bucket index. - const size_t IndexForValue(const uint64_t value) const; + size_t IndexForValue(const uint64_t value) const; // number of buckets required. - const size_t BucketCount() const { + size_t BucketCount() const { return bucketValues_.size(); } @@ -65,6 +65,8 @@ class HistogramImpl { virtual double StandardDeviation() const; virtual void Data(HistogramData * const data) const; + virtual ~HistogramImpl() {} + private: // To be able to use HistogramImpl as thread local variable, its constructor // has to be static. That's why we're using manually values from BucketMapper diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 1aa3856a356..aef84fa356f 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -14,6 +14,7 @@ #include "rocksdb/write_batch.h" #include "rocksdb/cache.h" #include "util/coding.h" +#include "util/scoped_arena_iterator.h" #include "utilities/ttl/db_ttl_impl.h" #include @@ -739,7 +740,8 @@ void InternalDumpCommand::DoCommand() { uint64_t c=0; uint64_t s1=0,s2=0; // Setup internal key iterator - auto iter = unique_ptr(idb->TEST_NewInternalIterator()); + Arena arena; + ScopedArenaIterator iter(idb->TEST_NewInternalIterator(&arena)); Status st = iter->status(); if (!st.ok()) { exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error:" diff --git a/util/logging.cc b/util/logging.cc index 1b5549d7319..4dfb9a44921 100644 --- a/util/logging.cc +++ b/util/logging.cc @@ -9,7 +9,10 @@ #include "util/logging.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include diff --git a/util/options.cc b/util/options.cc index b16c6f2f57a..371ecda789e 100644 --- a/util/options.cc +++ b/util/options.cc @@ -8,8 +8,12 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include @@ -28,6 +32,26 @@ namespace rocksdb { +ImmutableCFOptions::ImmutableCFOptions(const Options& options) + : prefix_extractor(options.prefix_extractor.get()), + comparator(options.comparator), + merge_operator(options.merge_operator.get()), + info_log(options.info_log.get()), + statistics(options.statistics.get()), + env(options.env), + allow_mmap_reads(options.allow_mmap_reads), + allow_mmap_writes(options.allow_mmap_writes), + db_paths(options.db_paths), + table_factory(options.table_factory.get()), + table_properties_collector_factories( + options.table_properties_collector_factories), + advise_random_on_open(options.advise_random_on_open), + bloom_locality(options.bloom_locality), + purge_redundant_kvs_while_flush(options.purge_redundant_kvs_while_flush), + min_partial_merge_operands(options.min_partial_merge_operands), + disable_data_sync(options.disableDataSync), + use_fsync(options.use_fsync) {} + ColumnFamilyOptions::ColumnFamilyOptions() : comparator(BytewiseComparator()), merge_operator(nullptr), diff --git a/util/options_test.cc b/util/options_test.cc index be07a83f529..afe3795f99f 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -7,7 +7,10 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include diff --git a/util/rate_limiter_test.cc b/util/rate_limiter_test.cc index 1b72e4ed0b2..9d6cfb7e6a0 100644 --- a/util/rate_limiter_test.cc +++ b/util/rate_limiter_test.cc @@ -7,7 +7,10 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include "util/testharness.h" diff --git a/util/scoped_arena_iterator.h b/util/scoped_arena_iterator.h new file mode 100644 index 00000000000..2021d2dc226 --- /dev/null +++ b/util/scoped_arena_iterator.h @@ -0,0 +1,28 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "rocksdb/iterator.h" + +namespace rocksdb { +class ScopedArenaIterator { + public: + explicit ScopedArenaIterator(Iterator* iter = nullptr) : iter_(iter) {} + + Iterator* operator->() { return iter_; } + + void set(Iterator* iter) { iter_ = iter; } + + Iterator* get() { return iter_; } + + ~ScopedArenaIterator() { iter_->~Iterator(); } + + private: + Iterator* iter_; +}; +} // namespace rocksdb diff --git a/util/statistics.cc b/util/statistics.cc index 24957c9b6fc..9d828a6feb5 100644 --- a/util/statistics.cc +++ b/util/statistics.cc @@ -5,7 +5,10 @@ // #include "util/statistics.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include "rocksdb/statistics.h" #include "port/likely.h" diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 436f4c2d685..4d1a9b76b00 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -15,7 +15,9 @@ #include "util/crc32c.h" #include "rocksdb/transaction_log.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif #include #include diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 1d876cd5010..a585d1a9cfb 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -916,7 +916,7 @@ TEST(BackupableDBTest, RateLimiting) { auto backup_time = env_->NowMicros() - start_backup; auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) / backupable_options_->backup_rate_limit; - ASSERT_GT(backup_time, 0.9 * rate_limited_backup_time); + ASSERT_GT(backup_time, 0.8 * rate_limited_backup_time); CloseBackupableDB(); @@ -927,7 +927,7 @@ TEST(BackupableDBTest, RateLimiting) { CloseRestoreDB(); auto rate_limited_restore_time = (bytes_written * kMicrosPerSec) / backupable_options_->restore_rate_limit; - ASSERT_GT(restore_time, 0.9 * rate_limited_restore_time); + ASSERT_GT(restore_time, 0.8 * rate_limited_restore_time); AssertBackupConsistency(0, 0, 100000, 100010); } diff --git a/utilities/document/document_db.cc b/utilities/document/document_db.cc index c12a1f253ff..8e15a52ca8b 100644 --- a/utilities/document/document_db.cc +++ b/utilities/document/document_db.cc @@ -736,6 +736,7 @@ class DocumentDBImpl : public DocumentDB { CreateColumnFamily(ColumnFamilyOptions(rocksdb_options_), InternalSecondaryIndexName(index.name), &cf_handle); if (!s.ok()) { + delete index_obj; return s; } diff --git a/utilities/document/json_document.cc b/utilities/document/json_document.cc index 641f4ee09ed..4368b759d58 100644 --- a/utilities/document/json_document.cc +++ b/utilities/document/json_document.cc @@ -6,7 +6,10 @@ #include "rocksdb/utilities/json_document.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include diff --git a/utilities/geodb/geodb_impl.cc b/utilities/geodb/geodb_impl.cc index f63c91c3e59..6c13fd69167 100644 --- a/utilities/geodb/geodb_impl.cc +++ b/utilities/geodb/geodb_impl.cc @@ -7,7 +7,9 @@ #include "utilities/geodb/geodb_impl.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif #include #include diff --git a/utilities/spatialdb/spatial_db.cc b/utilities/spatialdb/spatial_db.cc index 8b9e49bd447..21a111d3ebb 100644 --- a/utilities/spatialdb/spatial_db.cc +++ b/utilities/spatialdb/spatial_db.cc @@ -7,7 +7,10 @@ #include "rocksdb/utilities/spatial_db.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include