diff --git a/.travis.yml b/.travis.yml index 66f37a5d284..bcb852cf04e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,6 @@ before_install: - sudo dpkg -i libgflags-dev_2.0-1_amd64.deb # Lousy hack to disable use and testing of fallocate, which doesn't behave quite # as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment. - - sed -i "s/fallocate(/HACK_NO_fallocate(/" build_tools/build_detect_platform -script: make check -j8 +script: OPT=-DTRAVIS make check -j8 notifications: email: false diff --git a/HISTORY.md b/HISTORY.md index b61cc4fff72..a8b89f54f3f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,10 +1,34 @@ # Rocksdb Change Log -### Unreleased +## Unreleased (will be released with 3.6) +### Disk format changes +* If you're using RocksDB on ARM platforms and you're using default bloom filter, there is a disk format change you need to be aware of. There are three steps you need to do when you convert to new release: 1. turn off filter policy, 2. compact the whole database, 3. turn on filter policy + +### Behavior changes +* We have refactored our system of stalling writes. Any stall-related statistics' meanings are changed. Instead of per-write stall counts, we now count stalls per-epoch, where epochs are periods between flushes and compactions. You'll find more information in our Tuning Perf Guide once we release RocksDB 3.6. +* When disableDataSync=true, we no longer sync the MANIFEST file. +* Add identity_as_first_hash property to CuckooTable. SST file needs to be rebuilt to be opened by reader properly. +* Change target_file_size_base type to uint64_t from int. + +----- Past Releases ----- + +## 3.5.0 (9/3/2014) +### New Features +* Add include/utilities/write_batch_with_index.h, providing a utility class to query data out of WriteBatch when building it. +* Move BlockBasedTable related options to BlockBasedTableOptions from Options. Change corresponding JNI interface. 
Options affected include: + no_block_cache, block_cache, block_cache_compressed, block_size, block_size_deviation, block_restart_interval, filter_policy, whole_key_filtering. filter_policy is changed to shared_ptr from a raw pointer. +* Remove deprecated options: disable_seek_compaction and db_stats_log_interval +* OptimizeForPointLookup() takes one parameter for block cache size. It now builds hash index, bloom filter, and block cache. + +### Public API changes +* The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key. + +## 3.4.0 (8/18/2014) ### New Features * Support Multiple DB paths in universal style compactions * Add feature of storing plain table index and bloom filter in SST file. * CompactRange() will never output compacted files to level 0. This used to be the case when all the compaction input files were at level 0. +* Added iterate_upper_bound to define the extent up to which the forward iterator will return entries. This will prevent iterating over delete markers and overwritten entries for edge cases where you want to break out the iterator anyway. This may improve performance in case there are a large number of delete markers or overwritten entries. ### Public API changes * DBOptions.db_paths now is a vector of a DBPath structure which indicates both of path and target size diff --git a/Makefile b/Makefile index 4c58e0b0ab0..9d626e17fcc 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,6 @@ # found in the LICENSE file. See the AUTHORS file for names of contributors. 
# Inherit some settings from environment variables, if available -INSTALL_PATH ?= $(CURDIR) #----------------------------------------------- @@ -49,6 +48,27 @@ else PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC endif +#------------------------------------------------- +# make install related stuff +INSTALL_PATH ?= /usr/local + +uninstall: + @rm -rf $(INSTALL_PATH)/include/rocksdb + @rm -rf $(INSTALL_PATH)/lib/$(LIBRARY) + @rm -rf $(INSTALL_PATH)/lib/$(SHARED) + +install: + @install -d $(INSTALL_PATH)/lib + @for header_dir in `find "include/rocksdb" -type d`; do \ + install -d $(INSTALL_PATH)/$$header_dir; \ + done + @for header in `find "include/rocksdb" -type f -name *.h`; do \ + install -C -m 644 $$header $(INSTALL_PATH)/$$header; \ + done + @[ ! -e $(LIBRARY) ] || install -C -m 644 $(LIBRARY) $(INSTALL_PATH)/lib + @[ ! -e $(SHARED) ] || install -C -m 644 $(SHARED) $(INSTALL_PATH)/lib +#------------------------------------------------- + WARNING_FLAGS = -Wall -Werror -Wsign-compare CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CXXFLAGS += $(WARNING_FLAGS) -I. 
-I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual @@ -90,12 +110,14 @@ TESTS = \ blob_store_test \ filelock_test \ filename_test \ - filter_block_test \ + block_based_filter_block_test \ + full_filter_block_test \ histogram_test \ log_test \ manual_compaction_test \ memenv_test \ merge_test \ + merger_test \ redis_test \ reduce_levels_test \ plain_table_db_test \ @@ -111,17 +133,18 @@ TESTS = \ version_edit_test \ version_set_test \ file_indexer_test \ - write_batch_test\ + write_batch_test \ + write_controller_test\ deletefile_test \ table_test \ thread_local_test \ geodb_test \ rate_limiter_test \ - cuckoo_table_builder_test \ options_test \ cuckoo_table_builder_test \ cuckoo_table_reader_test \ - cuckoo_table_db_test + cuckoo_table_db_test \ + write_batch_with_index_test TOOLS = \ sst_dump \ @@ -132,7 +155,7 @@ TOOLS = \ options_test \ blob_store_bench -PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench $(TOOLS) +PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench cache_bench $(TOOLS) # The library name is configurable since we are maintaining libraries of both # debug/release mode. 
@@ -175,7 +198,7 @@ endif # PLATFORM_SHARED_EXT .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \ - dbg rocksdbjavastatic rocksdbjava + dbg rocksdbjavastatic rocksdbjava install uninstall all: $(LIBRARY) $(PROGRAMS) $(TESTS) @@ -264,6 +287,9 @@ $(LIBRARY): $(LIBOBJECTS) db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +cache_bench: util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) @@ -375,6 +401,9 @@ spatial_db_test: utilities/spatialdb/spatial_db_test.o $(LIBOBJECTS) $(TESTHARNE ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +write_batch_with_index_test: utilities/write_batch_with_index/write_batch_with_index_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) utilities/write_batch_with_index/write_batch_with_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) @@ -387,8 +416,11 @@ rate_limiter_test: util/rate_limiter_test.o $(LIBOBJECTS) $(TESTHARNESS) filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) table/filter_block_test.o $(LIBOBJECTS) 
$(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +block_based_filter_block_test: table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +full_filter_block_test: table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) @@ -417,9 +449,15 @@ reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +write_controller_test: db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +merger_test: table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) diff --git a/README.md b/README.md index bda801fd77b..916bdecdeed 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Build Status](https://travis-ci.org/facebook/rocksdb.svg?branch=master)](https://travis-ci.org/facebook/rocksdb) RocksDB is developed and maintained by Facebook Database Engineering Team. 
-It is built on on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com) +It is built on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) This code is a library that forms the core building block for a fast diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 3389d2851b0..8479e31274e 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -46,7 +46,7 @@ PLATFORM_CXXFLAGS="-std=c++11" COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX" # Default to fbcode gcc on internal fb machines -if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then +if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then FBCODE_BUILD="true" if [ -z "$USE_CLANG" ]; then CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \ diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh index 5e335afde21..ee2d334f0bf 100755 --- a/build_tools/regression_build_test.sh +++ b/build_tools/regression_build_test.sh @@ -344,6 +344,38 @@ common_in_mem_args="--db=/dev/shm/rocksdb \ --threads=32 \ --writes_per_second=81920 > ${STAT_FILE}.seekwhilewriting_in_ram +# measure fillseq with bunch of column families +./db_bench \ + --benchmarks=fillseq \ + --num_column_families=500 \ + --write_buffer_size=1048576 \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + --num=$NUM \ + --writes=$NUM \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 > ${STAT_FILE}.fillseq_lots_column_families + +# measure overwrite performance with bunch of column families +./db_bench \ + --benchmarks=overwrite \ + --num_column_families=500 \ + --write_buffer_size=1048576 \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --num=$NUM \ + --writes=$((NUM / 10)) \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=8 > ${STAT_FILE}.overwrite_lots_column_families # send data 
to ods function send_to_ods { @@ -392,3 +424,5 @@ send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadr send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting send_benchmark_to_ods readwhilewriting readwhilewriting_in_ram ${STAT_FILE}.readwhilewriting_in_ram send_benchmark_to_ods seekrandomwhilewriting seekwhilewriting_in_ram ${STAT_FILE}.seekwhilewriting_in_ram +send_benchmark_to_ods fillseq fillseq_lots_column_families ${STAT_FILE}.fillseq_lots_column_families +send_benchmark_to_ods overwrite overwrite_lots_column_families ${STAT_FILE}.overwrite_lots_column_families diff --git a/db/builder.cc b/db/builder.cc index 1084f04138e..2c509437038 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -26,21 +26,24 @@ namespace rocksdb { class TableFactory; -TableBuilder* NewTableBuilder(const Options& options, +TableBuilder* NewTableBuilder(const ImmutableCFOptions& ioptions, const InternalKeyComparator& internal_comparator, WritableFile* file, - CompressionType compression_type) { - return options.table_factory->NewTableBuilder(options, internal_comparator, - file, compression_type); + const CompressionType compression_type, + const CompressionOptions& compression_opts) { + return ioptions.table_factory->NewTableBuilder( + ioptions, internal_comparator, file, compression_type, compression_opts); } -Status BuildTable(const std::string& dbname, Env* env, const Options& options, - const EnvOptions& soptions, TableCache* table_cache, +Status BuildTable(const std::string& dbname, Env* env, + const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, TableCache* table_cache, Iterator* iter, FileMetaData* meta, const InternalKeyComparator& internal_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, const CompressionType compression, + const CompressionOptions& compression_opts, const Env::IOPriority io_priority) { Status s; meta->fd.file_size = 0; @@ -50,23 +53,24 @@ 
Status BuildTable(const std::string& dbname, Env* env, const Options& options, // If the sequence number of the smallest entry in the memtable is // smaller than the most recent snapshot, then we do not trigger // removal of duplicate/deleted keys as part of this builder. - bool purge = options.purge_redundant_kvs_while_flush; + bool purge = ioptions.purge_redundant_kvs_while_flush; if (earliest_seqno_in_memtable <= newest_snapshot) { purge = false; } - std::string fname = TableFileName(options.db_paths, meta->fd.GetNumber(), + std::string fname = TableFileName(ioptions.db_paths, meta->fd.GetNumber(), meta->fd.GetPathId()); if (iter->Valid()) { unique_ptr file; - s = env->NewWritableFile(fname, &file, soptions); + s = env->NewWritableFile(fname, &file, env_options); if (!s.ok()) { return s; } file->SetIOPriority(io_priority); - TableBuilder* builder = - NewTableBuilder(options, internal_comparator, file.get(), compression); + TableBuilder* builder = NewTableBuilder( + ioptions, internal_comparator, file.get(), + compression, compression_opts); // the first key is the smallest key Slice key = iter->key(); @@ -75,8 +79,8 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, meta->largest_seqno = meta->smallest_seqno; MergeHelper merge(internal_comparator.user_comparator(), - options.merge_operator.get(), options.info_log.get(), - options.min_partial_merge_operands, + ioptions.merge_operator, ioptions.info_log, + ioptions.min_partial_merge_operands, true /* internal key corruption is not ok */); if (purge) { @@ -196,12 +200,12 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, delete builder; // Finish and check for file errors - if (s.ok() && !options.disableDataSync) { - if (options.use_fsync) { - StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS); + if (s.ok() && !ioptions.disable_data_sync) { + if (ioptions.use_fsync) { + StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); s = 
file->Fsync(); } else { - StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS); + StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); s = file->Sync(); } } @@ -211,7 +215,7 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, if (s.ok()) { // Verify that the table is usable - Iterator* it = table_cache->NewIterator(ReadOptions(), soptions, + Iterator* it = table_cache->NewIterator(ReadOptions(), env_options, internal_comparator, meta->fd); s = it->status(); delete it; diff --git a/db/builder.h b/db/builder.h index f57501abd13..cf3ebd1ae0e 100644 --- a/db/builder.h +++ b/db/builder.h @@ -11,6 +11,7 @@ #include "rocksdb/status.h" #include "rocksdb/types.h" #include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" namespace rocksdb { @@ -26,8 +27,10 @@ class TableBuilder; class WritableFile; extern TableBuilder* NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type); + const ImmutableCFOptions& options, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts); // Build a Table file from the contents of *iter. The generated file // will be named according to number specified in meta. On success, the rest of @@ -35,13 +38,15 @@ extern TableBuilder* NewTableBuilder( // If no data is present in *iter, meta->file_size will be set to // zero, and no Table file will be produced. 
extern Status BuildTable(const std::string& dbname, Env* env, - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& options, + const EnvOptions& env_options, TableCache* table_cache, Iterator* iter, FileMetaData* meta, const InternalKeyComparator& internal_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, const CompressionType compression, + const CompressionOptions& compression_opts, const Env::IOPriority io_priority = Env::IO_HIGH); } // namespace rocksdb diff --git a/db/c.cc b/db/c.cc index da2c88a1195..d9dee46fb7b 100644 --- a/db/c.cc +++ b/db/c.cc @@ -55,6 +55,7 @@ using rocksdb::MergeOperator; using rocksdb::NewBloomFilterPolicy; using rocksdb::NewLRUCache; using rocksdb::Options; +using rocksdb::BlockBasedTableOptions; using rocksdb::RandomAccessFile; using rocksdb::Range; using rocksdb::ReadOptions; @@ -81,6 +82,7 @@ struct rocksdb_fifo_compaction_options_t { CompactionOptionsFIFO rep; }; struct rocksdb_readoptions_t { ReadOptions rep; }; struct rocksdb_writeoptions_t { WriteOptions rep; }; struct rocksdb_options_t { Options rep; }; +struct rocksdb_block_based_table_options_t { BlockBasedTableOptions rep; }; struct rocksdb_seqfile_t { SequentialFile* rep; }; struct rocksdb_randomfile_t { RandomAccessFile* rep; }; struct rocksdb_writablefile_t { WritableFile* rep; }; @@ -116,7 +118,7 @@ struct rocksdb_compactionfilter_t : public CompactionFilter { const Slice& existing_value, std::string* new_value, bool* value_changed) const { - char* c_new_value = NULL; + char* c_new_value = nullptr; size_t new_value_length = 0; unsigned char c_value_changed = 0; unsigned char result = (*filter_)( @@ -1053,6 +1055,74 @@ const char* rocksdb_writebatch_data(rocksdb_writebatch_t* b, size_t* size) { return b->rep.Data().c_str(); } +rocksdb_block_based_table_options_t* +rocksdb_block_based_options_create() { + return new rocksdb_block_based_table_options_t; +} + +void 
rocksdb_block_based_options_destroy( + rocksdb_block_based_table_options_t* options) { + delete options; +} + +void rocksdb_block_based_options_set_block_size( + rocksdb_block_based_table_options_t* options, size_t block_size) { + options->rep.block_size = block_size; +} + +void rocksdb_block_based_options_set_block_size_deviation( + rocksdb_block_based_table_options_t* options, int block_size_deviation) { + options->rep.block_size_deviation = block_size_deviation; +} + +void rocksdb_block_based_options_set_block_restart_interval( + rocksdb_block_based_table_options_t* options, int block_restart_interval) { + options->rep.block_restart_interval = block_restart_interval; +} + +void rocksdb_block_based_options_set_filter_policy( + rocksdb_block_based_table_options_t* options, + rocksdb_filterpolicy_t* filter_policy) { + options->rep.filter_policy.reset(filter_policy); +} + +void rocksdb_block_based_options_set_no_block_cache( + rocksdb_block_based_table_options_t* options, + unsigned char no_block_cache) { + options->rep.no_block_cache = no_block_cache; +} + +void rocksdb_block_based_options_set_block_cache( + rocksdb_block_based_table_options_t* options, + rocksdb_cache_t* block_cache) { + if (block_cache) { + options->rep.block_cache = block_cache->rep; + } +} + +void rocksdb_block_based_options_set_block_cache_compressed( + rocksdb_block_based_table_options_t* options, + rocksdb_cache_t* block_cache_compressed) { + if (block_cache_compressed) { + options->rep.block_cache_compressed = block_cache_compressed->rep; + } +} + +void rocksdb_block_based_options_set_whole_key_filtering( + rocksdb_block_based_table_options_t* options, unsigned char v) { + options->rep.whole_key_filtering = v; +} + +void rocksdb_options_set_block_based_table_factory( + rocksdb_options_t *opt, + rocksdb_block_based_table_options_t* table_options) { + if (table_options) { + opt->rep.table_factory.reset( + rocksdb::NewBlockBasedTableFactory(table_options->rep)); + } +} + + rocksdb_options_t* 
rocksdb_options_create() { return new rocksdb_options_t; } @@ -1067,8 +1137,8 @@ void rocksdb_options_increase_parallelism( } void rocksdb_options_optimize_for_point_lookup( - rocksdb_options_t* opt) { - opt->rep.OptimizeForPointLookup(); + rocksdb_options_t* opt, uint64_t block_cache_size_mb) { + opt->rep.OptimizeForPointLookup(block_cache_size_mb); } void rocksdb_options_optimize_level_style_compaction( @@ -1111,12 +1181,6 @@ void rocksdb_options_set_compaction_filter_factory_v2( opt->rep.compaction_filter_factory_v2 = std::shared_ptr(compaction_filter_factory_v2); } -void rocksdb_options_set_filter_policy( - rocksdb_options_t* opt, - rocksdb_filterpolicy_t* policy) { - opt->rep.filter_policy = policy; -} - void rocksdb_options_set_create_if_missing( rocksdb_options_t* opt, unsigned char v) { opt->rep.create_if_missing = v; @@ -1160,26 +1224,6 @@ void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) { opt->rep.max_open_files = n; } -void rocksdb_options_set_cache(rocksdb_options_t* opt, rocksdb_cache_t* c) { - if (c) { - opt->rep.block_cache = c->rep; - } -} - -void rocksdb_options_set_cache_compressed(rocksdb_options_t* opt, rocksdb_cache_t* c) { - if (c) { - opt->rep.block_cache_compressed = c->rep; - } -} - -void rocksdb_options_set_block_size(rocksdb_options_t* opt, size_t s) { - opt->rep.block_size = s; -} - -void rocksdb_options_set_block_restart_interval(rocksdb_options_t* opt, int n) { - opt->rep.block_restart_interval = n; -} - void rocksdb_options_set_target_file_size_base( rocksdb_options_t* opt, uint64_t n) { opt->rep.target_file_size_base = n; @@ -1272,11 +1316,6 @@ void rocksdb_options_set_prefix_extractor( opt->rep.prefix_extractor.reset(prefix_extractor); } -void rocksdb_options_set_whole_key_filtering( - rocksdb_options_t* opt, unsigned char v) { - opt->rep.whole_key_filtering = v; -} - void rocksdb_options_set_disable_data_sync( rocksdb_options_t* opt, int disable_data_sync) { opt->rep.disableDataSync = disable_data_sync; @@ 
-1287,11 +1326,6 @@ void rocksdb_options_set_use_fsync( opt->rep.use_fsync = use_fsync; } -void rocksdb_options_set_db_stats_log_interval( - rocksdb_options_t* opt, int db_stats_log_interval) { - opt->rep.db_stats_log_interval = db_stats_log_interval; -} - void rocksdb_options_set_db_log_dir( rocksdb_options_t* opt, const char* db_log_dir) { opt->rep.db_log_dir = db_log_dir; @@ -1351,11 +1385,6 @@ void rocksdb_options_set_stats_dump_period_sec( opt->rep.stats_dump_period_sec = v; } -void rocksdb_options_set_block_size_deviation( - rocksdb_options_t* opt, int v) { - opt->rep.block_size_deviation = v; -} - void rocksdb_options_set_advise_random_on_open( rocksdb_options_t* opt, unsigned char v) { opt->rep.advise_random_on_open = v; @@ -1450,11 +1479,6 @@ void rocksdb_options_set_max_manifest_file_size( opt->rep.max_manifest_file_size = v; } -void rocksdb_options_set_no_block_cache( - rocksdb_options_t* opt, unsigned char v) { - opt->rep.no_block_cache = v; -} - void rocksdb_options_set_table_cache_numshardbits( rocksdb_options_t* opt, int v) { opt->rep.table_cache_numshardbits = v; @@ -1474,10 +1498,6 @@ void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int di opt->rep.disable_auto_compactions = disable; } -void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t* opt, int disable) { - opt->rep.disable_seek_compaction = disable; -} - void rocksdb_options_set_delete_obsolete_files_period_micros( rocksdb_options_t* opt, uint64_t v) { opt->rep.delete_obsolete_files_period_micros = v; @@ -1824,6 +1844,13 @@ void rocksdb_readoptions_set_snapshot( opt->rep.snapshot = (snap ? 
snap->rep : nullptr); } +void rocksdb_readoptions_set_iterate_upper_bound( + rocksdb_readoptions_t* opt, + const char* key, size_t keylen) { + Slice prefix = Slice(key, keylen); + opt->rep.iterate_upper_bound = &prefix; +} + void rocksdb_readoptions_set_read_tier( rocksdb_readoptions_t* opt, int v) { opt->rep.read_tier = static_cast(v); diff --git a/db/c_test.c b/db/c_test.c index 065ea3423cd..171fd6d5c88 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -335,6 +335,7 @@ int main(int argc, char** argv) { rocksdb_cache_t* cache; rocksdb_env_t* env; rocksdb_options_t* options; + rocksdb_block_based_table_options_t* table_options; rocksdb_readoptions_t* roptions; rocksdb_writeoptions_t* woptions; char* err = NULL; @@ -353,14 +354,15 @@ int main(int argc, char** argv) { options = rocksdb_options_create(); rocksdb_options_set_comparator(options, cmp); rocksdb_options_set_error_if_exists(options, 1); - rocksdb_options_set_cache(options, cache); rocksdb_options_set_env(options, env); rocksdb_options_set_info_log(options, NULL); rocksdb_options_set_write_buffer_size(options, 100000); rocksdb_options_set_paranoid_checks(options, 1); rocksdb_options_set_max_open_files(options, 10); - rocksdb_options_set_block_size(options, 1024); - rocksdb_options_set_block_restart_interval(options, 8); + table_options = rocksdb_block_based_options_create(); + rocksdb_block_based_options_set_block_cache(table_options, cache); + rocksdb_options_set_block_based_table_factory(options, table_options); + rocksdb_options_set_compression(options, rocksdb_no_compression); rocksdb_options_set_compression_options(options, -14, -1, 0); int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression, @@ -540,10 +542,12 @@ int main(int argc, char** argv) { policy = rocksdb_filterpolicy_create_bloom(10); } + rocksdb_block_based_options_set_filter_policy(table_options, policy); + // Create new database rocksdb_close(db); rocksdb_destroy_db(options, dbname, &err); - 
rocksdb_options_set_filter_policy(options, policy); + rocksdb_options_set_block_based_table_factory(options, table_options); db = rocksdb_open(options, dbname, &err); CheckNoError(err); rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err); @@ -565,8 +569,9 @@ int main(int argc, char** argv) { CheckGet(db, roptions, "foo", "foovalue"); CheckGet(db, roptions, "bar", "barvalue"); } - rocksdb_options_set_filter_policy(options, NULL); - rocksdb_filterpolicy_destroy(policy); + // Reset the policy + rocksdb_block_based_options_set_filter_policy(table_options, NULL); + rocksdb_options_set_block_based_table_factory(options, table_options); } StartPhase("compaction_filter"); @@ -757,8 +762,7 @@ int main(int argc, char** argv) { StartPhase("prefix"); { // Create new database - rocksdb_filterpolicy_t* policy = rocksdb_filterpolicy_create_bloom(10); - rocksdb_options_set_filter_policy(options, policy); + rocksdb_options_set_allow_mmap_reads(options, 1); rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3)); rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4); rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16); @@ -795,13 +799,13 @@ int main(int argc, char** argv) { rocksdb_iter_get_error(iter, &err); CheckNoError(err); rocksdb_iter_destroy(iter); - rocksdb_filterpolicy_destroy(policy); } StartPhase("cleanup"); rocksdb_close(db); rocksdb_options_destroy(options); + rocksdb_block_based_options_destroy(table_options); rocksdb_readoptions_destroy(roptions); rocksdb_writeoptions_destroy(woptions); rocksdb_cache_destroy(cache); diff --git a/db/column_family.cc b/db/column_family.cc index d4467eea088..8b4e007ede6 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -9,6 +9,11 @@ #include "db/column_family.h" +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include #include #include #include @@ -19,11 +24,43 @@ #include "db/internal_stats.h" #include "db/compaction_picker.h" #include 
"db/table_properties_collector.h" +#include "db/write_controller.h" #include "util/autovector.h" #include "util/hash_skiplist_rep.h" +#include "util/options_helper.h" namespace rocksdb { +namespace { +// This function computes the amount of time in microseconds by which a write +// should be delayed based on the number of level-0 files according to the +// following formula: +// if n < bottom, return 0; +// if n >= top, return 1000; +// otherwise, let r = (n - bottom) / +// (top - bottom) +// and return r^2 * 1000. +// The goal of this formula is to gradually increase the rate at which writes +// are slowed. We also tried linear delay (r * 1000), but it seemed to do +// slightly worse. There is no other particular reason for choosing quadratic. +uint64_t SlowdownAmount(int n, double bottom, double top) { + uint64_t delay; + if (n >= top) { + delay = 1000; + } else if (n < bottom) { + delay = 0; + } else { + // If we are here, we know that: + // level0_start_slowdown <= n < level0_slowdown + // since the previous two conditions are false. + double how_much = static_cast(n - bottom) / (top - bottom); + delay = std::max(how_much * how_much * 1000, 100.0); + } + assert(delay <= 1000); + return delay; +} +} // namespace + ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, port::Mutex* mutex) : cfd_(cfd), db_(db), mutex_(mutex) { @@ -49,12 +86,14 @@ ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() { uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); } +const Comparator* ColumnFamilyHandleImpl::user_comparator() const { + return cfd()->user_comparator(); +} + ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp, - const InternalFilterPolicy* ipolicy, const ColumnFamilyOptions& src) { ColumnFamilyOptions result = src; result.comparator = icmp; - result.filter_policy = (src.filter_policy != nullptr) ? 
ipolicy : nullptr; #ifdef OS_MACOSX // TODO(icanadi) make write_buffer_size uint64_t instead of size_t ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, ((size_t)1) << 30); @@ -70,13 +109,7 @@ ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp, result.min_write_buffer_number_to_merge = std::min(result.min_write_buffer_number_to_merge, result.max_write_buffer_number - 1); - if (result.block_cache == nullptr && !result.no_block_cache) { - result.block_cache = NewLRUCache(8 << 20); - } result.compression_per_level = src.compression_per_level; - if (result.block_size_deviation < 0 || result.block_size_deviation > 100) { - result.block_size_deviation = 0; - } if (result.max_mem_compaction_level >= result.num_levels) { result.max_mem_compaction_level = result.num_levels - 1; } @@ -184,9 +217,9 @@ void SuperVersionUnrefHandle(void* ptr) { ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, Version* dummy_versions, Cache* table_cache, - const ColumnFamilyOptions& options, + const ColumnFamilyOptions& cf_options, const DBOptions* db_options, - const EnvOptions& storage_options, + const EnvOptions& env_options, ColumnFamilySet* column_family_set) : id_(id), name_(name), @@ -194,10 +227,10 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, current_(nullptr), refs_(0), dropped_(false), - internal_comparator_(options.comparator), - internal_filter_policy_(options.filter_policy), - options_(*db_options, SanitizeOptions(&internal_comparator_, - &internal_filter_policy_, options)), + internal_comparator_(cf_options.comparator), + options_(*db_options, SanitizeOptions(&internal_comparator_, cf_options)), + ioptions_(options_), + mutable_cf_options_(options_), mem_(nullptr), imm_(options_.min_write_buffer_number_to_merge), super_version_(nullptr), @@ -206,7 +239,6 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, next_(nullptr), prev_(nullptr), log_number_(0), - 
need_slowdown_for_num_level0_files_(false), column_family_set_(column_family_set) { Ref(); @@ -214,7 +246,7 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, if (dummy_versions != nullptr) { internal_stats_.reset( new InternalStats(options_.num_levels, db_options->env, this)); - table_cache_.reset(new TableCache(&options_, storage_options, table_cache)); + table_cache_.reset(new TableCache(ioptions_, env_options, table_cache)); if (options_.compaction_style == kCompactionStyleUniversal) { compaction_picker_.reset( new UniversalCompactionPicker(&options_, &internal_comparator_)); @@ -287,57 +319,82 @@ ColumnFamilyData::~ColumnFamilyData() { } void ColumnFamilyData::RecalculateWriteStallConditions() { - need_wait_for_num_memtables_ = - (imm()->size() == options()->max_write_buffer_number - 1); - - if (current_ != nullptr) { - need_wait_for_num_level0_files_ = - (current_->NumLevelFiles(0) >= options()->level0_stop_writes_trigger); - } else { - need_wait_for_num_level0_files_ = false; - } - - RecalculateWriteStallRateLimitsConditions(); -} - -void ColumnFamilyData::RecalculateWriteStallRateLimitsConditions() { if (current_ != nullptr) { - exceeds_hard_rate_limit_ = - (options()->hard_rate_limit > 1.0 && - current_->MaxCompactionScore() > options()->hard_rate_limit); - - exceeds_soft_rate_limit_ = - (options()->soft_rate_limit > 0.0 && - current_->MaxCompactionScore() > options()->soft_rate_limit); - } else { - exceeds_hard_rate_limit_ = false; - exceeds_soft_rate_limit_ = false; + const double score = current_->MaxCompactionScore(); + const int max_level = current_->MaxCompactionScoreLevel(); + + auto write_controller = column_family_set_->write_controller_; + + if (imm()->size() == options_.max_write_buffer_number) { + write_controller_token_ = write_controller->GetStopToken(); + internal_stats_->AddCFStats(InternalStats::MEMTABLE_COMPACTION, 1); + Log(options_.info_log, + "[%s] Stopping writes because we have %d immutable memtables " + 
"(waiting for flush)", + name_.c_str(), imm()->size()); + } else if (current_->NumLevelFiles(0) >= + options_.level0_stop_writes_trigger) { + write_controller_token_ = write_controller->GetStopToken(); + internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES, 1); + Log(options_.info_log, + "[%s] Stopping writes because we have %d level-0 files", + name_.c_str(), current_->NumLevelFiles(0)); + } else if (options_.level0_slowdown_writes_trigger >= 0 && + current_->NumLevelFiles(0) >= + options_.level0_slowdown_writes_trigger) { + uint64_t slowdown = SlowdownAmount( + current_->NumLevelFiles(0), options_.level0_slowdown_writes_trigger, + options_.level0_stop_writes_trigger); + write_controller_token_ = write_controller->GetDelayToken(slowdown); + internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN, slowdown); + Log(options_.info_log, + "[%s] Stalling writes because we have %d level-0 files (%" PRIu64 + "us)", + name_.c_str(), current_->NumLevelFiles(0), slowdown); + } else if (options_.hard_rate_limit > 1.0 && + score > options_.hard_rate_limit) { + uint64_t kHardLimitSlowdown = 1000; + write_controller_token_ = + write_controller->GetDelayToken(kHardLimitSlowdown); + internal_stats_->RecordLevelNSlowdown(max_level, kHardLimitSlowdown, + false); + Log(options_.info_log, + "[%s] Stalling writes because we hit hard limit on level %d. 
" + "(%" PRIu64 "us)", + name_.c_str(), max_level, kHardLimitSlowdown); + } else if (options_.soft_rate_limit > 0.0 && + score > options_.soft_rate_limit) { + uint64_t slowdown = SlowdownAmount(score, options_.soft_rate_limit, + options_.hard_rate_limit); + write_controller_token_ = write_controller->GetDelayToken(slowdown); + internal_stats_->RecordLevelNSlowdown(max_level, slowdown, true); + Log(options_.info_log, + "[%s] Stalling writes because we hit soft limit on level %d (%" PRIu64 + "us)", + name_.c_str(), max_level, slowdown); + } else { + write_controller_token_.reset(); + } } } const EnvOptions* ColumnFamilyData::soptions() const { - return &(column_family_set_->storage_options_); + return &(column_family_set_->env_options_); } -void ColumnFamilyData::SetCurrent(Version* current) { - current_ = current; - need_slowdown_for_num_level0_files_ = - (options_.level0_slowdown_writes_trigger >= 0 && - current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger); -} +void ColumnFamilyData::SetCurrent(Version* current) { current_ = current; } -void ColumnFamilyData::CreateNewMemtable() { +void ColumnFamilyData::CreateNewMemtable(const MemTableOptions& moptions) { assert(current_ != nullptr); if (mem_ != nullptr) { delete mem_->Unref(); } - mem_ = new MemTable(internal_comparator_, options_); + mem_ = new MemTable(internal_comparator_, ioptions_, moptions); mem_->Ref(); } Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) { auto result = compaction_picker_->PickCompaction(current_, log_buffer); - RecalculateWriteStallRateLimitsConditions(); return result; } @@ -434,7 +491,15 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) { SuperVersion* ColumnFamilyData::InstallSuperVersion( SuperVersion* new_superversion, port::Mutex* db_mutex) { + db_mutex->AssertHeld(); + return InstallSuperVersion(new_superversion, db_mutex, mutable_cf_options_); +} + +SuperVersion* ColumnFamilyData::InstallSuperVersion( + SuperVersion* 
new_superversion, port::Mutex* db_mutex, + const MutableCFOptions& mutable_cf_options) { new_superversion->db_mutex = db_mutex; + new_superversion->mutable_cf_options = mutable_cf_options; new_superversion->Init(mem_, imm_.current(), current_); SuperVersion* old_superversion = super_version_; super_version_ = new_superversion; @@ -470,19 +535,32 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() { } } +bool ColumnFamilyData::SetOptions( + const std::unordered_map& options_map) { + MutableCFOptions new_mutable_cf_options; + if (GetMutableOptionsFromStrings(mutable_cf_options_, options_map, + &new_mutable_cf_options)) { + mutable_cf_options_ = new_mutable_cf_options; + return true; + } + return false; +} + ColumnFamilySet::ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, - const EnvOptions& storage_options, - Cache* table_cache) + const EnvOptions& env_options, + Cache* table_cache, + WriteController* write_controller) : max_column_family_(0), dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, ColumnFamilyOptions(), db_options, - storage_options_, nullptr)), + env_options, nullptr)), default_cfd_cache_(nullptr), db_name_(dbname), db_options_(db_options), - storage_options_(storage_options), + env_options_(env_options), table_cache_(table_cache), + write_controller_(write_controller), spin_lock_(ATOMIC_FLAG_INIT) { // initialize linked list dummy_cfd_->prev_ = dummy_cfd_; @@ -547,7 +625,7 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( assert(column_families_.find(name) == column_families_.end()); ColumnFamilyData* new_cfd = new ColumnFamilyData(id, name, dummy_versions, table_cache_, options, - db_options_, storage_options_, this); + db_options_, env_options_, this); Lock(); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); @@ -606,6 +684,11 @@ bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) { column_family_set_->Lock(); current_ = 
column_family_set_->GetColumnFamily(column_family_id); column_family_set_->Unlock(); + // TODO(icanadi) Maybe remove column family from the hash table when it's + // dropped? + if (current_ != nullptr && current_->IsDropped()) { + current_ = nullptr; + } } handle_.SetCFD(current_); return current_ != nullptr; @@ -631,4 +714,29 @@ ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() { return &handle_; } +void ColumnFamilyMemTablesImpl::CheckMemtableFull() { + if (current_ != nullptr && current_->mem()->ShouldScheduleFlush()) { + flush_scheduler_->ScheduleFlush(current_); + current_->mem()->MarkFlushScheduled(); + } +} + +uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) { + uint32_t column_family_id = 0; + if (column_family != nullptr) { + auto cfh = reinterpret_cast(column_family); + column_family_id = cfh->GetID(); + } + return column_family_id; +} + +const Comparator* GetColumnFamilyUserComparator( + ColumnFamilyHandle* column_family) { + if (column_family != nullptr) { + auto cfh = reinterpret_cast(column_family); + return cfh->user_comparator(); + } + return nullptr; +} + } // namespace rocksdb diff --git a/db/column_family.h b/db/column_family.h index ecd9f21fb40..65b4b53bad4 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -19,8 +19,11 @@ #include "rocksdb/env.h" #include "db/memtable_list.h" #include "db/write_batch_internal.h" +#include "db/write_controller.h" #include "db/table_cache.h" #include "util/thread_local.h" +#include "db/flush_scheduler.h" +#include "util/mutable_cf_options.h" namespace rocksdb { @@ -46,6 +49,7 @@ class ColumnFamilyHandleImpl : public ColumnFamilyHandle { // destroy without mutex virtual ~ColumnFamilyHandleImpl(); virtual ColumnFamilyData* cfd() const { return cfd_; } + virtual const Comparator* user_comparator() const; virtual uint32_t GetID() const; @@ -78,6 +82,7 @@ struct SuperVersion { MemTable* mem; MemTableListVersion* imm; Version* current; + MutableCFOptions mutable_cf_options; 
std::atomic refs; // We need to_delete because during Cleanup(), imm->Unref() returns // all memtables that we need to free through this vector. We then @@ -113,7 +118,6 @@ struct SuperVersion { }; extern ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp, - const InternalFilterPolicy* ipolicy, const ColumnFamilyOptions& src); class ColumnFamilySet; @@ -133,7 +137,7 @@ class ColumnFamilyData { void Ref() { ++refs_; } // will just decrease reference count to 0, but will not delete it. returns // true if the ref count was decreased to zero. in that case, it can be - // deleted by the caller immediatelly, or later, by calling + // deleted by the caller immediately, or later, by calling // FreeDeadColumnFamilies() bool Unref() { assert(refs_ > 0); @@ -157,6 +161,7 @@ class ColumnFamilyData { // can't drop default CF assert(id_ != 0); dropped_ = true; + write_controller_token_.reset(); } bool IsDropped() const { return dropped_; } @@ -169,6 +174,21 @@ class ColumnFamilyData { // thread-safe const Options* options() const { return &options_; } const EnvOptions* soptions() const; + const ImmutableCFOptions* ioptions() const { return &ioptions_; } + // REQUIRES: DB mutex held + // This returns the MutableCFOptions used by current SuperVersion + // You shoul use this API to reference MutableCFOptions most of the time. + const MutableCFOptions* mutable_cf_options() const { + return &(super_version_->mutable_cf_options); + } + // REQUIRES: DB mutex held + // This returns the latest MutableCFOptions, which may be not in effect yet. 
+ const MutableCFOptions* GetLatestMutableCFOptions() const { + return &mutable_cf_options_; + } + // REQUIRES: DB mutex held + bool SetOptions( + const std::unordered_map& options_map); InternalStats* internal_stats() { return internal_stats_.get(); } @@ -178,7 +198,7 @@ class ColumnFamilyData { Version* dummy_versions() { return dummy_versions_; } void SetMemtable(MemTable* new_mem) { mem_ = new_mem; } void SetCurrent(Version* current); - void CreateNewMemtable(); + void CreateNewMemtable(const MemTableOptions& moptions); TableCache* table_cache() const { return table_cache_.get(); } @@ -219,40 +239,20 @@ class ColumnFamilyData { // if its reference count is zero and needs deletion or nullptr if not // As argument takes a pointer to allocated SuperVersion to enable // the clients to allocate SuperVersion outside of mutex. + SuperVersion* InstallSuperVersion(SuperVersion* new_superversion, + port::Mutex* db_mutex, + const MutableCFOptions& mutable_cf_options); SuperVersion* InstallSuperVersion(SuperVersion* new_superversion, port::Mutex* db_mutex); void ResetThreadLocalSuperVersions(); - // A Flag indicating whether write needs to slowdown because of there are - // too many number of level0 files. 
- bool NeedSlowdownForNumLevel0Files() const { - return need_slowdown_for_num_level0_files_; - } - - bool NeedWaitForNumLevel0Files() const { - return need_wait_for_num_level0_files_; - } - - bool NeedWaitForNumMemtables() const { - return need_wait_for_num_memtables_; - } - - bool ExceedsSoftRateLimit() const { - return exceeds_soft_rate_limit_; - } - - bool ExceedsHardRateLimit() const { - return exceeds_hard_rate_limit_; - } - private: friend class ColumnFamilySet; ColumnFamilyData(uint32_t id, const std::string& name, Version* dummy_versions, Cache* table_cache, const ColumnFamilyOptions& options, - const DBOptions* db_options, - const EnvOptions& storage_options, + const DBOptions* db_options, const EnvOptions& env_options, ColumnFamilySet* column_family_set); // Recalculate some small conditions, which are changed only during @@ -261,7 +261,6 @@ class ColumnFamilyData { // DBImpl::MakeRoomForWrite function to decide, if it need to make // a write stall void RecalculateWriteStallConditions(); - void RecalculateWriteStallRateLimitsConditions(); uint32_t id_; const std::string name_; @@ -272,9 +271,10 @@ class ColumnFamilyData { bool dropped_; // true if client dropped it const InternalKeyComparator internal_comparator_; - const InternalFilterPolicy internal_filter_policy_; - Options const options_; + const Options options_; + const ImmutableCFOptions ioptions_; + MutableCFOptions mutable_cf_options_; std::unique_ptr table_cache_; @@ -303,31 +303,13 @@ class ColumnFamilyData { // recovered from uint64_t log_number_; - // A flag indicating whether we should delay writes because - // we have too many level 0 files - bool need_slowdown_for_num_level0_files_; - - // These 4 variables are updated only after compaction, - // adding new memtable, flushing memtables to files - // and/or add recalculation of compaction score. - // That's why theirs values are cached in ColumnFamilyData. 
- // Recalculation is made by RecalculateWriteStallConditions and - // RecalculateWriteStallRateLimitsConditions function. They are used - // in DBImpl::MakeRoomForWrite function to decide, if it need - // to sleep during write operation - bool need_wait_for_num_memtables_; - - bool need_wait_for_num_level0_files_; - - bool exceeds_hard_rate_limit_; - - bool exceeds_soft_rate_limit_; - // An object that keeps all the compaction stats // and picks the next compaction std::unique_ptr compaction_picker_; ColumnFamilySet* column_family_set_; + + std::unique_ptr write_controller_token_; }; // ColumnFamilySet has interesting thread-safety requirements @@ -369,7 +351,8 @@ class ColumnFamilySet { }; ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, - const EnvOptions& storage_options, Cache* table_cache); + const EnvOptions& env_options, Cache* table_cache, + WriteController* write_controller); ~ColumnFamilySet(); ColumnFamilyData* GetDefault() const; @@ -422,8 +405,9 @@ class ColumnFamilySet { const std::string db_name_; const DBOptions* const db_options_; - const EnvOptions storage_options_; + const EnvOptions env_options_; Cache* table_cache_; + WriteController* write_controller_; std::atomic_flag spin_lock_; }; @@ -431,8 +415,11 @@ class ColumnFamilySet { // memtables of different column families (specified by ID in the write batch) class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { public: - explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set) - : column_family_set_(column_family_set), current_(nullptr) {} + explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set, + FlushScheduler* flush_scheduler) + : column_family_set_(column_family_set), + current_(nullptr), + flush_scheduler_(flush_scheduler) {} // sets current_ to ColumnFamilyData with column_family_id // returns false if column family doesn't exist @@ -451,10 +438,18 @@ class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { // Returns 
column family handle for the selected column family virtual ColumnFamilyHandle* GetColumnFamilyHandle() override; + virtual void CheckMemtableFull() override; + private: ColumnFamilySet* column_family_set_; ColumnFamilyData* current_; + FlushScheduler* flush_scheduler_; ColumnFamilyHandleInternal handle_; }; +extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family); + +extern const Comparator* GetColumnFamilyUserComparator( + ColumnFamilyHandle* column_family); + } // namespace rocksdb diff --git a/db/column_family_test.cc b/db/column_family_test.cc index 75a4bc5c75c..b96e66829a5 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -408,9 +408,15 @@ TEST(ColumnFamilyTest, WriteBatchFailure) { Open(); CreateColumnFamiliesAndReopen({"one", "two"}); WriteBatch batch; + batch.Put(handles_[0], Slice("existing"), Slice("column-family")); batch.Put(handles_[1], Slice("non-existing"), Slice("column-family")); ASSERT_OK(db_->Write(WriteOptions(), &batch)); DropColumnFamilies({1}); + WriteOptions woptions_ignore_missing_cf; + woptions_ignore_missing_cf.ignore_missing_column_families = true; + batch.Put(handles_[0], Slice("still here"), Slice("column-family")); + ASSERT_OK(db_->Write(woptions_ignore_missing_cf, &batch)); + ASSERT_EQ("column-family", Get(0, "still here")); Status s = db_->Write(WriteOptions(), &batch); ASSERT_TRUE(s.IsInvalidArgument()); Close(); @@ -746,9 +752,10 @@ TEST(ColumnFamilyTest, DifferentCompactionStyles) { default_cf.num_levels = 3; default_cf.write_buffer_size = 64 << 10; // 64KB default_cf.target_file_size_base = 30 << 10; - default_cf.filter_policy = nullptr; - default_cf.no_block_cache = true; default_cf.source_compaction_factor = 100; + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); one.compaction_style = kCompactionStyleUniversal; // trigger compaction if there are >= 4 files diff --git a/db/compaction.cc 
b/db/compaction.cc index 0bffa0162fa..28a3174b015 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -9,7 +9,10 @@ #include "db/compaction.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include @@ -110,8 +113,8 @@ void Compaction::AddInputDeletions(VersionEdit* edit) { } bool Compaction::KeyNotExistsBeyondOutputLevel(const Slice& user_key) { - assert(cfd_->options()->compaction_style != kCompactionStyleFIFO); - if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { + assert(cfd_->ioptions()->compaction_style != kCompactionStyleFIFO); + if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) { return bottommost_level_; } // Maybe use binary search to find right entry instead of linear search? @@ -174,8 +177,8 @@ void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) { // Is this compaction producing files at the bottommost level? void Compaction::SetupBottomMostLevel(bool is_manual) { - assert(cfd_->options()->compaction_style != kCompactionStyleFIFO); - if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { + assert(cfd_->ioptions()->compaction_style != kCompactionStyleFIFO); + if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) { // If universal compaction style is used and manual // compaction is occuring, then we are guaranteed that // all files will be picked in a single compaction @@ -267,7 +270,7 @@ void Compaction::Summary(char* output, int len) { uint64_t Compaction::OutputFilePreallocationSize() { uint64_t preallocation_size = 0; - if (cfd_->options()->compaction_style == kCompactionStyleLevel) { + if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { preallocation_size = cfd_->compaction_picker()->MaxFileSizeForLevel(output_level()); } else { diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 512abf8fa6b..04d5c6f4735 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -9,7 +9,10 @@ #include 
"db/compaction_picker.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include "db/filename.h" @@ -39,13 +42,13 @@ CompressionType GetCompressionType(const Options& options, int level, return kNoCompression; } // If the use has specified a different compression level for each level, - // then pick the compresison for that level. + // then pick the compression for that level. if (!options.compression_per_level.empty()) { const int n = options.compression_per_level.size() - 1; // It is possible for level_ to be -1; in that case, we use level // 0's compression. This occurs mostly in backwards compatibility // situations when the builder doesn't know what level the file - // belongs to. Likewise, if level_ is beyond the end of the + // belongs to. Likewise, if level is beyond the end of the // specified compression levels, use the last value. return options.compression_per_level[std::max(0, std::min(level, n))]; } else { @@ -173,9 +176,12 @@ void CompactionPicker::GetRange(const std::vector& inputs1, } bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { + assert(c != nullptr); // If inputs are empty then there is nothing to expand. - if (!c || c->inputs_[0].empty()) { - return true; + if (c->inputs_[0].empty()) { + assert(c->inputs_[1].empty()); + // This isn't good compaction + return false; } // GetOverlappingInputs will always do the right thing for level-0. 
@@ -427,7 +433,7 @@ Compaction* LevelCompactionPicker::PickCompaction(Version* version, level = version->compaction_level_[i]; if ((version->compaction_score_[i] >= 1)) { c = PickCompactionBySize(version, level, version->compaction_score_[i]); - if (ExpandWhileOverlapping(c) == false) { + if (c == nullptr || ExpandWhileOverlapping(c) == false) { delete c; c = nullptr; } else { diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 4726e92b920..09d78f89fb9 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -45,7 +45,9 @@ class CorruptionTest { db_ = nullptr; options_.create_if_missing = true; - options_.block_size_deviation = 0; // make unit test pass for now + BlockBasedTableOptions table_options; + table_options.block_size_deviation = 0; // make unit test pass for now + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(); options_.create_if_missing = false; } @@ -60,9 +62,11 @@ class CorruptionTest { db_ = nullptr; Options opt = (options ? 
*options : options_); opt.env = &env_; - opt.block_cache = tiny_cache_; - opt.block_size_deviation = 0; opt.arena_block_size = 4096; + BlockBasedTableOptions table_options; + table_options.block_cache = tiny_cache_; + table_options.block_size_deviation = 0; + opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); return DB::Open(opt, dbname_, &db_); } @@ -328,6 +332,9 @@ TEST(CorruptionTest, CorruptedDescriptor) { } TEST(CorruptionTest, CompactionInputError) { + Options options; + options.max_background_flushes = 0; + Reopen(&options); Build(10); DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_FlushMemTable(); @@ -347,6 +354,7 @@ TEST(CorruptionTest, CompactionInputErrorParanoid) { options.paranoid_checks = true; options.write_buffer_size = 131072; options.max_write_buffer_number = 2; + options.max_background_flushes = 0; Reopen(&options); DBImpl* dbi = reinterpret_cast(db_); diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index aa479d2ffc1..2652d1776c8 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -131,8 +131,6 @@ TEST(CuckooTableDBTest, Flush) { ASSERT_EQ("v2", Get("key2")); ASSERT_EQ("v3", Get("key3")); ASSERT_EQ("NOT_FOUND", Get("key4")); - ASSERT_EQ("Invalid argument: Length of key is invalid.", Get("somelongkey")); - ASSERT_EQ("Invalid argument: Length of key is invalid.", Get("s")); // Now add more keys and flush. ASSERT_OK(Put("key4", "v4")); @@ -195,6 +193,38 @@ static std::string Key(int i) { snprintf(buf, sizeof(buf), "key_______%06d", i); return std::string(buf); } +static std::string Uint64Key(uint64_t i) { + std::string str; + str.resize(8); + memcpy(&str[0], static_cast(&i), 8); + return str; +} +} // namespace. 
+ +TEST(CuckooTableDBTest, Uint64Comparator) { + Options options = CurrentOptions(); + options.comparator = test::Uint64Comparator(); + Reopen(&options); + + ASSERT_OK(Put(Uint64Key(1), "v1")); + ASSERT_OK(Put(Uint64Key(2), "v2")); + ASSERT_OK(Put(Uint64Key(3), "v3")); + dbfull()->TEST_FlushMemTable(); + + ASSERT_EQ("v1", Get(Uint64Key(1))); + ASSERT_EQ("v2", Get(Uint64Key(2))); + ASSERT_EQ("v3", Get(Uint64Key(3))); + ASSERT_EQ("NOT_FOUND", Get(Uint64Key(4))); + + // Add more keys. + ASSERT_OK(Delete(Uint64Key(2))); // Delete. + ASSERT_OK(Put(Uint64Key(3), "v0")); // Update. + ASSERT_OK(Put(Uint64Key(4), "v4")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v1", Get(Uint64Key(1))); + ASSERT_EQ("NOT_FOUND", Get(Uint64Key(2))); + ASSERT_EQ("v0", Get(Uint64Key(3))); + ASSERT_EQ("v4", Get(Uint64Key(4))); } TEST(CuckooTableDBTest, CompactionTrigger) { @@ -215,14 +245,38 @@ TEST(CuckooTableDBTest, CompactionTrigger) { ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx))); } dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_EQ("2", FilesPerLevel()); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); ASSERT_EQ("0,2", FilesPerLevel()); for (int idx = 0; idx < 22; ++idx) { ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx))); } } +TEST(CuckooTableDBTest, CompactionIntoMultipleFiles) { + // Create a big L0 file and check it compacts into multiple files in L1. + Options options = CurrentOptions(); + options.write_buffer_size = 270 << 10; + // Two SST files should be created, each containing 14 keys. + // Number of buckets will be 16. Total size ~156 KB. 
+ options.target_file_size_base = 160 << 10; + Reopen(&options); + + // Write 28 values, each 10016 B ~ 10KB + for (int idx = 0; idx < 28; ++idx) { + ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx))); + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ("1", FilesPerLevel()); + + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_EQ("0,2", FilesPerLevel()); + for (int idx = 0; idx < 28; ++idx) { + ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx))); + } +} + TEST(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) { // Insert same key twice so that they go to different SST files. Then wait for // compaction and check if the latest value is stored and old value removed. diff --git a/db/db_bench.cc b/db/db_bench.cc index f208b8181ea..d90c628a9ff 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -7,7 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif #ifndef GFLAGS #include @@ -37,8 +39,8 @@ int main() { #include "rocksdb/memtablerep.h" #include "rocksdb/write_batch.h" #include "rocksdb/slice.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/slice_transform.h" -#include "rocksdb/statistics.h" #include "rocksdb/perf_context.h" #include "port/port.h" #include "port/stack_trace.h" @@ -146,6 +148,7 @@ DEFINE_int64(merge_keys, -1, "Number of distinct keys to use for MergeRandom and " "ReadRandomMergeRandom. " "If negative, there will be FLAGS_num keys."); +DEFINE_int32(num_column_families, 1, "Number of Column Families to use."); DEFINE_int64(reads, -1, "Number of read operations to do. " "If negative, do FLAGS_num reads."); @@ -162,6 +165,7 @@ DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run." 
DEFINE_int32(value_size, 100, "Size of each value"); +DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator"); static bool ValidateKeySize(const char* flagname, int32_t value) { return true; @@ -238,10 +242,11 @@ DEFINE_int32(universal_compression_size_percent, -1, DEFINE_int64(cache_size, -1, "Number of bytes to use as a cache of uncompressed" "data. Negative means use default settings."); -DEFINE_int32(block_size, rocksdb::Options().block_size, +DEFINE_int32(block_size, rocksdb::BlockBasedTableOptions().block_size, "Number of bytes in a block."); -DEFINE_int32(block_restart_interval, rocksdb::Options().block_restart_interval, +DEFINE_int32(block_restart_interval, + rocksdb::BlockBasedTableOptions().block_restart_interval, "Number of keys between restart points " "for delta encoding of keys."); @@ -302,7 +307,7 @@ DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL"); DEFINE_int32(num_levels, 7, "The total number of levels"); -DEFINE_int32(target_file_size_base, 2 * 1048576, "Target file size at level-1"); +DEFINE_int64(target_file_size_base, 2 * 1048576, "Target file size at level-1"); DEFINE_int32(target_file_size_multiplier, 1, "A multiplier to compute target level-N file size (N >= 2)"); @@ -509,6 +514,9 @@ DEFINE_int64(keys_per_prefix, 0, "control average number of keys generated " "i.e. use the prefix comes with the generated random number."); DEFINE_bool(enable_io_prio, false, "Lower the background flush/compaction " "threads' IO priority"); +DEFINE_bool(identity_as_first_hash, false, "the first hash function of cuckoo " + "table becomes an identity function. This is only valid when key " + "is 8 bytes"); enum RepFactory { kSkipList, @@ -548,7 +556,9 @@ DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table."); DEFINE_bool(use_hash_search, false, "if use kHashSearch " "instead of kBinarySearch. 
" "This is valid if only we use BlockTable"); - +DEFINE_bool(use_block_based_filter, false, "if use kBlockBasedFilter " + "instead of kFullFilter for filter block. " + "This is valid if only we use BlockTable"); DEFINE_string(merge_operator, "", "The merge operator to use with the database." "If a new merge operator is specified, be sure to use fresh" " database The possible merge operators are defined in" @@ -843,12 +853,19 @@ class Duration { class Benchmark { private: - shared_ptr cache_; - shared_ptr compressed_cache_; - const FilterPolicy* filter_policy_; + std::shared_ptr cache_; + std::shared_ptr compressed_cache_; + std::shared_ptr filter_policy_; const SliceTransform* prefix_extractor_; - DB* db_; - std::vector multi_dbs_; + struct DBWithColumnFamilies { + std::vector cfh; + DB* db; + DBWithColumnFamilies() : db(nullptr) { + cfh.clear(); + } + }; + DBWithColumnFamilies db_; + std::vector multi_dbs_; int64_t num_; int value_size_; int key_size_; @@ -1064,11 +1081,10 @@ class Benchmark { (FLAGS_cache_numshardbits >= 1 ? NewLRUCache(FLAGS_compressed_cache_size, FLAGS_cache_numshardbits) : NewLRUCache(FLAGS_compressed_cache_size)) : nullptr), - filter_policy_(FLAGS_bloom_bits >= 0 - ? NewBloomFilterPolicy(FLAGS_bloom_bits) - : nullptr), + filter_policy_(FLAGS_bloom_bits >= 0 ? 
+ NewBloomFilterPolicy(FLAGS_bloom_bits, FLAGS_use_block_based_filter) + : nullptr), prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)), - db_(nullptr), num_(FLAGS_num), value_size_(FLAGS_value_size), key_size_(FLAGS_key_size), @@ -1099,8 +1115,9 @@ class Benchmark { } ~Benchmark() { - delete db_; - delete filter_policy_; + std::for_each(db_.cfh.begin(), db_.cfh.end(), + [](ColumnFamilyHandle* cfh) { delete cfh; }); + delete db_.db; delete prefix_extractor_; } @@ -1159,6 +1176,16 @@ class Benchmark { return base_name + std::to_string(id); } + std::string ColumnFamilyName(int i) { + if (i == 0) { + return kDefaultColumnFamilyName; + } else { + char name[100]; + snprintf(name, sizeof(name), "column_family_name_%06d", i); + return std::string(name); + } + } + void Run() { if (!SanityCheck()) { exit(1); @@ -1313,13 +1340,16 @@ class Benchmark { name.ToString().c_str()); method = nullptr; } else { - if (db_ != nullptr) { - delete db_; - db_ = nullptr; + if (db_.db != nullptr) { + std::for_each(db_.cfh.begin(), db_.cfh.end(), + [](ColumnFamilyHandle* cfh) { delete cfh; }); + delete db_.db; + db_.db = nullptr; + db_.cfh.clear(); DestroyDB(FLAGS_db, Options()); } for (size_t i = 0; i < multi_dbs_.size(); i++) { - delete multi_dbs_[i]; + delete multi_dbs_[i].db; DestroyDB(GetDbNameForMultiple(FLAGS_db, i), Options()); } multi_dbs_.clear(); @@ -1491,7 +1521,7 @@ class Benchmark { void Compress(ThreadState *thread) { RandomGenerator gen; - Slice input = gen.Generate(Options().block_size); + Slice input = gen.Generate(FLAGS_block_size); int64_t bytes = 0; int64_t produced = 0; bool ok = true; @@ -1541,7 +1571,7 @@ class Benchmark { void Uncompress(ThreadState *thread) { RandomGenerator gen; - Slice input = gen.Generate(Options().block_size); + Slice input = gen.Generate(FLAGS_block_size); std::string compressed; bool ok; @@ -1617,14 +1647,10 @@ class Benchmark { } void Open() { - assert(db_ == nullptr); + assert(db_.db == nullptr); Options options; 
options.create_if_missing = !FLAGS_use_existing_db; - options.block_cache = cache_; - options.block_cache_compressed = compressed_cache_; - if (cache_ == nullptr) { - options.no_block_cache = true; - } + options.create_missing_column_families = FLAGS_num_column_families > 1; options.write_buffer_size = FLAGS_write_buffer_size; options.max_write_buffer_number = FLAGS_max_write_buffer_number; options.min_write_buffer_number_to_merge = @@ -1632,13 +1658,17 @@ class Benchmark { options.max_background_compactions = FLAGS_max_background_compactions; options.max_background_flushes = FLAGS_max_background_flushes; options.compaction_style = FLAGS_compaction_style_e; - options.block_size = FLAGS_block_size; - options.block_restart_interval = FLAGS_block_restart_interval; - options.filter_policy = filter_policy_; if (FLAGS_prefix_size != 0) { options.prefix_extractor.reset( NewFixedPrefixTransform(FLAGS_prefix_size)); } + if (FLAGS_use_uint64_comparator) { + options.comparator = test::Uint64Comparator(); + if (FLAGS_key_size != 8) { + fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n"); + exit(1); + } + } options.memtable_prefix_bloom_bits = FLAGS_memtable_bloom_bits; options.bloom_locality = FLAGS_bloom_locality; options.max_open_files = FLAGS_open_files; @@ -1712,8 +1742,11 @@ class Benchmark { fprintf(stderr, "Invalid cuckoo_hash_ratio\n"); exit(1); } + rocksdb::CuckooTableOptions table_options; + table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio; + table_options.identity_as_first_hash = FLAGS_identity_as_first_hash; options.table_factory = std::shared_ptr( - NewCuckooTableFactory(FLAGS_cuckoo_hash_ratio)); + NewCuckooTableFactory(table_options)); } else { BlockBasedTableOptions block_based_options; if (FLAGS_use_hash_search) { @@ -1726,6 +1759,14 @@ class Benchmark { } else { block_based_options.index_type = BlockBasedTableOptions::kBinarySearch; } + if (cache_ == nullptr) { + block_based_options.no_block_cache = true; + } + 
block_based_options.block_cache = cache_; + block_based_options.block_cache_compressed = compressed_cache_; + block_based_options.block_size = FLAGS_block_size; + block_based_options.block_restart_interval = FLAGS_block_restart_interval; + block_based_options.filter_policy = filter_policy_; options.table_factory.reset( NewBlockBasedTableFactory(block_based_options)); } @@ -1816,10 +1857,9 @@ class Benchmark { OpenDb(options, FLAGS_db, &db_); } else { multi_dbs_.clear(); + multi_dbs_.resize(FLAGS_num_multi_db); for (int i = 0; i < FLAGS_num_multi_db; i++) { - DB* db; - OpenDb(options, GetDbNameForMultiple(FLAGS_db, i), &db); - multi_dbs_.push_back(db); + OpenDb(options, GetDbNameForMultiple(FLAGS_db, i), &multi_dbs_[i]); } } if (FLAGS_min_level_to_compress >= 0) { @@ -1827,12 +1867,27 @@ class Benchmark { } } - void OpenDb(Options options, std::string db_name, DB** db) { + void OpenDb(const Options& options, const std::string& db_name, + DBWithColumnFamilies* db) { Status s; - if(FLAGS_readonly) { - s = DB::OpenForReadOnly(options, db_name, db); + // Open with column families if necessary. 
+ if (FLAGS_num_column_families > 1) { + db->cfh.resize(FLAGS_num_column_families); + std::vector column_families; + for (int i = 0; i < FLAGS_num_column_families; i++) { + column_families.push_back(ColumnFamilyDescriptor( + ColumnFamilyName(i), ColumnFamilyOptions(options))); + } + if (FLAGS_readonly) { + s = DB::OpenForReadOnly(options, db_name, column_families, + &db->cfh, &db->db); + } else { + s = DB::Open(options, db_name, column_families, &db->cfh, &db->db); + } + } else if (FLAGS_readonly) { + s = DB::OpenForReadOnly(options, db_name, &db->db); } else { - s = DB::Open(options, db_name, db); + s = DB::Open(options, db_name, &db->db); } if (!s.ok()) { fprintf(stderr, "open error: %s\n", s.ToString().c_str()); @@ -1900,10 +1955,18 @@ class Benchmark { }; DB* SelectDB(ThreadState* thread) { - if (db_ != nullptr) { - return db_; - } else { - return multi_dbs_[thread->rand.Next() % multi_dbs_.size()]; + return SelectDBWithCfh(thread)->db; + } + + DBWithColumnFamilies* SelectDBWithCfh(ThreadState* thread) { + return SelectDBWithCfh(thread->rand.Next()); + } + + DBWithColumnFamilies* SelectDBWithCfh(uint64_t rand_int) { + if (db_.db != nullptr) { + return &db_; + } else { + return &multi_dbs_[rand_int % multi_dbs_.size()]; } } @@ -1912,7 +1975,7 @@ class Benchmark { const int64_t num_ops = writes_ == 0 ? 
num_ : writes_; size_t num_key_gens = 1; - if (db_ == nullptr) { + if (db_.db == nullptr) { num_key_gens = multi_dbs_.size(); } std::vector> key_gens(num_key_gens); @@ -1935,20 +1998,25 @@ class Benchmark { Slice key = AllocateKey(); std::unique_ptr key_guard(key.data()); while (!duration.Done(entries_per_batch_)) { - size_t id = 0; - DB* db_to_write = db_; - if (db_to_write == nullptr) { - id = thread->rand.Next() % num_key_gens; - db_to_write = multi_dbs_[id]; - } + size_t id = thread->rand.Next() % num_key_gens; + DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id); batch.Clear(); for (int64_t j = 0; j < entries_per_batch_; j++) { - GenerateKeyFromInt(key_gens[id]->Next(), FLAGS_num, &key); - batch.Put(key, gen.Generate(value_size_)); + int64_t rand_num = key_gens[id]->Next(); + GenerateKeyFromInt(rand_num, FLAGS_num, &key); + if (FLAGS_num_column_families <= 1) { + batch.Put(key, gen.Generate(value_size_)); + } else { + // We use same rand_num as seed for key and column family so that we + // can deterministically find the cfh corresponding to a particular + // key while reading the key. 
+ batch.Put(db_with_cfh->cfh[rand_num % db_with_cfh->cfh.size()], + key, gen.Generate(value_size_)); + } bytes += value_size_ + key_size_; } - s = db_to_write->Write(write_options_, &batch); - thread->stats.FinishedOps(db_to_write, entries_per_batch_); + s = db_with_cfh->db->Write(write_options_, &batch); + thread->stats.FinishedOps(db_with_cfh->db, entries_per_batch_); if (!s.ok()) { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); @@ -1958,11 +2026,11 @@ class Benchmark { } void ReadSequential(ThreadState* thread) { - if (db_ != nullptr) { - ReadSequential(thread, db_); + if (db_.db != nullptr) { + ReadSequential(thread, db_.db); } else { - for (DB* db : multi_dbs_) { - ReadSequential(thread, db); + for (const auto& db_with_cfh : multi_dbs_) { + ReadSequential(thread, db_with_cfh.db); } } } @@ -1981,11 +2049,11 @@ class Benchmark { } void ReadReverse(ThreadState* thread) { - if (db_ != nullptr) { - ReadReverse(thread, db_); + if (db_.db != nullptr) { + ReadReverse(thread, db_.db); } else { - for (DB* db : multi_dbs_) { - ReadReverse(thread, db); + for (const auto& db_with_cfh : multi_dbs_) { + ReadReverse(thread, db_with_cfh.db); } } } @@ -1996,7 +2064,7 @@ class Benchmark { int64_t bytes = 0; for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) { bytes += iter->key().size() + iter->value().size(); - thread->stats.FinishedOps(db_, 1); + thread->stats.FinishedOps(db, 1); ++i; } delete iter; @@ -2013,13 +2081,24 @@ class Benchmark { Duration duration(FLAGS_duration, reads_); while (!duration.Done(1)) { - DB* db = SelectDB(thread); - GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); + DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread); + // We use same key_rand as seed for key and column family so that we can + // deterministically find the cfh corresponding to a particular key, as it + // is done in DoWrite method. 
+ int64_t key_rand = thread->rand.Next() % FLAGS_num; + GenerateKeyFromInt(key_rand, FLAGS_num, &key); read++; - if (db->Get(options, key, &value).ok()) { + Status s; + if (FLAGS_num_column_families > 1) { + s = db_with_cfh->db->Get(options, + db_with_cfh->cfh[key_rand % db_with_cfh->cfh.size()], key, &value); + } else { + s = db_with_cfh->db->Get(options, key, &value); + } + if (s.ok()) { found++; } - thread->stats.FinishedOps(db_, 1); + thread->stats.FinishedOps(db_with_cfh->db, 1); } char msg[100]; @@ -2061,6 +2140,7 @@ class Benchmark { ++found; } } + thread->stats.FinishedOps(db, entries_per_batch_); } for (auto& k : keys) { delete k.data(); @@ -2099,11 +2179,11 @@ class Benchmark { Iterator* single_iter = nullptr; std::vector multi_iters; - if (db_ != nullptr) { - single_iter = db_->NewIterator(options); + if (db_.db != nullptr) { + single_iter = db_.db->NewIterator(options); } else { - for (DB* db : multi_dbs_) { - multi_iters.push_back(db->NewIterator(options)); + for (const auto& db_with_cfh : multi_dbs_) { + multi_iters.push_back(db_with_cfh.db->NewIterator(options)); } } uint64_t last_refresh = FLAGS_env->NowMicros(); @@ -2116,16 +2196,16 @@ class Benchmark { if (!FLAGS_use_tailing_iterator && FLAGS_iter_refresh_interval_us >= 0) { uint64_t now = FLAGS_env->NowMicros(); if (now - last_refresh > (uint64_t)FLAGS_iter_refresh_interval_us) { - if (db_ != nullptr) { + if (db_.db != nullptr) { delete single_iter; - single_iter = db_->NewIterator(options); + single_iter = db_.db->NewIterator(options); } else { for (auto iter : multi_iters) { delete iter; } multi_iters.clear(); - for (DB* db : multi_dbs_) { - multi_iters.push_back(db->NewIterator(options)); + for (const auto& db_with_cfh : multi_dbs_) { + multi_iters.push_back(db_with_cfh.db->NewIterator(options)); } } } @@ -2143,7 +2223,7 @@ class Benchmark { if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) { found++; } - thread->stats.FinishedOps(db_, 1); + thread->stats.FinishedOps(db_.db, 
1); } delete single_iter; for (auto iter : multi_iters) { @@ -2243,7 +2323,7 @@ class Benchmark { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(db_, 1); + thread->stats.FinishedOps(db_.db, 1); ++num_writes; if (writes_per_second_by_10 && num_writes >= writes_per_second_by_10) { @@ -2403,7 +2483,7 @@ class Benchmark { deletes_done++; } - thread->stats.FinishedOps(db_, 1); + thread->stats.FinishedOps(db_.db, 1); } char msg[100]; snprintf(msg, sizeof(msg), @@ -2542,7 +2622,7 @@ class Benchmark { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(db_, 1); + thread->stats.FinishedOps(db, 1); } char msg[100]; @@ -2578,7 +2658,7 @@ class Benchmark { fprintf(stderr, "merge error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(db_, 1); + thread->stats.FinishedOps(db, 1); } // Print some statistics @@ -2639,7 +2719,7 @@ class Benchmark { } - thread->stats.FinishedOps(db_, 1); + thread->stats.FinishedOps(db, 1); } char msg[100]; @@ -2656,11 +2736,11 @@ class Benchmark { } void PrintStats(const char* key) { - if (db_ != nullptr) { - PrintStats(db_, key, false); + if (db_.db != nullptr) { + PrintStats(db_.db, key, false); } - for (DB* db : multi_dbs_) { - PrintStats(db, key, true); + for (const auto& db_with_cfh : multi_dbs_) { + PrintStats(db_with_cfh.db, key, true); } } diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 5286ca782c8..9f05b8d3079 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -9,7 +9,10 @@ #ifndef ROCKSDB_LITE +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include @@ -29,9 +32,9 @@ Status DBImpl::DisableFileDeletions() { MutexLock l(&mutex_); ++disable_delete_obsolete_files_; if (disable_delete_obsolete_files_ == 1) { - Log(options_.info_log, "File Deletions Disabled"); + Log(db_options_.info_log, "File Deletions Disabled"); } else { - Log(options_.info_log, + 
Log(db_options_.info_log, "File Deletions Disabled, but already disabled. Counter: %d", disable_delete_obsolete_files_); } @@ -50,11 +53,11 @@ Status DBImpl::EnableFileDeletions(bool force) { --disable_delete_obsolete_files_; } if (disable_delete_obsolete_files_ == 0) { - Log(options_.info_log, "File Deletions Enabled"); + Log(db_options_.info_log, "File Deletions Enabled"); should_purge_files = true; FindObsoleteFiles(deletion_state, true); } else { - Log(options_.info_log, + Log(db_options_.info_log, "File Deletions Enable, but not really enabled. Counter: %d", disable_delete_obsolete_files_); } @@ -62,10 +65,14 @@ Status DBImpl::EnableFileDeletions(bool force) { if (should_purge_files) { PurgeObsoleteFiles(deletion_state); } - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); return Status::OK(); } +int DBImpl::IsFileDeletionsEnabled() const { + return disable_delete_obsolete_files_; +} + Status DBImpl::GetLiveFiles(std::vector& ret, uint64_t* manifest_file_size, bool flush_memtable) { @@ -91,7 +98,7 @@ Status DBImpl::GetLiveFiles(std::vector& ret, if (!status.ok()) { mutex_.Unlock(); - Log(options_.info_log, "Cannot Flush data %s\n", + Log(db_options_.info_log, "Cannot Flush data %s\n", status.ToString().c_str()); return status; } @@ -129,7 +136,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { Status s; // list wal files in main db dir. VectorLogPtr logs; - s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile); + s = GetSortedWalsOfType(db_options_.wal_dir, logs, kAliveLogFile); if (!s.ok()) { return s; } @@ -142,7 +149,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { files.clear(); // list wal files in archive dir. 
- std::string archivedir = ArchivalDirectory(options_.wal_dir); + std::string archivedir = ArchivalDirectory(db_options_.wal_dir); if (env_->FileExists(archivedir)) { s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile); if (!s.ok()) { @@ -153,7 +160,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { uint64_t latest_archived_log_number = 0; if (!files.empty()) { latest_archived_log_number = files.back()->LogNumber(); - Log(options_.info_log, "Latest Archived log: %" PRIu64, + Log(db_options_.info_log, "Latest Archived log: %" PRIu64, latest_archived_log_number); } @@ -166,7 +173,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { // same log in both db dir and archived dir. Simply // ignore the one in db dir. Note that, if we read // archived dir first, we would have missed the log file. - Log(options_.info_log, "%s already moved to archive", + Log(db_options_.info_log, "%s already moved to archive", log->PathName().c_str()); } } diff --git a/db/db_impl.cc b/db/db_impl.cc index 88e358416c3..2609398101a 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -9,7 +9,10 @@ #include "db/db_impl.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include @@ -74,20 +77,6 @@ const std::string kDefaultColumnFamilyName("default"); void DumpLeveldbBuildVersion(Logger * log); -// Information kept for every waiting writer -struct DBImpl::Writer { - Status status; - WriteBatch* batch; - bool sync; - bool disableWAL; - bool in_batch_group; - bool done; - uint64_t timeout_hint_us; - port::CondVar cv; - - explicit Writer(port::Mutex* mu) : cv(mu) { } -}; - struct DBImpl::WriteContext { autovector superversions_to_free_; autovector logs_to_free_; @@ -246,10 +235,9 @@ struct DBImpl::CompactionState { Options SanitizeOptions(const std::string& dbname, const InternalKeyComparator* icmp, - const InternalFilterPolicy* ipolicy, const Options& src) { auto db_options = SanitizeOptions(dbname, DBOptions(src)); - auto cf_options = 
SanitizeOptions(icmp, ipolicy, ColumnFamilyOptions(src)); + auto cf_options = SanitizeOptions(icmp, ColumnFamilyOptions(src)); return Options(db_options, cf_options); } @@ -292,24 +280,38 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { } namespace { -CompressionType GetCompressionFlush(const Options& options) { + +Status SanitizeDBOptionsByCFOptions( + const DBOptions* db_opts, + const std::vector& column_families) { + Status s; + for (auto cf : column_families) { + s = cf.options.table_factory->SanitizeDBOptions(db_opts); + if (!s.ok()) { + return s; + } + } + return Status::OK(); +} + +CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions) { // Compressing memtable flushes might not help unless the sequential load // optimization is used for leveled compaction. Otherwise the CPU and // latency overhead is not offset by saving much space. bool can_compress; - if (options.compaction_style == kCompactionStyleUniversal) { + if (ioptions.compaction_style == kCompactionStyleUniversal) { can_compress = - (options.compaction_options_universal.compression_size_percent < 0); + (ioptions.compaction_options_universal.compression_size_percent < 0); } else { // For leveled compress when min_level_to_compress == 0. 
- can_compress = options.compression_per_level.empty() || - options.compression_per_level[0] != kNoCompression; + can_compress = ioptions.compression_per_level.empty() || + ioptions.compression_per_level[0] != kNoCompression; } if (can_compress) { - return options.compression; + return ioptions.compression; } else { return kNoCompression; } @@ -319,8 +321,8 @@ CompressionType GetCompressionFlush(const Options& options) { DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) : env_(options.env), dbname_(dbname), - options_(SanitizeOptions(dbname, options)), - stats_(options_.statistics.get()), + db_options_(SanitizeOptions(dbname, options)), + stats_(db_options_.statistics.get()), db_lock_(nullptr), mutex_(options.use_adaptive_mutex), shutting_down_(nullptr), @@ -342,8 +344,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) last_stats_dump_time_microsec_(0), default_interval_to_delete_obsolete_WAL_(600), flush_on_destroy_(false), - delayed_writes_(0), - storage_options_(options), + env_options_(options), bg_work_gate_closed_(false), refitting_level_(false), opened_successfully_(false) { @@ -351,30 +352,30 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) // Reserve ten files or so for other uses and give the rest to TableCache. // Give a large number for setting of "infinite" open files. - const int table_cache_size = - (options_.max_open_files == -1) ? 4194304 : options_.max_open_files - 10; + const int table_cache_size = (db_options_.max_open_files == -1) ? + 4194304 : db_options_.max_open_files - 10; // Reserve ten files or so for other uses and give the rest to TableCache. 
table_cache_ = - NewLRUCache(table_cache_size, options_.table_cache_numshardbits, - options_.table_cache_remove_scan_count_limit); + NewLRUCache(table_cache_size, db_options_.table_cache_numshardbits, + db_options_.table_cache_remove_scan_count_limit); - versions_.reset( - new VersionSet(dbname_, &options_, storage_options_, table_cache_.get())); - column_family_memtables_.reset( - new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); + versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, + table_cache_.get(), &write_controller_)); + column_family_memtables_.reset(new ColumnFamilyMemTablesImpl( + versions_->GetColumnFamilySet(), &flush_scheduler_)); - DumpLeveldbBuildVersion(options_.info_log.get()); - DumpDBFileSummary(options_, dbname_); - options_.Dump(options_.info_log.get()); + DumpLeveldbBuildVersion(db_options_.info_log.get()); + DumpDBFileSummary(db_options_, dbname_); + db_options_.Dump(db_options_.info_log.get()); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); } DBImpl::~DBImpl() { mutex_.Lock(); if (flush_on_destroy_) { for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->mem()->GetFirstSequenceNumber() != 0) { + if (!cfd->mem()->IsEmpty()) { cfd->Ref(); mutex_.Unlock(); FlushMemTable(cfd, FlushOptions()); @@ -391,6 +392,8 @@ DBImpl::~DBImpl() { bg_cv_.Wait(); } + flush_scheduler_.Clear(); + if (default_cf_handle_ != nullptr) { // we need to delete handle outside of lock because it does its own locking mutex_.Unlock(); @@ -398,7 +401,7 @@ DBImpl::~DBImpl() { mutex_.Lock(); } - if (options_.allow_thread_local) { + if (db_options_.allow_thread_local) { // Clean up obsolete files due to SuperVersion release. // (1) Need to delete to obsolete files before closing because RepairDB() // scans all existing files in the file system and builds manifest file. 
@@ -427,7 +430,7 @@ DBImpl::~DBImpl() { env_->UnlockFile(db_lock_); } - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); } Status DBImpl::NewDB() { @@ -436,15 +439,15 @@ Status DBImpl::NewDB() { new_db.SetNextFile(2); new_db.SetLastSequence(0); - Log(options_.info_log, "Creating manifest 1 \n"); + Log(db_options_.info_log, "Creating manifest 1 \n"); const std::string manifest = DescriptorFileName(dbname_, 1); unique_ptr file; Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(storage_options_)); + manifest, &file, env_->OptimizeForManifestWrite(env_options_)); if (!s.ok()) { return s; } - file->SetPreallocationBlockSize(options_.manifest_preallocation_size); + file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size); { log::Writer log(std::move(file)); std::string record; @@ -461,38 +464,38 @@ Status DBImpl::NewDB() { } void DBImpl::MaybeIgnoreError(Status* s) const { - if (s->ok() || options_.paranoid_checks) { + if (s->ok() || db_options_.paranoid_checks) { // No change needed } else { - Log(options_.info_log, "Ignoring error %s", s->ToString().c_str()); + Log(db_options_.info_log, "Ignoring error %s", s->ToString().c_str()); *s = Status::OK(); } } const Status DBImpl::CreateArchivalDirectory() { - if (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0) { - std::string archivalPath = ArchivalDirectory(options_.wal_dir); + if (db_options_.WAL_ttl_seconds > 0 || db_options_.WAL_size_limit_MB > 0) { + std::string archivalPath = ArchivalDirectory(db_options_.wal_dir); return env_->CreateDirIfMissing(archivalPath); } return Status::OK(); } void DBImpl::PrintStatistics() { - auto dbstats = options_.statistics.get(); + auto dbstats = db_options_.statistics.get(); if (dbstats) { - Log(options_.info_log, + Log(db_options_.info_log, "STATISTCS:\n %s", dbstats->ToString().c_str()); } } void DBImpl::MaybeDumpStats() { - if (options_.stats_dump_period_sec == 0) return; + if 
(db_options_.stats_dump_period_sec == 0) return; const uint64_t now_micros = env_->NowMicros(); if (last_stats_dump_time_microsec_ + - options_.stats_dump_period_sec * 1000000 + db_options_.stats_dump_period_sec * 1000000 <= now_micros) { // Multiple threads could race in here simultaneously. // However, the last one will update last_stats_dump_time_microsec_ @@ -516,8 +519,8 @@ void DBImpl::MaybeDumpStats() { default_cf_internal_stats_->GetStringProperty(db_property_type, "rocksdb.dbstats", &stats); } - Log(options_.info_log, "------- DUMPING STATS -------"); - Log(options_.info_log, "%s", stats.c_str()); + Log(db_options_.info_log, "------- DUMPING STATS -------"); + Log(db_options_.info_log, "%s", stats.c_str()); PrintStatistics(); } @@ -527,7 +530,7 @@ void DBImpl::MaybeDumpStats() { // of all files in the filesystem in 'candidate_files'. // no_full_scan = true -- never do the full scan using GetChildren() // force = false -- don't force the full scan, except every -// options_.delete_obsolete_files_period_micros +// db_options_.delete_obsolete_files_period_micros // force = true -- force the full scan void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, bool force, @@ -544,12 +547,12 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, // logic for figurint out if we're doing the full scan if (no_full_scan) { doing_the_full_scan = false; - } else if (force || options_.delete_obsolete_files_period_micros == 0) { + } else if (force || db_options_.delete_obsolete_files_period_micros == 0) { doing_the_full_scan = true; } else { const uint64_t now_micros = env_->NowMicros(); if (delete_obsolete_files_last_run_ + - options_.delete_obsolete_files_period_micros < now_micros) { + db_options_.delete_obsolete_files_period_micros < now_micros) { doing_the_full_scan = true; delete_obsolete_files_last_run_ = now_micros; } @@ -581,11 +584,12 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, 
versions_->AddLiveFiles(&deletion_state.sst_live); if (doing_the_full_scan) { - for (uint32_t path_id = 0; path_id < options_.db_paths.size(); path_id++) { + for (uint32_t path_id = 0; + path_id < db_options_.db_paths.size(); path_id++) { // set of all files in the directory. We'll exclude files that are still // alive in the subsequent processings. std::vector files; - env_->GetChildren(options_.db_paths[path_id].path, + env_->GetChildren(db_options_.db_paths[path_id].path, &files); // Ignore errors for (std::string file : files) { deletion_state.candidate_files.emplace_back(file, path_id); @@ -593,17 +597,18 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, } //Add log files in wal_dir - if (options_.wal_dir != dbname_) { + if (db_options_.wal_dir != dbname_) { std::vector log_files; - env_->GetChildren(options_.wal_dir, &log_files); // Ignore errors + env_->GetChildren(db_options_.wal_dir, &log_files); // Ignore errors for (std::string log_file : log_files) { deletion_state.candidate_files.emplace_back(log_file, 0); } } // Add info log files in db_log_dir - if (!options_.db_log_dir.empty() && options_.db_log_dir != dbname_) { + if (!db_options_.db_log_dir.empty() && db_options_.db_log_dir != dbname_) { std::vector info_log_files; - env_->GetChildren(options_.db_log_dir, &info_log_files); // Ignore errors + // Ignore errors + env_->GetChildren(db_options_.db_log_dir, &info_log_files); for (std::string log_file : info_log_files) { deletion_state.candidate_files.emplace_back(log_file, 0); } @@ -619,7 +624,7 @@ bool CompareCandidateFile(const rocksdb::DBImpl::CandidateFileInfo& first, } else if (first.file_name < second.file_name) { return false; } else { - return (first.path_id > first.path_id); + return (first.path_id > second.path_id); } } }; // namespace @@ -674,7 +679,7 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { candidate_files.end()); std::vector old_info_log_files; - InfoLogPrefix info_log_prefix(!options_.db_log_dir.empty(), 
dbname_); + InfoLogPrefix info_log_prefix(!db_options_.db_log_dir.empty(), dbname_); for (const auto& candidate_file : candidate_files) { std::string to_delete = candidate_file.file_name; uint32_t path_id = candidate_file.path_id; @@ -730,51 +735,51 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { if (type == kTableFile) { // evict from cache TableCache::Evict(table_cache_.get(), number); - fname = TableFileName(options_.db_paths, number, path_id); + fname = TableFileName(db_options_.db_paths, number, path_id); } else { - fname = - ((type == kLogFile) ? options_.wal_dir : dbname_) + "/" + to_delete; + fname = ((type == kLogFile) ? + db_options_.wal_dir : dbname_) + "/" + to_delete; } if (type == kLogFile && - (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0)) { - auto archived_log_name = ArchivedLogFileName(options_.wal_dir, number); + (db_options_.WAL_ttl_seconds > 0 || + db_options_.WAL_size_limit_MB > 0)) { + auto archived_log_name = ArchivedLogFileName(db_options_.wal_dir, number); // The sync point below is used in (DBTest,TransactionLogIteratorRace) TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:1"); Status s = env_->RenameFile(fname, archived_log_name); // The sync point below is used in (DBTest,TransactionLogIteratorRace) TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:2"); - Log(options_.info_log, + Log(db_options_.info_log, "Move log file %s to %s -- %s\n", fname.c_str(), archived_log_name.c_str(), s.ToString().c_str()); } else { Status s = env_->DeleteFile(fname); - Log(options_.info_log, "Delete %s type=%d #%" PRIu64 " -- %s\n", + Log(db_options_.info_log, "Delete %s type=%d #%" PRIu64 " -- %s\n", fname.c_str(), type, number, s.ToString().c_str()); } } // Delete old info log files. 
size_t old_info_log_file_count = old_info_log_files.size(); - if (old_info_log_file_count >= options_.keep_log_file_num) { + if (old_info_log_file_count >= db_options_.keep_log_file_num) { std::sort(old_info_log_files.begin(), old_info_log_files.end()); - size_t end = old_info_log_file_count - options_.keep_log_file_num; + size_t end = old_info_log_file_count - db_options_.keep_log_file_num; for (unsigned int i = 0; i <= end; i++) { std::string& to_delete = old_info_log_files.at(i); - std::string full_path_to_delete = - (options_.db_log_dir.empty() ? dbname_ : options_.db_log_dir) + "/" + - to_delete; - Log(options_.info_log, "Delete info log file %s\n", + std::string full_path_to_delete = (db_options_.db_log_dir.empty() ? + dbname_ : db_options_.db_log_dir) + "/" + to_delete; + Log(db_options_.info_log, "Delete info log file %s\n", full_path_to_delete.c_str()); Status s = env_->DeleteFile(full_path_to_delete); if (!s.ok()) { - Log(options_.info_log, "Delete info log file %s FAILED -- %s\n", + Log(db_options_.info_log, "Delete info log file %s FAILED -- %s\n", to_delete.c_str(), s.ToString().c_str()); } } } PurgeObsoleteWALFiles(); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); } void DBImpl::DeleteObsoleteFiles() { @@ -796,8 +801,8 @@ void DBImpl::DeleteObsoleteFiles() { // b. get sorted non-empty archived logs // c. 
delete what should be deleted void DBImpl::PurgeObsoleteWALFiles() { - bool const ttl_enabled = options_.WAL_ttl_seconds > 0; - bool const size_limit_enabled = options_.WAL_size_limit_MB > 0; + bool const ttl_enabled = db_options_.WAL_ttl_seconds > 0; + bool const size_limit_enabled = db_options_.WAL_size_limit_MB > 0; if (!ttl_enabled && !size_limit_enabled) { return; } @@ -805,13 +810,14 @@ void DBImpl::PurgeObsoleteWALFiles() { int64_t current_time; Status s = env_->GetCurrentTime(¤t_time); if (!s.ok()) { - Log(options_.info_log, "Can't get current time: %s", s.ToString().c_str()); + Log(db_options_.info_log, "Can't get current time: %s", + s.ToString().c_str()); assert(false); return; } uint64_t const now_seconds = static_cast(current_time); uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) ? - options_.WAL_ttl_seconds / 2 : default_interval_to_delete_obsolete_WAL_; + db_options_.WAL_ttl_seconds / 2 : default_interval_to_delete_obsolete_WAL_; if (purge_wal_files_last_run_ + time_to_check > now_seconds) { return; @@ -819,11 +825,12 @@ void DBImpl::PurgeObsoleteWALFiles() { purge_wal_files_last_run_ = now_seconds; - std::string archival_dir = ArchivalDirectory(options_.wal_dir); + std::string archival_dir = ArchivalDirectory(db_options_.wal_dir); std::vector files; s = env_->GetChildren(archival_dir, &files); if (!s.ok()) { - Log(options_.info_log, "Can't get archive files: %s", s.ToString().c_str()); + Log(db_options_.info_log, "Can't get archive files: %s", + s.ToString().c_str()); assert(false); return; } @@ -841,14 +848,14 @@ void DBImpl::PurgeObsoleteWALFiles() { Status const s = env_->GetFileModificationTime(file_path, &file_m_time); if (!s.ok()) { - Log(options_.info_log, "Can't get file mod time: %s: %s", + Log(db_options_.info_log, "Can't get file mod time: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } - if (now_seconds - file_m_time > options_.WAL_ttl_seconds) { + if (now_seconds - file_m_time > 
db_options_.WAL_ttl_seconds) { Status const s = env_->DeleteFile(file_path); if (!s.ok()) { - Log(options_.info_log, "Can't delete file: %s: %s", + Log(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } else { @@ -863,7 +870,7 @@ void DBImpl::PurgeObsoleteWALFiles() { uint64_t file_size; Status const s = env_->GetFileSize(file_path, &file_size); if (!s.ok()) { - Log(options_.info_log, "Can't get file size: %s: %s", + Log(db_options_.info_log, "Can't get file size: %s: %s", file_path.c_str(), s.ToString().c_str()); return; } else { @@ -873,7 +880,7 @@ void DBImpl::PurgeObsoleteWALFiles() { } else { Status s = env_->DeleteFile(file_path); if (!s.ok()) { - Log(options_.info_log, "Can't delete file: %s: %s", + Log(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } else { @@ -890,7 +897,7 @@ void DBImpl::PurgeObsoleteWALFiles() { return; } - size_t const files_keep_num = options_.WAL_size_limit_MB * + size_t const files_keep_num = db_options_.WAL_size_limit_MB * 1024 * 1024 / log_file_size; if (log_files_num <= files_keep_num) { return; @@ -901,7 +908,7 @@ void DBImpl::PurgeObsoleteWALFiles() { GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); if (files_del_num > archived_logs.size()) { - Log(options_.info_log, "Trying to delete more archived log files than " + Log(db_options_.info_log, "Trying to delete more archived log files than " "exist. 
Deleting all"); files_del_num = archived_logs.size(); } @@ -910,7 +917,7 @@ void DBImpl::PurgeObsoleteWALFiles() { std::string const file_path = archived_logs[i]->PathName(); Status const s = DeleteFile(file_path); if (!s.ok()) { - Log(options_.info_log, "Can't delete file: %s: %s", + Log(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } else { @@ -1018,7 +1025,7 @@ Status DBImpl::ReadFirstRecord(const WalFileType type, const uint64_t number, } Status s; if (type == kAliveLogFile) { - std::string fname = LogFileName(options_.wal_dir, number); + std::string fname = LogFileName(db_options_.wal_dir, number); s = ReadFirstLine(fname, sequence); if (env_->FileExists(fname) && !s.ok()) { // return any error that is not caused by non-existing file @@ -1028,7 +1035,8 @@ Status DBImpl::ReadFirstRecord(const WalFileType type, const uint64_t number, if (type == kArchivedLogFile || !s.ok()) { // check if the file got moved to archive. - std::string archived_file = ArchivedLogFileName(options_.wal_dir, number); + std::string archived_file = + ArchivedLogFileName(db_options_.wal_dir, number); s = ReadFirstLine(archived_file, sequence); } @@ -1049,7 +1057,7 @@ Status DBImpl::ReadFirstLine(const std::string& fname, const char* fname; Status* status; - bool ignore_error; // true if options_.paranoid_checks==false + bool ignore_error; // true if db_options_.paranoid_checks==false virtual void Corruption(size_t bytes, const Status& s) { Log(info_log, "%s%s: dropping %d bytes; %s", (this->ignore_error ? 
"(ignoring error) " : ""), fname, @@ -1062,7 +1070,7 @@ Status DBImpl::ReadFirstLine(const std::string& fname, }; unique_ptr file; - Status status = env_->NewSequentialFile(fname, &file, storage_options_); + Status status = env_->NewSequentialFile(fname, &file, env_options_); if (!status.ok()) { return status; @@ -1070,17 +1078,17 @@ Status DBImpl::ReadFirstLine(const std::string& fname, LogReporter reporter; reporter.env = env_; - reporter.info_log = options_.info_log.get(); + reporter.info_log = db_options_.info_log.get(); reporter.fname = fname.c_str(); reporter.status = &status; - reporter.ignore_error = !options_.paranoid_checks; + reporter.ignore_error = !db_options_.paranoid_checks; log::Reader reader(std::move(file), &reporter, true /*checksum*/, 0 /*initial_offset*/); std::string scratch; Slice record; if (reader.ReadRecord(&record, &scratch) && - (status.ok() || !options_.paranoid_checks)) { + (status.ok() || !db_options_.paranoid_checks)) { if (record.size() < 12) { reporter.Corruption(record.size(), Status::Corruption("log record too small")); @@ -1121,7 +1129,7 @@ Status DBImpl::Recover( return s; } - for (auto& db_path : options_.db_paths) { + for (auto& db_path : db_options_.db_paths) { s = env_->CreateDirIfMissing(db_path.path); if (!s.ok()) { return s; @@ -1139,7 +1147,7 @@ Status DBImpl::Recover( } if (!env_->FileExists(CurrentFileName(dbname_))) { - if (options_.create_if_missing) { + if (db_options_.create_if_missing) { s = NewDB(); is_new_db = true; if (!s.ok()) { @@ -1150,7 +1158,7 @@ Status DBImpl::Recover( dbname_, "does not exist (create_if_missing is false)"); } } else { - if (options_.error_if_exists) { + if (db_options_.error_if_exists) { return Status::InvalidArgument( dbname_, "exists (error_if_exists is true)"); } @@ -1165,7 +1173,7 @@ Status DBImpl::Recover( } Status s = versions_->Recover(column_families, read_only); - if (options_.paranoid_checks && s.ok()) { + if (db_options_.paranoid_checks && s.ok()) { s = CheckConsistency(); } 
if (s.ok()) { @@ -1186,7 +1194,7 @@ Status DBImpl::Recover( const uint64_t min_log = versions_->MinLogNumber(); const uint64_t prev_log = versions_->PrevLogNumber(); std::vector filenames; - s = env_->GetChildren(options_.wal_dir, &filenames); + s = env_->GetChildren(db_options_.wal_dir, &filenames); if (!s.ok()) { return s; } @@ -1213,14 +1221,17 @@ Status DBImpl::Recover( "flag but a log file already exists"); } - // Recover in the order in which the logs were generated - std::sort(logs.begin(), logs.end()); - for (const auto& log : logs) { - // The previous incarnation may not have written any MANIFEST - // records after allocating this log number. So we manually - // update the file number allocation counter in VersionSet. - versions_->MarkFileNumberUsed(log); - s = RecoverLogFile(log, &max_sequence, read_only); + if (!logs.empty()) { + // Recover in the order in which the logs were generated + std::sort(logs.begin(), logs.end()); + s = RecoverLogFiles(logs, &max_sequence, read_only); + if (!s.ok()) { + // Clear memtables if recovery failed + for (auto cfd : *versions_->GetColumnFamilySet()) { + cfd->CreateNewMemtable(MemTableOptions( + *cfd->GetLatestMutableCFOptions(), *cfd->options())); + } + } } SetTickerCount(stats_, SEQUENCE_NUMBER, versions_->LastSequence()); } @@ -1233,14 +1244,15 @@ Status DBImpl::Recover( return s; } -Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, - bool read_only) { +// REQUIRES: log_numbers are sorted in ascending order +Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, + SequenceNumber* max_sequence, bool read_only) { struct LogReporter : public log::Reader::Reporter { Env* env; Logger* info_log; const char* fname; - Status* status; // nullptr if options_.paranoid_checks==false or - // options_.skip_log_error_on_recovery==true + Status* status; // nullptr if db_options_.paranoid_checks==false or + // db_options_.skip_log_error_on_recovery==true virtual void Corruption(size_t bytes, 
const Status& s) { Log(info_log, "%s%s: dropping %d bytes; %s", (this->status == nullptr ? "(ignoring error) " : ""), @@ -1250,7 +1262,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, }; mutex_.AssertHeld(); - + Status status; std::unordered_map version_edits; // no need to refcount because iteration is under mutex for (auto cfd : *versions_->GetColumnFamilySet()) { @@ -1259,61 +1271,80 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, version_edits.insert({cfd->GetID(), edit}); } - // Open the log file - std::string fname = LogFileName(options_.wal_dir, log_number); - unique_ptr file; - Status status = env_->NewSequentialFile(fname, &file, storage_options_); - if (!status.ok()) { - MaybeIgnoreError(&status); - return status; - } - - // Create the log reader. - LogReporter reporter; - reporter.env = env_; - reporter.info_log = options_.info_log.get(); - reporter.fname = fname.c_str(); - reporter.status = (options_.paranoid_checks && - !options_.skip_log_error_on_recovery ? &status : nullptr); - // We intentially make log::Reader do checksumming even if - // paranoid_checks==false so that corruptions cause entire commits - // to be skipped instead of propagating bad information (like overly - // large sequence numbers). - log::Reader reader(std::move(file), &reporter, true/*checksum*/, - 0/*initial_offset*/); - Log(options_.info_log, "Recovering log #%" PRIu64 "", log_number); - - // Read all the records and add to a memtable - std::string scratch; - Slice record; - WriteBatch batch; - while (reader.ReadRecord(&record, &scratch)) { - if (record.size() < 12) { - reporter.Corruption( - record.size(), Status::Corruption("log record too small")); - continue; + for (auto log_number : log_numbers) { + // The previous incarnation may not have written any MANIFEST + // records after allocating this log number. So we manually + // update the file number allocation counter in VersionSet. 
+ versions_->MarkFileNumberUsed(log_number); + // Open the log file + std::string fname = LogFileName(db_options_.wal_dir, log_number); + unique_ptr file; + status = env_->NewSequentialFile(fname, &file, env_options_); + if (!status.ok()) { + MaybeIgnoreError(&status); + if (!status.ok()) { + return status; + } else { + // Fail with one log file, but that's ok. + // Try next one. + continue; + } } - WriteBatchInternal::SetContents(&batch, record); - status = WriteBatchInternal::InsertInto( - &batch, column_family_memtables_.get(), true, log_number); + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = db_options_.info_log.get(); + reporter.fname = fname.c_str(); + reporter.status = + (db_options_.paranoid_checks && !db_options_.skip_log_error_on_recovery + ? &status + : nullptr); + // We intentially make log::Reader do checksumming even if + // paranoid_checks==false so that corruptions cause entire commits + // to be skipped instead of propagating bad information (like overly + // large sequence numbers). + log::Reader reader(std::move(file), &reporter, true /*checksum*/, + 0 /*initial_offset*/); + Log(db_options_.info_log, "Recovering log #%" PRIu64 "", log_number); + + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + while (reader.ReadRecord(&record, &scratch)) { + if (record.size() < 12) { + reporter.Corruption(record.size(), + Status::Corruption("log record too small")); + continue; + } + WriteBatchInternal::SetContents(&batch, record); - MaybeIgnoreError(&status); - if (!status.ok()) { - return status; - } - const SequenceNumber last_seq = - WriteBatchInternal::Sequence(&batch) + - WriteBatchInternal::Count(&batch) - 1; - if (last_seq > *max_sequence) { - *max_sequence = last_seq; - } + // If column family was not found, it might mean that the WAL write + // batch references to the column family that was dropped after the + // insert. 
We don't want to fail the whole write batch in that case -- + // we just ignore the update. + // That's why we set ignore missing column families to true + status = WriteBatchInternal::InsertInto( + &batch, column_family_memtables_.get(), true, log_number); - if (!read_only) { - // no need to refcount since client still doesn't have access - // to the DB and can not drop column families while we iterate - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->mem()->ShouldFlush()) { + MaybeIgnoreError(&status); + if (!status.ok()) { + return status; + } + const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) + + WriteBatchInternal::Count(&batch) - 1; + if (last_seq > *max_sequence) { + *max_sequence = last_seq; + } + + if (!read_only) { + // we can do this because this is called before client has access to the + // DB and there is only a single thread operating on DB + ColumnFamilyData* cfd; + + while ((cfd = flush_scheduler_.GetNextColumnFamily()) != nullptr) { + cfd->Unref(); // If this asserts, it means that InsertInto failed in // filtering updates to already-flushed column families assert(cfd->GetLogNumber() <= log_number); @@ -1321,33 +1352,35 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, assert(iter != version_edits.end()); VersionEdit* edit = &iter->second; status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit); - // we still want to clear the memtable, even if the recovery failed - cfd->CreateNewMemtable(); if (!status.ok()) { // Reflect errors immediately so that conditions like full // file-systems cause the DB::Open() to fail. 
return status; } + cfd->CreateNewMemtable(MemTableOptions( + *cfd->GetLatestMutableCFOptions(), *cfd->options())); } } } - } - if (versions_->LastSequence() < *max_sequence) { - versions_->SetLastSequence(*max_sequence); + flush_scheduler_.Clear(); + if (versions_->LastSequence() < *max_sequence) { + versions_->SetLastSequence(*max_sequence); + } } if (!read_only) { // no need to refcount since client still doesn't have access // to the DB and can not drop column families while we iterate + auto max_log_number = log_numbers.back(); for (auto cfd : *versions_->GetColumnFamilySet()) { auto iter = version_edits.find(cfd->GetID()); assert(iter != version_edits.end()); VersionEdit* edit = &iter->second; - if (cfd->GetLogNumber() > log_number) { + if (cfd->GetLogNumber() > max_log_number) { // Column family cfd has already flushed the data - // from log_number. Memtable has to be empty because + // from all logs. Memtable has to be empty because // we filter the updates based on log_number // (in WriteBatch::InsertInto) assert(cfd->mem()->GetFirstSequenceNumber() == 0); @@ -1358,28 +1391,30 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, // flush the final memtable (if non-empty) if (cfd->mem()->GetFirstSequenceNumber() != 0) { status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit); - } - // we still want to clear the memtable, even if the recovery failed - cfd->CreateNewMemtable(); - if (!status.ok()) { - return status; + if (!status.ok()) { + // Recovery failed + break; + } + cfd->CreateNewMemtable(MemTableOptions( + *cfd->GetLatestMutableCFOptions(), *cfd->options())); } // write MANIFEST with update - // writing log number in the manifest means that any log file + // writing log_number in the manifest means that any log file // with number strongly less than (log_number + 1) is already // recovered and should be ignored on next reincarnation. 
- // Since we already recovered log_number, we want all logs - // with numbers `<= log_number` (includes this one) to be ignored - edit->SetLogNumber(log_number + 1); + // Since we already recovered max_log_number, we want all logs + // with numbers `<= max_log_number` (includes this one) to be ignored + edit->SetLogNumber(max_log_number + 1); // we must mark the next log number as used, even though it's // not actually used. that is because VersionSet assumes // VersionSet::next_file_number_ always to be strictly greater than any // log number - versions_->MarkFileNumberUsed(log_number + 1); + versions_->MarkFileNumberUsed(max_log_number + 1); status = versions_->LogAndApply(cfd, edit, &mutex_); if (!status.ok()) { - return status; + // Recovery failed + break; } } } @@ -1394,30 +1429,34 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, FileMetaData meta; meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); pending_outputs_[meta.fd.GetNumber()] = 0; // path 0 for level 0 file. 
- Iterator* iter = mem->NewIterator(ReadOptions(), true); - const SequenceNumber newest_snapshot = snapshots_.GetNewest(); - const SequenceNumber earliest_seqno_in_memtable = - mem->GetFirstSequenceNumber(); - Log(options_.info_log, "[%s] Level-0 table #%" PRIu64 ": started", - cfd->GetName().c_str(), meta.fd.GetNumber()); - + ReadOptions ro; + ro.total_order_seek = true; + Arena arena; Status s; { - mutex_.Unlock(); - s = BuildTable(dbname_, env_, *cfd->options(), storage_options_, - cfd->table_cache(), iter, &meta, cfd->internal_comparator(), - newest_snapshot, earliest_seqno_in_memtable, - GetCompressionFlush(*cfd->options()), Env::IO_HIGH); - LogFlush(options_.info_log); - mutex_.Lock(); - } + ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); + const SequenceNumber newest_snapshot = snapshots_.GetNewest(); + const SequenceNumber earliest_seqno_in_memtable = + mem->GetFirstSequenceNumber(); + Log(db_options_.info_log, "[%s] Level-0 table #%" PRIu64 ": started", + cfd->GetName().c_str(), meta.fd.GetNumber()); - Log(options_.info_log, - "[%s] Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", - cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), - s.ToString().c_str()); - delete iter; + { + mutex_.Unlock(); + s = BuildTable( + dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(), + iter.get(), &meta, cfd->internal_comparator(), newest_snapshot, + earliest_seqno_in_memtable, GetCompressionFlush(*cfd->ioptions()), + cfd->ioptions()->compression_opts, Env::IO_HIGH); + LogFlush(db_options_.info_log); + mutex_.Lock(); + } + Log(db_options_.info_log, + "[%s] Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", + cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), + s.ToString().c_str()); + } pending_outputs_.erase(meta.fd.GetNumber()); // Note that if file_size is zero, the file has been deleted and @@ -1461,29 +1500,36 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, mutex_.Unlock(); 
log_buffer->FlushBufferToLog(); std::vector memtables; + ReadOptions ro; + ro.total_order_seek = true; + Arena arena; for (MemTable* m : mems) { - Log(options_.info_log, + Log(db_options_.info_log, "[%s] Flushing memtable with next log file: %" PRIu64 "\n", cfd->GetName().c_str(), m->GetNextLogNumber()); - memtables.push_back(m->NewIterator(ReadOptions(), true)); + memtables.push_back(m->NewIterator(ro, &arena)); } - Iterator* iter = NewMergingIterator(&cfd->internal_comparator(), - &memtables[0], memtables.size()); - Log(options_.info_log, "[%s] Level-0 flush table #%" PRIu64 ": started", - cfd->GetName().c_str(), meta.fd.GetNumber()); - - s = BuildTable(dbname_, env_, *cfd->options(), storage_options_, - cfd->table_cache(), iter, &meta, cfd->internal_comparator(), - newest_snapshot, earliest_seqno_in_memtable, - GetCompressionFlush(*cfd->options()), Env::IO_HIGH); - LogFlush(options_.info_log); - delete iter; - Log(options_.info_log, + { + ScopedArenaIterator iter(NewMergingIterator(&cfd->internal_comparator(), + &memtables[0], + memtables.size(), &arena)); + Log(db_options_.info_log, + "[%s] Level-0 flush table #%" PRIu64 ": started", + cfd->GetName().c_str(), meta.fd.GetNumber()); + + s = BuildTable( + dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(), + iter.get(), &meta, cfd->internal_comparator(), newest_snapshot, + earliest_seqno_in_memtable, GetCompressionFlush(*cfd->ioptions()), + cfd->ioptions()->compression_opts, Env::IO_HIGH); + LogFlush(db_options_.info_log); + } + Log(db_options_.info_log, "[%s] Level-0 flush table #%" PRIu64 ": %" PRIu64 " bytes %s", cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), s.ToString().c_str()); - if (!options_.disableDataSync) { + if (!db_options_.disableDataSync) { db_directory_->Fsync(); } mutex_.Lock(); @@ -1512,8 +1558,9 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, // insert files directly into higher levels because some other // threads could be concurrently producing 
compacted files for // that key range. - if (base != nullptr && options_.max_background_compactions <= 1 && - cfd->options()->compaction_style == kCompactionStyleLevel) { + if (base != nullptr && db_options_.max_background_compactions <= 1 && + db_options_.max_background_flushes == 0 && + cfd->ioptions()->compaction_style == kCompactionStyleLevel) { level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); } edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), @@ -1574,12 +1621,13 @@ Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd, } else { // Replace immutable memtable with the generated Table s = cfd->imm()->InstallMemtableFlushResults( - cfd, mems, versions_.get(), &mutex_, options_.info_log.get(), + cfd, mems, versions_.get(), &mutex_, db_options_.info_log.get(), file_number, &pending_outputs_, &deletion_state.memtables_to_free, db_directory_.get(), log_buffer); } if (s.ok()) { + // Use latest MutableCFOptions InstallSuperVersion(cfd, deletion_state); if (madeProgress) { *madeProgress = 1; @@ -1600,7 +1648,7 @@ Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd, } } - if (!s.ok() && !s.IsShutdownInProgress() && options_.paranoid_checks && + if (!s.ok() && !s.IsShutdownInProgress() && db_options_.paranoid_checks && bg_error_.ok()) { // if a bad error happened (not ShutdownInProgress) and paranoid_checks is // true, mark DB read-only @@ -1614,7 +1662,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end, bool reduce_level, int target_level, uint32_t target_path_id) { - if (target_path_id >= options_.db_paths.size()) { + if (target_path_id >= db_options_.db_paths.size()) { return Status::InvalidArgument("Invalid target path ID"); } @@ -1623,7 +1671,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, Status s = FlushMemTable(cfd, FlushOptions()); if (!s.ok()) { - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); return s; } @@ -1642,8 
+1690,8 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, // bottom-most level, the output level will be the same as input one. // level 0 can never be the bottommost level (i.e. if all files are in level // 0, we will compact to level 1) - if (cfd->options()->compaction_style == kCompactionStyleUniversal || - cfd->options()->compaction_style == kCompactionStyleFIFO || + if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO || (level == max_level_with_files && level > 0)) { s = RunManualCompaction(cfd, level, level, target_path_id, begin, end); } else { @@ -1651,7 +1699,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, end); } if (!s.ok()) { - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); return s; } } @@ -1659,11 +1707,25 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, if (reduce_level) { s = ReFitLevel(cfd, max_level_with_files, target_level); } - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); + + { + MutexLock l(&mutex_); + // an automatic compaction that has been scheduled might have been + // preempted by the manual compactions. Need to schedule it back. 
+ MaybeScheduleFlushOrCompaction(); + } return s; } +bool DBImpl::SetOptions(ColumnFamilyHandle* column_family, + const std::unordered_map& options_map) { + auto cfh = reinterpret_cast(column_family); + MutexLock l(&mutex_); + return cfh->cfd()->SetOptions(options_map); +} + // return the same level if it cannot be moved int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level) { mutex_.AssertHeld(); @@ -1694,7 +1756,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { // only allow one thread refitting if (refitting_level_) { mutex_.Unlock(); - Log(options_.info_log, "ReFitLevel: another thread is refitting"); + Log(db_options_.info_log, "ReFitLevel: another thread is refitting"); delete new_superversion; return Status::NotSupported("another thread is refitting"); } @@ -1703,7 +1765,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { // wait for all background threads to stop bg_work_gate_closed_ = true; while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_) { - Log(options_.info_log, + Log(db_options_.info_log, "RefitLevel: waiting for background threads to stop: %d %d", bg_compaction_scheduled_, bg_flush_scheduled_); bg_cv_.Wait(); @@ -1719,8 +1781,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { Status status; if (to_level < level) { - Log(options_.info_log, "[%s] Before refitting:\n%s", cfd->GetName().c_str(), - cfd->current()->DebugString().data()); + Log(db_options_.info_log, "[%s] Before refitting:\n%s", + cfd->GetName().c_str(), cfd->current()->DebugString().data()); VersionEdit edit; edit.SetColumnFamily(cfd->GetID()); @@ -1730,18 +1792,19 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { f->fd.GetFileSize(), f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); } - Log(options_.info_log, "[%s] Apply version edit:\n%s", + Log(db_options_.info_log, "[%s] Apply version edit:\n%s", 
cfd->GetName().c_str(), edit.DebugString().data()); status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get()); + // Use latest MutableCFOptions superversion_to_free = cfd->InstallSuperVersion(new_superversion, &mutex_); new_superversion = nullptr; - Log(options_.info_log, "[%s] LogAndApply: %s\n", cfd->GetName().c_str(), + Log(db_options_.info_log, "[%s] LogAndApply: %s\n", cfd->GetName().c_str(), status.ToString().data()); if (status.ok()) { - Log(options_.info_log, "[%s] After refitting:\n%s", + Log(db_options_.info_log, "[%s] After refitting:\n%s", cfd->GetName().c_str(), cfd->current()->DebugString().data()); } } @@ -1797,16 +1860,16 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, // For universal compaction, we enforce every manual compaction to compact // all files. if (begin == nullptr || - cfd->options()->compaction_style == kCompactionStyleUniversal || - cfd->options()->compaction_style == kCompactionStyleFIFO) { + cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { manual.begin = nullptr; } else { begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek); manual.begin = &begin_storage; } if (end == nullptr || - cfd->options()->compaction_style == kCompactionStyleUniversal || - cfd->options()->compaction_style == kCompactionStyleFIFO) { + cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { manual.end = nullptr; } else { end_storage = InternalKey(*end, 0, static_cast(0)); @@ -1831,53 +1894,51 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, ++bg_manual_only_; while (bg_compaction_scheduled_ > 0) { - Log(options_.info_log, + Log(db_options_.info_log, "[%s] Manual compaction waiting for all other scheduled background " "compactions to finish", cfd->GetName().c_str()); bg_cv_.Wait(); } - Log(options_.info_log, "[%s] 
Manual compaction starting", + Log(db_options_.info_log, "[%s] Manual compaction starting", cfd->GetName().c_str()); - while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) { + // We don't check bg_error_ here, because if we get the error in compaction, + // the compaction will set manual.status to bg_error_ and set manual.done to + // true. + while (!manual.done) { assert(bg_manual_only_ > 0); if (manual_compaction_ != nullptr) { // Running either this or some other manual compaction bg_cv_.Wait(); } else { manual_compaction_ = &manual; - MaybeScheduleFlushOrCompaction(); + bg_compaction_scheduled_++; + env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); } } assert(!manual.in_progress); assert(bg_manual_only_ > 0); --bg_manual_only_; - if (bg_manual_only_ == 0) { - // an automatic compaction should have been scheduled might have be - // preempted by the manual compactions. Need to schedule it back. - MaybeScheduleFlushOrCompaction(); - } return manual.status; } Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options) { - Writer w(&mutex_); - w.batch = nullptr; - w.sync = false; - w.disableWAL = false; - w.in_batch_group = false; - w.done = false; - w.timeout_hint_us = kNoTimeOut; - Status s; { WriteContext context; MutexLock guard_lock(&mutex_); - s = BeginWrite(&w, 0); + + if (cfd->imm()->size() == 0 && cfd->mem()->IsEmpty()) { + // Nothing to flush + return Status::OK(); + } + + WriteThread::Writer w(&mutex_); + s = write_thread_.EnterWriteThread(&w, 0); assert(s.ok() && !w.done); // No timeout and nobody should do our job // SetNewMemtableAndNewLogFile() will release and reacquire mutex @@ -1886,12 +1947,9 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, cfd->imm()->FlushRequested(); MaybeScheduleFlushOrCompaction(); - assert(!writers_.empty()); - assert(writers_.front() == &w); - EndWrite(&w, &w, s); + write_thread_.ExitWriteThread(&w, &w, s); } - if (s.ok() && options.wait) { // Wait until the 
compaction completes s = WaitForFlushMemTable(cfd); @@ -1929,10 +1987,10 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { } if (is_flush_pending) { // memtable flush needed - if (bg_flush_scheduled_ < options_.max_background_flushes) { + if (bg_flush_scheduled_ < db_options_.max_background_flushes) { bg_flush_scheduled_++; env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH); - } else if (options_.max_background_flushes > 0) { + } else if (db_options_.max_background_flushes > 0) { bg_schedule_needed_ = true; } } @@ -1947,12 +2005,12 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { // Schedule BGWorkCompaction if there's a compaction pending (or a memtable // flush, but the HIGH pool is not enabled) - // Do it only if max_background_compactions hasn't been reached and, in case - // bg_manual_only_ > 0, if it's a manual compaction. - if ((manual_compaction_ || is_compaction_needed || - (is_flush_pending && options_.max_background_flushes == 0)) && - (!bg_manual_only_ || manual_compaction_)) { - if (bg_compaction_scheduled_ < options_.max_background_compactions) { + // Do it only if max_background_compactions hasn't been reached and + // bg_manual_only_ == 0 + if (!bg_manual_only_ && + (is_compaction_needed || + (is_flush_pending && db_options_.max_background_flushes == 0))) { + if (bg_compaction_scheduled_ < db_options_.max_background_compactions) { bg_compaction_scheduled_++; env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); } else { @@ -1963,7 +2021,7 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { } void DBImpl::RecordFlushIOStats() { - RecordTick(stats_, FLUSH_WRITE_BYTES, iostats_context.bytes_written); + RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written)); IOSTATS_RESET(bytes_written); } @@ -1988,6 +2046,11 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, DeletionState& deletion_state, LogBuffer* log_buffer) { mutex_.AssertHeld(); + + if (!bg_error_.ok()) { + return bg_error_; + } + // call_status is failure 
if at least one flush was a failure. even if // flushing one column family reports a failure, we will continue flushing // other column families. however, call_status will be a failure in that case. @@ -2002,7 +2065,7 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, "BackgroundCallFlush doing FlushMemTableToOutputFile with column " "family [%s], flush slots available %d", cfd->GetName().c_str(), - options_.max_background_flushes - bg_flush_scheduled_); + db_options_.max_background_flushes - bg_flush_scheduled_); flush_status = FlushMemTableToOutputFile(cfd, madeProgress, deletion_state, log_buffer); } @@ -2020,7 +2083,7 @@ void DBImpl::BackgroundCallFlush() { DeletionState deletion_state(true); assert(bg_flush_scheduled_); - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_.info_log.get()); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); { MutexLock l(&mutex_); @@ -2036,12 +2099,12 @@ void DBImpl::BackgroundCallFlush() { default_cf_internal_stats_->BumpAndGetBackgroundErrorCount(); bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); - Log(options_.info_log, + Log(db_options_.info_log, "Waiting after background flush error: %s" "Accumulated background error counts: %" PRIu64, s.ToString().c_str(), error_cnt); log_buffer.FlushBufferToLog(); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); env_->SleepForMicroseconds(1000000); mutex_.Lock(); } @@ -2087,7 +2150,7 @@ void DBImpl::BackgroundCallCompaction() { DeletionState deletion_state(true); MaybeDumpStats(); - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_.info_log.get()); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); { MutexLock l(&mutex_); assert(bg_compaction_scheduled_); @@ -2104,11 +2167,11 @@ void DBImpl::BackgroundCallCompaction() { bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); log_buffer.FlushBufferToLog(); - Log(options_.info_log, + 
Log(db_options_.info_log, "Waiting after background compaction error: %s, " "Accumulated background error counts: %" PRIu64, s.ToString().c_str(), error_cnt); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); env_->SleepForMicroseconds(1000000); mutex_.Lock(); } @@ -2152,7 +2215,7 @@ void DBImpl::BackgroundCallCompaction() { } if (madeProgress || bg_compaction_scheduled_ == 0 || bg_manual_only_ > 0) { // signal if - // * madeProgress -- need to wakeup MakeRoomForWrite + // * madeProgress -- need to wakeup DelayWrite // * bg_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl // * bg_manual_only_ > 0 -- need to wakeup RunManualCompaction // If none of this is true, there is no need to signal since nobody is @@ -2175,9 +2238,23 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, bool is_manual = (manual_compaction_ != nullptr) && (manual_compaction_->in_progress == false); + if (!bg_error_.ok()) { + if (is_manual) { + manual_compaction_->status = bg_error_; + manual_compaction_->done = true; + manual_compaction_->in_progress = false; + manual_compaction_ = nullptr; + } + return bg_error_; + } + if (is_manual) { // another thread cannot pick up the same work manual_compaction_->in_progress = true; + } else if (manual_compaction_ != nullptr) { + // there should be no automatic compactions running when manual compaction + // is running + return Status::OK(); } // FLUSH preempts compaction @@ -2188,7 +2265,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, log_buffer, "BackgroundCompaction doing FlushMemTableToOutputFile, " "compaction slots available %d", - options_.max_background_compactions - bg_compaction_scheduled_); + db_options_.max_background_compactions - bg_compaction_scheduled_); cfd->Ref(); flush_stat = FlushMemTableToOutputFile(cfd, madeProgress, deletion_state, log_buffer); @@ -2250,13 +2327,14 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // file if there is alive snapshot pointing to it 
assert(c->num_input_files(1) == 0); assert(c->level() == 0); - assert(c->column_family_data()->options()->compaction_style == + assert(c->column_family_data()->ioptions()->compaction_style == kCompactionStyleFIFO); for (const auto& f : *c->inputs(0)) { c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); } status = versions_->LogAndApply(c->column_family_data(), c->edit(), &mutex_, db_directory_.get()); + // Use latest MutableCFOptions InstallSuperVersion(c->column_family_data(), deletion_state); LogToBuffer(log_buffer, "[%s] Deleted %d files\n", c->column_family_data()->GetName().c_str(), @@ -2273,14 +2351,16 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, f->smallest_seqno, f->largest_seqno); status = versions_->LogAndApply(c->column_family_data(), c->edit(), &mutex_, db_directory_.get()); + // Use latest MutableCFOptions InstallSuperVersion(c->column_family_data(), deletion_state); Version::LevelSummaryStorage tmp; LogToBuffer( - log_buffer, "[%s] Moved #%lld to level-%d %lld bytes %s: %s\n", + log_buffer, + "[%s] Moved #%" PRIu64 " to level-%d %" PRIu64 " bytes %s: %s\n", c->column_family_data()->GetName().c_str(), - static_cast(f->fd.GetNumber()), c->level() + 1, - static_cast(f->fd.GetFileSize()), + f->fd.GetNumber(), c->level() + 1, + f->fd.GetFileSize(), status.ToString().c_str(), c->input_version()->LevelSummary(&tmp)); c->ReleaseCompactionFiles(status); *madeProgress = true; @@ -2297,12 +2377,12 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, if (status.ok()) { // Done - } else if (shutting_down_.Acquire_Load()) { + } else if (status.IsShutdownInProgress()) { // Ignore compaction errors found during shutting down } else { - Log(InfoLogLevel::WARN_LEVEL, options_.info_log, "Compaction error: %s", + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "Compaction error: %s", status.ToString().c_str()); - if (options_.paranoid_checks && bg_error_.ok()) { + if (db_options_.paranoid_checks && bg_error_.ok()) { bg_error_ = status; } } @@ 
-2333,8 +2413,8 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // We only compacted part of the requested range. Update *m // to the range that is left to be compacted. // Universal and FIFO compactions should always compact the whole range - assert(m->cfd->options()->compaction_style != kCompactionStyleUniversal); - assert(m->cfd->options()->compaction_style != kCompactionStyleFIFO); + assert(m->cfd->ioptions()->compaction_style != kCompactionStyleUniversal); + assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO); m->tmp_storage = *manual_end; m->begin = &m->tmp_storage; } @@ -2414,9 +2494,9 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { compact->outputs.push_back(out); // Make the output file - std::string fname = TableFileName(options_.db_paths, file_number, + std::string fname = TableFileName(db_options_.db_paths, file_number, compact->compaction->GetOutputPathId()); - Status s = env_->NewWritableFile(fname, &compact->outfile, storage_options_); + Status s = env_->NewWritableFile(fname, &compact->outfile, env_options_); if (s.ok()) { compact->outfile->SetIOPriority(Env::IO_LOW); @@ -2425,10 +2505,11 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { ColumnFamilyData* cfd = compact->compaction->column_family_data(); compact->builder.reset(NewTableBuilder( - *cfd->options(), cfd->internal_comparator(), compact->outfile.get(), - compact->compaction->OutputCompressionType())); + *cfd->ioptions(), cfd->internal_comparator(), compact->outfile.get(), + compact->compaction->OutputCompressionType(), + cfd->ioptions()->compression_opts)); } - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); return s; } @@ -2456,8 +2537,8 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, compact->builder.reset(); // Finish and check for file errors - if (s.ok() && !options_.disableDataSync) { - if (options_.use_fsync) { + if (s.ok() && !db_options_.disableDataSync) { + if 
(db_options_.use_fsync) { StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); s = compact->outfile->Fsync(); } else { @@ -2475,11 +2556,11 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, ColumnFamilyData* cfd = compact->compaction->column_family_data(); FileDescriptor fd(output_number, output_path_id, current_bytes); Iterator* iter = cfd->table_cache()->NewIterator( - ReadOptions(), storage_options_, cfd->internal_comparator(), fd); + ReadOptions(), env_options_, cfd->internal_comparator(), fd); s = iter->status(); delete iter; if (s.ok()) { - Log(options_.info_log, "[%s] Generated table #%" PRIu64 ": %" PRIu64 + Log(db_options_.info_log, "[%s] Generated table #%" PRIu64 ": %" PRIu64 " keys, %" PRIu64 " bytes", cfd->GetName().c_str(), output_number, current_entries, current_bytes); @@ -2498,7 +2579,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact, // This ensures that a concurrent compaction did not erroneously // pick the same files to compact. 
if (!versions_->VerifyCompactionFileConsistency(compact->compaction)) { - Log(options_.info_log, "[%s] Compaction %d@%d + %d@%d files aborted", + Log(db_options_.info_log, "[%s] Compaction %d@%d + %d@%d files aborted", compact->compaction->column_family_data()->GetName().c_str(), compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), @@ -2547,7 +2628,7 @@ inline SequenceNumber DBImpl::findEarliestVisibleSnapshot( prev = cur; // assignment assert(prev); } - Log(options_.info_log, + Log(db_options_.info_log, "Looking for seqid %" PRIu64 " but maxseqid is %" PRIu64 "", in, snapshots[snapshots.size() - 1]); assert(0); @@ -2557,6 +2638,10 @@ inline SequenceNumber DBImpl::findEarliestVisibleSnapshot( uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, DeletionState& deletion_state, LogBuffer* log_buffer) { + if (db_options_.max_background_flushes > 0) { + // flush thread will take care of this + return 0; + } if (cfd->imm()->imm_flush_needed.NoBarrier_Load() != nullptr) { const uint64_t imm_start = env_->NowMicros(); mutex_.Lock(); @@ -2564,7 +2649,7 @@ uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, cfd->Ref(); FlushMemTableToOutputFile(cfd, nullptr, deletion_state, log_buffer); cfd->Unref(); - bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary + bg_cv_.SignalAll(); // Wakeup DelayWrite() if necessary } mutex_.Unlock(); log_buffer->FlushBufferToLog(); @@ -2597,22 +2682,42 @@ Status DBImpl::ProcessKeyValueCompaction( SequenceNumber visible_in_snapshot = kMaxSequenceNumber; ColumnFamilyData* cfd = compact->compaction->column_family_data(); MergeHelper merge( - cfd->user_comparator(), cfd->options()->merge_operator.get(), - options_.info_log.get(), cfd->options()->min_partial_merge_operands, + cfd->user_comparator(), cfd->ioptions()->merge_operator, + db_options_.info_log.get(), cfd->options()->min_partial_merge_operands, false /* internal key corruption is expected */); - 
auto compaction_filter = cfd->options()->compaction_filter; + auto compaction_filter = cfd->ioptions()->compaction_filter; std::unique_ptr compaction_filter_from_factory = nullptr; if (!compaction_filter) { auto context = compact->GetFilterContextV1(); compaction_filter_from_factory = - cfd->options()->compaction_filter_factory->CreateCompactionFilter( + cfd->ioptions()->compaction_filter_factory->CreateCompactionFilter( context); compaction_filter = compaction_filter_from_factory.get(); } + int64_t key_drop_user = 0; + int64_t key_drop_newer_entry = 0; + int64_t key_drop_obsolete = 0; + int64_t loop_cnt = 0; while (input->Valid() && !shutting_down_.Acquire_Load() && !cfd->IsDropped()) { - RecordCompactionIOStats(); + if (++loop_cnt > 1000) { + if (key_drop_user > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user); + key_drop_user = 0; + } + if (key_drop_newer_entry > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY, + key_drop_newer_entry); + key_drop_newer_entry = 0; + } + if (key_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, key_drop_obsolete); + key_drop_obsolete = 0; + } + RecordCompactionIOStats(); + loop_cnt = 0; + } // FLUSH preempts compaction // TODO(icanadi) this currently only checks if flush is necessary on // compacting column family. we should also check if flush is necessary on @@ -2693,7 +2798,7 @@ Status DBImpl::ProcessKeyValueCompaction( ParseInternalKey(key, &ikey); // no value associated with delete value.clear(); - RecordTick(stats_, COMPACTION_KEY_DROP_USER); + ++key_drop_user; } else if (value_changed) { value = compaction_filter_value; } @@ -2717,7 +2822,7 @@ Status DBImpl::ProcessKeyValueCompaction( // TODO: why not > ? 
assert(last_sequence_for_key >= ikey.sequence); drop = true; // (A) - RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY); + ++key_drop_newer_entry; } else if (ikey.type == kTypeDeletion && ikey.sequence <= earliest_snapshot && compact->compaction->KeyNotExistsBeyondOutputLevel(ikey.user_key)) { @@ -2729,7 +2834,7 @@ Status DBImpl::ProcessKeyValueCompaction( // few iterations of this loop (by rule (A) above). // Therefore this deletion marker is obsolete and can be dropped. drop = true; - RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE); + ++key_drop_obsolete; } else if (ikey.type == kTypeMerge) { if (!merge.HasOperator()) { LogToBuffer(log_buffer, "Options::merge_operator is null."); @@ -2745,7 +2850,7 @@ Status DBImpl::ProcessKeyValueCompaction( // optimization in BuildTable. int steps = 0; merge.MergeUntil(input, prev_snapshot, bottommost_level, - options_.statistics.get(), &steps); + db_options_.statistics.get(), &steps); // Skip the Merge ops combined_idx = combined_idx - 1 + steps; @@ -2876,7 +2981,15 @@ Status DBImpl::ProcessKeyValueCompaction( input->Next(); } } - + if (key_drop_user > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user); + } + if (key_drop_newer_entry > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY, key_drop_newer_entry); + } + if (key_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, key_drop_obsolete); + } RecordCompactionIOStats(); return status; @@ -2964,7 +3077,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, cfd->GetName().c_str(), compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), compact->compaction->output_level(), compact->compaction->score(), - options_.max_background_compactions - bg_compaction_scheduled_); + db_options_.max_background_compactions - bg_compaction_scheduled_); char scratch[2345]; compact->compaction->Summary(scratch, sizeof(scratch)); LogToBuffer(log_buffer, "[%s] Compaction start summary: 
%s\n", @@ -3004,9 +3117,6 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, const uint64_t start_micros = env_->NowMicros(); unique_ptr input(versions_->MakeInputIterator(compact->compaction)); input->SeekToFirst(); - shared_ptr backup_input( - versions_->MakeInputIterator(compact->compaction)); - backup_input->SeekToFirst(); Status status; ParsedInternalKey ikey; @@ -3014,19 +3124,35 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, = nullptr; auto context = compact->GetFilterContext(); compaction_filter_from_factory_v2 = - cfd->options()->compaction_filter_factory_v2->CreateCompactionFilterV2( - context); + cfd->ioptions()->compaction_filter_factory_v2-> + CreateCompactionFilterV2(context); auto compaction_filter_v2 = compaction_filter_from_factory_v2.get(); - // temp_backup_input always point to the start of the current buffer - // temp_backup_input = backup_input; - // iterate through input, - // 1) buffer ineligible keys and value keys into 2 separate buffers; - // 2) send value_buffer to compaction filter and alternate the values; - // 3) merge value_buffer with ineligible_value_buffer; - // 4) run the modified "compaction" using the old for loop. - if (compaction_filter_v2) { + if (!compaction_filter_v2) { + status = ProcessKeyValueCompaction( + is_snapshot_supported, + visible_at_tip, + earliest_snapshot, + latest_snapshot, + deletion_state, + bottommost_level, + imm_micros, + input.get(), + compact, + false, + log_buffer); + } else { + // temp_backup_input always point to the start of the current buffer + // temp_backup_input = backup_input; + // iterate through input, + // 1) buffer ineligible keys and value keys into 2 separate buffers; + // 2) send value_buffer to compaction filter and alternate the values; + // 3) merge value_buffer with ineligible_value_buffer; + // 4) run the modified "compaction" using the old for loop. 
+ shared_ptr backup_input( + versions_->MakeInputIterator(compact->compaction)); + backup_input->SeekToFirst(); while (backup_input->Valid() && !shutting_down_.Acquire_Load() && !cfd->IsDropped()) { // FLUSH preempts compaction @@ -3040,12 +3166,12 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, if (!ParseInternalKey(key, &ikey)) { // log error - Log(options_.info_log, "[%s] Failed to parse key: %s", + Log(db_options_.info_log, "[%s] Failed to parse key: %s", cfd->GetName().c_str(), key.ToString().c_str()); continue; } else { const SliceTransform* transformer = - cfd->options()->compaction_filter_factory_v2->GetPrefixExtractor(); + cfd->ioptions()->compaction_filter_factory_v2->GetPrefixExtractor(); const auto key_prefix = transformer->Transform(ikey.user_key); if (!prefix_initialized) { compact->cur_prefix_ = key_prefix.ToString(); @@ -3154,21 +3280,6 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, log_buffer); } // checking for compaction filter v2 - if (!compaction_filter_v2) { - status = ProcessKeyValueCompaction( - is_snapshot_supported, - visible_at_tip, - earliest_snapshot, - latest_snapshot, - deletion_state, - bottommost_level, - imm_micros, - input.get(), - compact, - false, - log_buffer); - } - if (status.ok() && (shutting_down_.Acquire_Load() || cfd->IsDropped())) { status = Status::ShutdownInProgress( "Database shutdown or Column family drop during compaction"); @@ -3181,7 +3292,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, } input.reset(); - if (!options_.disableDataSync) { + if (!db_options_.disableDataSync) { db_directory_->Fsync(); } @@ -3213,7 +3324,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, RecordCompactionIOStats(); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); mutex_.Lock(); cfd->internal_stats()->AddCompactionStats( compact->compaction->output_level(), stats); @@ -3224,6 +3335,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, if (status.ok()) { 
status = InstallCompactionResults(compact, log_buffer); + // Use latest MutableCFOptions InstallSuperVersion(cfd, deletion_state); } Version::LevelSummaryStorage tmp; @@ -3283,31 +3395,18 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, SuperVersion* super_version, Arena* arena) { Iterator* internal_iter; - if (arena != nullptr) { - // Need to create internal iterator from the arena. - MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena); - // Collect iterator for mutable mem - merge_iter_builder.AddIterator( - super_version->mem->NewIterator(options, false, arena)); - // Collect all needed child iterators for immutable memtables - super_version->imm->AddIterators(options, &merge_iter_builder); - // Collect iterators for files in L0 - Ln - super_version->current->AddIterators(options, storage_options_, - &merge_iter_builder); - internal_iter = merge_iter_builder.Finish(); - } else { - // Need to create internal iterator using malloc. - std::vector iterator_list; - // Collect iterator for mutable mem - iterator_list.push_back(super_version->mem->NewIterator(options)); - // Collect all needed child iterators for immutable memtables - super_version->imm->AddIterators(options, &iterator_list); - // Collect iterators for files in L0 - Ln - super_version->current->AddIterators(options, storage_options_, - &iterator_list); - internal_iter = NewMergingIterator(&cfd->internal_comparator(), - &iterator_list[0], iterator_list.size()); - } + assert(arena != nullptr); + // Need to create internal iterator from the arena. 
+ MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena); + // Collect iterator for mutable mem + merge_iter_builder.AddIterator( + super_version->mem->NewIterator(options, arena)); + // Collect all needed child iterators for immutable memtables + super_version->imm->AddIterators(options, &merge_iter_builder); + // Collect iterators for files in L0 - Ln + super_version->current->AddIterators(options, env_options_, + &merge_iter_builder); + internal_iter = merge_iter_builder.Finish(); IterState* cleanup = new IterState(this, &mutex_, super_version); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); @@ -3341,6 +3440,7 @@ void DBImpl::InstallSuperVersion(ColumnFamilyData* cfd, SuperVersion* new_superversion = (deletion_state.new_superversion != nullptr) ? deletion_state.new_superversion : new SuperVersion(); + // Use latest MutableCFOptions SuperVersion* old_superversion = cfd->InstallSuperVersion(new_superversion, &mutex_); deletion_state.new_superversion = nullptr; @@ -3351,7 +3451,7 @@ Status DBImpl::GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value, bool* value_found) { StopWatch sw(env_, stats_, DB_GET); - PERF_TIMER_AUTO(get_snapshot_time); + PERF_TIMER_GUARD(get_snapshot_time); auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); @@ -3375,27 +3475,27 @@ Status DBImpl::GetImpl(const ReadOptions& options, // merge_operands will contain the sequence of merges in the latter case. 
LookupKey lkey(key, snapshot); PERF_TIMER_STOP(get_snapshot_time); - if (sv->mem->Get(lkey, value, &s, merge_context, *cfd->options())) { + + if (sv->mem->Get(lkey, value, &s, &merge_context)) { // Done RecordTick(stats_, MEMTABLE_HIT); - } else if (sv->imm->Get(lkey, value, &s, merge_context, *cfd->options())) { + } else if (sv->imm->Get(lkey, value, &s, &merge_context)) { // Done RecordTick(stats_, MEMTABLE_HIT); } else { - PERF_TIMER_START(get_from_output_files_time); - + PERF_TIMER_GUARD(get_from_output_files_time); sv->current->Get(options, lkey, value, &s, &merge_context, value_found); - PERF_TIMER_STOP(get_from_output_files_time); RecordTick(stats_, MEMTABLE_MISS); } - PERF_TIMER_START(get_post_process_time); + { + PERF_TIMER_GUARD(get_post_process_time); - ReturnAndCleanupSuperVersion(cfd, sv); + ReturnAndCleanupSuperVersion(cfd, sv); - RecordTick(stats_, NUMBER_KEYS_READ); - RecordTick(stats_, BYTES_READ, value->size()); - PERF_TIMER_STOP(get_post_process_time); + RecordTick(stats_, NUMBER_KEYS_READ); + RecordTick(stats_, BYTES_READ, value->size()); + } return s; } @@ -3405,7 +3505,7 @@ std::vector DBImpl::MultiGet( const std::vector& keys, std::vector* values) { StopWatch sw(env_, stats_, DB_MULTIGET); - PERF_TIMER_AUTO(get_snapshot_time); + PERF_TIMER_GUARD(get_snapshot_time); SequenceNumber snapshot; @@ -3464,12 +3564,9 @@ std::vector DBImpl::MultiGet( assert(mgd_iter != multiget_cf_data.end()); auto mgd = mgd_iter->second; auto super_version = mgd->super_version; - auto cfd = mgd->cfd; - if (super_version->mem->Get(lkey, value, &s, merge_context, - *cfd->options())) { + if (super_version->mem->Get(lkey, value, &s, &merge_context)) { // Done - } else if (super_version->imm->Get(lkey, value, &s, merge_context, - *cfd->options())) { + } else if (super_version->imm->Get(lkey, value, &s, &merge_context)) { // Done } else { super_version->current->Get(options, lkey, value, &s, &merge_context); @@ -3481,7 +3578,7 @@ std::vector DBImpl::MultiGet( } // Post 
processing (decrement reference counts and record statistics) - PERF_TIMER_START(get_post_process_time); + PERF_TIMER_GUARD(get_post_process_time); autovector superversions_to_delete; // TODO(icanadi) do we need lock here or just around Cleanup()? @@ -3536,14 +3633,15 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options, auto cfd = versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); assert(cfd != nullptr); + // Use latest MutableCFOptions delete cfd->InstallSuperVersion(new SuperVersion(), &mutex_); *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); - Log(options_.info_log, "Created column family [%s] (ID %u)", + Log(db_options_.info_log, "Created column family [%s] (ID %u)", column_family_name.c_str(), (unsigned)cfd->GetID()); max_total_in_memory_state_ += cfd->options()->write_buffer_size * cfd->options()->max_write_buffer_number; } else { - Log(options_.info_log, "Creating column family [%s] FAILED -- %s", + Log(db_options_.info_log, "Creating column family [%s] FAILED -- %s", column_family_name.c_str(), s.ToString().c_str()); } return s; @@ -3560,6 +3658,7 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { edit.DropColumnFamily(); edit.SetColumnFamily(cfd->GetID()); + Status s; { MutexLock l(&mutex_); @@ -3567,7 +3666,12 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { s = Status::InvalidArgument("Column family already dropped!\n"); } if (s.ok()) { + // we drop column family from a single write thread + WriteThread::Writer w(&mutex_); + s = write_thread_.EnterWriteThread(&w, 0); + assert(s.ok() && !w.done); // No timeout and nobody should do our job s = versions_->LogAndApply(cfd, &edit, &mutex_); + write_thread_.ExitWriteThread(&w, &w, s); } } @@ -3575,9 +3679,11 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { assert(cfd->IsDropped()); max_total_in_memory_state_ -= cfd->options()->write_buffer_size * cfd->options()->max_write_buffer_number; - 
Log(options_.info_log, "Dropped column family with id %u\n", cfd->GetID()); + Log(db_options_.info_log, "Dropped column family with id %u\n", + cfd->GetID()); } else { - Log(options_.info_log, "Dropping column family with id %u FAILED -- %s\n", + Log(db_options_.info_log, + "Dropping column family with id %u FAILED -- %s\n", cfd->GetID(), s.ToString().c_str()); } @@ -3595,27 +3701,27 @@ bool DBImpl::KeyMayExist(const ReadOptions& options, roptions.read_tier = kBlockCacheTier; // read from block cache only auto s = GetImpl(roptions, column_family, key, value, value_found); - // If options.block_cache != nullptr and the index block of the table didn't + // If block_cache is enabled and the index block of the table didn't // not present in block_cache, the return value will be Status::Incomplete. // In this case, key may still exist in the table. return s.ok() || s.IsIncomplete(); } -Iterator* DBImpl::NewIterator(const ReadOptions& options, +Iterator* DBImpl::NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); - if (options.tailing) { + if (read_options.tailing) { #ifdef ROCKSDB_LITE // not supported in lite version return nullptr; #else - // TODO(ljin): remove tailing iterator - auto iter = new ForwardIterator(this, options, cfd); - return NewDBIterator(env_, *cfd->options(), cfd->user_comparator(), iter, - kMaxSequenceNumber); -// return new TailingIterator(env_, this, options, cfd); + auto iter = new ForwardIterator(this, read_options, cfd); + return NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter, + kMaxSequenceNumber, + cfd->options()->max_sequential_skip_in_iterations, + read_options.iterate_upper_bound); #endif } else { SequenceNumber latest_snapshot = versions_->LastSequence(); @@ -3623,8 +3729,9 @@ Iterator* DBImpl::NewIterator(const ReadOptions& options, sv = cfd->GetReferencedSuperVersion(&mutex_); auto snapshot = - options.snapshot != nullptr 
- ? reinterpret_cast(options.snapshot)->number_ + read_options.snapshot != nullptr + ? reinterpret_cast( + read_options.snapshot)->number_ : latest_snapshot; // Try to generate a DB iterator tree in continuous memory area to be @@ -3670,17 +3777,22 @@ Iterator* DBImpl::NewIterator(const ReadOptions& options, // likely that any iterator pointer is close to the iterator it points to so // that they are likely to be in the same cache line and/or page. ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, *cfd->options(), cfd->user_comparator(), snapshot); + env_, *cfd->ioptions(), cfd->user_comparator(), + snapshot, cfd->options()->max_sequential_skip_in_iterations, + read_options.iterate_upper_bound); + Iterator* internal_iter = - NewInternalIterator(options, cfd, sv, db_iter->GetArena()); + NewInternalIterator(read_options, cfd, sv, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); return db_iter; } + // To stop compiler from complaining + return nullptr; } Status DBImpl::NewIterators( - const ReadOptions& options, + const ReadOptions& read_options, const std::vector& column_families, std::vector* iterators) { iterators->clear(); @@ -3689,7 +3801,7 @@ Status DBImpl::NewIterators( std::vector super_versions; super_versions.reserve(column_families.size()); - if (!options.tailing) { + if (!read_options.tailing) { mutex_.Lock(); latest_snapshot = versions_->LastSequence(); for (auto cfh : column_families) { @@ -3699,17 +3811,18 @@ Status DBImpl::NewIterators( mutex_.Unlock(); } - if (options.tailing) { + if (read_options.tailing) { #ifdef ROCKSDB_LITE return Status::InvalidArgument( "Tailing interator not supported in RocksDB lite"); #else for (auto cfh : column_families) { auto cfd = reinterpret_cast(cfh)->cfd(); - auto iter = new ForwardIterator(this, options, cfd); + auto iter = new ForwardIterator(this, read_options, cfd); iterators->push_back( - NewDBIterator(env_, *cfd->options(), cfd->user_comparator(), iter, - kMaxSequenceNumber)); + 
NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter, + kMaxSequenceNumber, + cfd->options()->max_sequential_skip_in_iterations)); } #endif } else { @@ -3718,14 +3831,18 @@ Status DBImpl::NewIterators( auto cfd = cfh->cfd(); auto snapshot = - options.snapshot != nullptr - ? reinterpret_cast(options.snapshot)->number_ + read_options.snapshot != nullptr + ? reinterpret_cast( + read_options.snapshot)->number_ : latest_snapshot; - auto iter = NewInternalIterator(options, cfd, super_versions[i]); - iter = NewDBIterator(env_, *cfd->options(), - cfd->user_comparator(), iter, snapshot); - iterators->push_back(iter); + ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( + env_, *cfd->ioptions(), cfd->user_comparator(), snapshot, + cfd->options()->max_sequential_skip_in_iterations); + Iterator* internal_iter = NewInternalIterator( + read_options, cfd, super_versions[i], db_iter->GetArena()); + db_iter->SetIterUnderDBIter(internal_iter); + iterators->push_back(db_iter); } } @@ -3762,7 +3879,7 @@ Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, const Slice& key, const Slice& val) { auto cfh = reinterpret_cast(column_family); - if (!cfh->cfd()->options()->merge_operator) { + if (!cfh->cfd()->ioptions()->merge_operator) { return Status::NotSupported("Provide a merge_operator when opening DB"); } else { return DB::Merge(o, column_family, key, val); @@ -3774,88 +3891,12 @@ Status DBImpl::Delete(const WriteOptions& options, return DB::Delete(options, column_family, key); } -// REQUIRES: mutex_ is held -Status DBImpl::BeginWrite(Writer* w, uint64_t expiration_time) { - // the following code block pushes the current writer "w" into the writer - // queue "writers_" and wait until one of the following conditions met: - // 1. the job of "w" has been done by some other writers. - // 2. "w" becomes the first writer in "writers_" - // 3. "w" timed-out. 
- mutex_.AssertHeld(); - writers_.push_back(w); - - bool timed_out = false; - while (!w->done && w != writers_.front()) { - if (expiration_time == 0) { - w->cv.Wait(); - } else if (w->cv.TimedWait(expiration_time)) { - if (w->in_batch_group) { - // then it means the front writer is currently doing the - // write on behalf of this "timed-out" writer. Then it - // should wait until the write completes. - expiration_time = 0; - } else { - timed_out = true; - break; - } - } - } - - if (timed_out) { -#ifndef NDEBUG - bool found = false; -#endif - for (auto iter = writers_.begin(); iter != writers_.end(); iter++) { - if (*iter == w) { - writers_.erase(iter); -#ifndef NDEBUG - found = true; -#endif - break; - } - } -#ifndef NDEBUG - assert(found); -#endif - // writers_.front() might still be in cond_wait without a time-out. - // As a result, we need to signal it to wake it up. Otherwise no - // one else will wake him up, and RocksDB will hang. - if (!writers_.empty()) { - writers_.front()->cv.Signal(); - } - return Status::TimedOut(); - } - return Status::OK(); -} - -// REQUIRES: mutex_ is held -void DBImpl::EndWrite(Writer* w, Writer* last_writer, Status status) { - // Pop out the current writer and all writers being pushed before the - // current writer from the writer queue. 
- mutex_.AssertHeld(); - while (!writers_.empty()) { - Writer* ready = writers_.front(); - writers_.pop_front(); - if (ready != w) { - ready->status = status; - ready->done = true; - ready->cv.Signal(); - } - if (ready == last_writer) break; - } - - // Notify new head of write queue - if (!writers_.empty()) { - writers_.front()->cv.Signal(); - } -} - Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (my_batch == nullptr) { return Status::Corruption("Batch is nullptr!"); } - PERF_TIMER_AUTO(write_pre_and_post_process_time); - Writer w(&mutex_); + PERF_TIMER_GUARD(write_pre_and_post_process_time); + WriteThread::Writer w(&mutex_); w.batch = my_batch; w.sync = options.sync; w.disableWAL = options.disableWAL; @@ -3864,10 +3905,12 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { w.timeout_hint_us = options.timeout_hint_us; uint64_t expiration_time = 0; + bool has_timeout = false; if (w.timeout_hint_us == 0) { - w.timeout_hint_us = kNoTimeOut; + w.timeout_hint_us = WriteThread::kNoTimeOut; } else { expiration_time = env_->NowMicros() + w.timeout_hint_us; + has_timeout = true; } if (!options.disableWAL) { @@ -3877,7 +3920,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { WriteContext context; mutex_.Lock(); - Status status = BeginWrite(&w, expiration_time); + Status status = write_thread_.EnterWriteThread(&w, expiration_time); assert(status.ok() || status.IsTimedOut()); if (status.IsTimedOut()) { mutex_.Unlock(); @@ -3902,59 +3945,55 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { assert(!single_column_family_mode_ || versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1); - uint64_t flush_column_family_if_log_file = 0; - uint64_t max_total_wal_size = (options_.max_total_wal_size == 0) + uint64_t max_total_wal_size = (db_options_.max_total_wal_size == 0) ? 
4 * max_total_in_memory_state_ - : options_.max_total_wal_size; + : db_options_.max_total_wal_size; if (UNLIKELY(!single_column_family_mode_) && alive_log_files_.begin()->getting_flushed == false && total_log_size_ > max_total_wal_size) { - flush_column_family_if_log_file = alive_log_files_.begin()->number; + uint64_t flush_column_family_if_log_file = alive_log_files_.begin()->number; alive_log_files_.begin()->getting_flushed = true; - Log(options_.info_log, + Log(db_options_.info_log, "Flushing all column families with data in WAL number %" PRIu64 ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64, flush_column_family_if_log_file, total_log_size_, max_total_wal_size); - } - - if (LIKELY(single_column_family_mode_)) { - // fast path - status = MakeRoomForWrite(default_cf_handle_->cfd(), - &context, expiration_time); - } else { - // refcounting cfd in iteration - bool dead_cfd = false; + // no need to refcount because drop is happening in write thread, so can't + // happen while we're in the write thread for (auto cfd : *versions_->GetColumnFamilySet()) { - cfd->Ref(); - if (flush_column_family_if_log_file != 0 && - cfd->GetLogNumber() <= flush_column_family_if_log_file) { - // log size excedded limit and we need to do flush - // SetNewMemtableAndNewLogFie may temporarily unlock and wait + if (cfd->GetLogNumber() <= flush_column_family_if_log_file) { status = SetNewMemtableAndNewLogFile(cfd, &context); + if (!status.ok()) { + break; + } cfd->imm()->FlushRequested(); - MaybeScheduleFlushOrCompaction(); - } else { - // May temporarily unlock and wait. 
- status = MakeRoomForWrite(cfd, &context, expiration_time); - } - - if (cfd->Unref()) { - dead_cfd = true; - } - if (!status.ok()) { - break; } } - if (dead_cfd) { - versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); - } + MaybeScheduleFlushOrCompaction(); + } + + if (UNLIKELY(status.ok() && !bg_error_.ok())) { + status = bg_error_; + } + + if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) { + status = ScheduleFlushes(&context); + } + + if (UNLIKELY(status.ok()) && + (write_controller_.IsStopped() || write_controller_.GetDelay() > 0)) { + DelayWrite(expiration_time); + } + + if (UNLIKELY(status.ok() && has_timeout && + env_->NowMicros() > expiration_time)) { + status = Status::TimedOut(); } uint64_t last_sequence = versions_->LastSequence(); - Writer* last_writer = &w; + WriteThread::Writer* last_writer = &w; if (status.ok()) { autovector write_batch_group; - BuildBatchGroup(&last_writer, &write_batch_group); + write_thread_.BuildBatchGroup(&last_writer, &write_batch_group); // Add to log and apply to memtable. 
We can release the lock // during this phase since &w is currently responsible for logging @@ -3987,17 +4026,17 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { uint64_t log_size = 0; if (!options.disableWAL) { - PERF_TIMER_START(write_wal_time); + PERF_TIMER_GUARD(write_wal_time); Slice log_entry = WriteBatchInternal::Contents(updates); status = log_->AddRecord(log_entry); total_log_size_ += log_entry.size(); alive_log_files_.back().AddSize(log_entry.size()); log_empty_ = false; log_size = log_entry.size(); - RecordTick(stats_, WAL_FILE_SYNCED); RecordTick(stats_, WAL_FILE_BYTES, log_size); if (status.ok() && options.sync) { - if (options_.use_fsync) { + RecordTick(stats_, WAL_FILE_SYNCED); + if (db_options_.use_fsync) { StopWatch(env_, stats_, WAL_FILE_SYNC_MICROS); status = log_->file()->Fsync(); } else { @@ -4005,13 +4044,13 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { status = log_->file()->Sync(); } } - PERF_TIMER_STOP(write_wal_time); } if (status.ok()) { - PERF_TIMER_START(write_memtable_time); + PERF_TIMER_GUARD(write_memtable_time); status = WriteBatchInternal::InsertInto( - updates, column_family_memtables_.get(), false, 0, this, false); + updates, column_family_memtables_.get(), + options.ignore_missing_column_families, 0, this, false); // A non-OK status here indicates iteration failure (either in-memory // writebatch corruption (very bad), or the client specified invalid // column family). This will later on trigger bg_error_. @@ -4020,8 +4059,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // into the memtable would result in a state that some write ops might // have succeeded in memtable but Status reports error for all writes. 
- PERF_TIMER_STOP(write_memtable_time); - SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence); } PERF_TIMER_START(write_pre_and_post_process_time); @@ -4043,253 +4080,62 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { } } } - if (options_.paranoid_checks && !status.ok() && + if (db_options_.paranoid_checks && !status.ok() && !status.IsTimedOut() && bg_error_.ok()) { bg_error_ = status; // stop compaction & fail any further writes } - EndWrite(&w, last_writer, status); + write_thread_.ExitWriteThread(&w, last_writer, status); mutex_.Unlock(); if (status.IsTimedOut()) { RecordTick(stats_, WRITE_TIMEDOUT); } - PERF_TIMER_STOP(write_pre_and_post_process_time); return status; } -// This function will be called only when the first writer succeeds. -// All writers in the to-be-built batch group will be processed. -// -// REQUIRES: Writer list must be non-empty -// REQUIRES: First writer must have a non-nullptr batch -void DBImpl::BuildBatchGroup(Writer** last_writer, - autovector* write_batch_group) { - assert(!writers_.empty()); - Writer* first = writers_.front(); - assert(first->batch != nullptr); - - size_t size = WriteBatchInternal::ByteSize(first->batch); - write_batch_group->push_back(first->batch); - - // Allow the group to grow up to a maximum size, but if the - // original write is small, limit the growth so we do not slow - // down the small write too much. - size_t max_size = 1 << 20; - if (size <= (128<<10)) { - max_size = size + (128<<10); - } - - *last_writer = first; - std::deque::iterator iter = writers_.begin(); - ++iter; // Advance past "first" - for (; iter != writers_.end(); ++iter) { - Writer* w = *iter; - if (w->sync && !first->sync) { - // Do not include a sync write into a batch handled by a non-sync write. - break; - } - - if (!w->disableWAL && first->disableWAL) { - // Do not include a write that needs WAL into a batch that has - // WAL disabled. 
- break; - } - - if (w->timeout_hint_us < first->timeout_hint_us) { - // Do not include those writes with shorter timeout. Otherwise, we might - // execute a write that should instead be aborted because of timeout. - break; - } +// REQUIRES: mutex_ is held +// REQUIRES: this thread is currently at the front of the writer queue +void DBImpl::DelayWrite(uint64_t expiration_time) { + StopWatch sw(env_, stats_, WRITE_STALL); + bool has_timeout = (expiration_time > 0); + auto delay = write_controller_.GetDelay(); + if (write_controller_.IsStopped() == false && delay > 0) { + mutex_.Unlock(); + env_->SleepForMicroseconds(delay); + mutex_.Lock(); + } - if (w->batch != nullptr) { - size += WriteBatchInternal::ByteSize(w->batch); - if (size > max_size) { - // Do not make batch too big + while (write_controller_.IsStopped()) { + if (has_timeout) { + bg_cv_.TimedWait(expiration_time); + if (env_->NowMicros() > expiration_time) { break; } - - write_batch_group->push_back(w->batch); + } else { + bg_cv_.Wait(); } - w->in_batch_group = true; - *last_writer = w; } } -// This function computes the amount of time in microseconds by which a write -// should be delayed based on the number of level-0 files according to the -// following formula: -// if n < bottom, return 0; -// if n >= top, return 1000; -// otherwise, let r = (n - bottom) / -// (top - bottom) -// and return r^2 * 1000. -// The goal of this formula is to gradually increase the rate at which writes -// are slowed. We also tried linear delay (r * 1000), but it seemed to do -// slightly worse. There is no other particular reason for choosing quadratic. -uint64_t DBImpl::SlowdownAmount(int n, double bottom, double top) { - uint64_t delay; - if (n >= top) { - delay = 1000; - } - else if (n < bottom) { - delay = 0; - } - else { - // If we are here, we know that: - // level0_start_slowdown <= n < level0_slowdown - // since the previous two conditions are false. 
- double how_much = - (double) (n - bottom) / - (top - bottom); - delay = std::max(how_much * how_much * 1000, 100.0); - } - assert(delay <= 1000); - return delay; -} - -// REQUIRES: mutex_ is held -// REQUIRES: this thread is currently at the front of the writer queue -Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, - WriteContext* context, - uint64_t expiration_time) { - mutex_.AssertHeld(); - assert(!writers_.empty()); - bool allow_delay = true; - bool allow_hard_rate_limit_delay = true; - bool allow_soft_rate_limit_delay = true; - uint64_t rate_limit_delay_millis = 0; - Status s; - double score; - // Once we schedule background work, we shouldn't schedule it again, since it - // might generate a tight feedback loop, constantly scheduling more background - // work, even if additional background work is not needed - bool schedule_background_work = true; - bool has_timeout = (expiration_time > 0); - - while (true) { - if (!bg_error_.ok()) { - // Yield previous error - s = bg_error_; - break; - } else if (has_timeout && env_->NowMicros() > expiration_time) { - s = Status::TimedOut(); - break; - } else if (allow_delay && cfd->NeedSlowdownForNumLevel0Files()) { - // We are getting close to hitting a hard limit on the number of - // L0 files. Rather than delaying a single write by several - // seconds when we hit the hard limit, start delaying each - // individual write by 0-1ms to reduce latency variance. Also, - // this delay hands over some CPU to the compaction thread in - // case it is sharing the same core as the writer. 
- uint64_t slowdown = - SlowdownAmount(cfd->current()->NumLevelFiles(0), - cfd->options()->level0_slowdown_writes_trigger, - cfd->options()->level0_stop_writes_trigger); - mutex_.Unlock(); - uint64_t delayed; - { - StopWatch sw(env_, stats_, STALL_L0_SLOWDOWN_COUNT, &delayed); - env_->SleepForMicroseconds(slowdown); - } - RecordTick(stats_, STALL_L0_SLOWDOWN_MICROS, delayed); - allow_delay = false; // Do not delay a single write more than once - mutex_.Lock(); - cfd->internal_stats()->AddCFStats( - InternalStats::LEVEL0_SLOWDOWN, delayed); - delayed_writes_++; - } else if (!cfd->mem()->ShouldFlush()) { - // There is room in current memtable - if (allow_delay) { - DelayLoggingAndReset(); - } - break; - } else if (cfd->NeedWaitForNumMemtables()) { - // We have filled up the current memtable, but the previous - // ones are still being flushed, so we wait. - DelayLoggingAndReset(); - Log(options_.info_log, "[%s] wait for memtable flush...\n", - cfd->GetName().c_str()); - if (schedule_background_work) { - MaybeScheduleFlushOrCompaction(); - schedule_background_work = false; - } - uint64_t stall; - { - StopWatch sw(env_, stats_, STALL_MEMTABLE_COMPACTION_COUNT, &stall); - if (!has_timeout) { - bg_cv_.Wait(); - } else { - bg_cv_.TimedWait(expiration_time); - } - } - RecordTick(stats_, STALL_MEMTABLE_COMPACTION_MICROS, stall); - cfd->internal_stats()->AddCFStats( - InternalStats::MEMTABLE_COMPACTION, stall); - } else if (cfd->NeedWaitForNumLevel0Files()) { - DelayLoggingAndReset(); - Log(options_.info_log, "[%s] wait for fewer level0 files...\n", - cfd->GetName().c_str()); - uint64_t stall; - { - StopWatch sw(env_, stats_, STALL_L0_NUM_FILES_COUNT, &stall); - if (!has_timeout) { - bg_cv_.Wait(); - } else { - bg_cv_.TimedWait(expiration_time); - } - } - RecordTick(stats_, STALL_L0_NUM_FILES_MICROS, stall); - cfd->internal_stats()->AddCFStats( - InternalStats::LEVEL0_NUM_FILES, stall); - } else if (allow_hard_rate_limit_delay && cfd->ExceedsHardRateLimit()) { - // Delay a 
write when the compaction score for any level is too large. - const int max_level = cfd->current()->MaxCompactionScoreLevel(); - score = cfd->current()->MaxCompactionScore(); - mutex_.Unlock(); - uint64_t delayed; - { - StopWatch sw(env_, stats_, HARD_RATE_LIMIT_DELAY_COUNT, &delayed); - env_->SleepForMicroseconds(1000); - } - // Make sure the following value doesn't round to zero. - uint64_t rate_limit = std::max((delayed / 1000), (uint64_t) 1); - rate_limit_delay_millis += rate_limit; - RecordTick(stats_, RATE_LIMIT_DELAY_MILLIS, rate_limit); - if (cfd->options()->rate_limit_delay_max_milliseconds > 0 && - rate_limit_delay_millis >= - (unsigned)cfd->options()->rate_limit_delay_max_milliseconds) { - allow_hard_rate_limit_delay = false; - } - mutex_.Lock(); - cfd->internal_stats()->RecordLevelNSlowdown(max_level, delayed, false); - } else if (allow_soft_rate_limit_delay && cfd->ExceedsSoftRateLimit()) { - const int max_level = cfd->current()->MaxCompactionScoreLevel(); - score = cfd->current()->MaxCompactionScore(); - // Delay a write when the compaction score for any level is too large. 
- // TODO: add statistics - uint64_t slowdown = SlowdownAmount(score, cfd->options()->soft_rate_limit, - cfd->options()->hard_rate_limit); - uint64_t elapsed = 0; - mutex_.Unlock(); - { - StopWatch sw(env_, stats_, SOFT_RATE_LIMIT_DELAY_COUNT, &elapsed); - env_->SleepForMicroseconds(slowdown); - rate_limit_delay_millis += slowdown; - } - allow_soft_rate_limit_delay = false; - mutex_.Lock(); - cfd->internal_stats()->RecordLevelNSlowdown(max_level, elapsed, true); - } else { - s = SetNewMemtableAndNewLogFile(cfd, context); - if (!s.ok()) { - break; - } - MaybeScheduleFlushOrCompaction(); +Status DBImpl::ScheduleFlushes(WriteContext* context) { + bool schedule_bg_work = false; + ColumnFamilyData* cfd; + while ((cfd = flush_scheduler_.GetNextColumnFamily()) != nullptr) { + schedule_bg_work = true; + auto status = SetNewMemtableAndNewLogFile(cfd, context); + if (cfd->Unref()) { + delete cfd; + } + if (!status.ok()) { + return status; } } - return s; + if (schedule_bg_work) { + MaybeScheduleFlushOrCompaction(); + } + return Status::OK(); } // REQUIRES: mutex_ is held @@ -4308,14 +4154,14 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, uint64_t new_log_number = creating_new_log ? versions_->NewFileNumber() : logfile_number_; SuperVersion* new_superversion = nullptr; + const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); mutex_.Unlock(); Status s; { - DelayLoggingAndReset(); if (creating_new_log) { - s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number), - &lfile, - env_->OptimizeForLogWrite(storage_options_)); + s = env_->NewWritableFile( + LogFileName(db_options_.wal_dir, new_log_number), + &lfile, env_->OptimizeForLogWrite(env_options_)); if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. 
@@ -4326,7 +4172,9 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, } if (s.ok()) { - new_mem = new MemTable(cfd->internal_comparator(), *cfd->options()); + new_mem = new MemTable(cfd->internal_comparator(), + *cfd->ioptions(), MemTableOptions(mutable_cf_options, + *cfd->options())); new_superversion = new SuperVersion(); } } @@ -4362,11 +4210,11 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, cfd->imm()->Add(cfd->mem()); new_mem->Ref(); cfd->SetMemtable(new_mem); - Log(options_.info_log, + Log(db_options_.info_log, "[%s] New memtable created with log file: #%" PRIu64 "\n", cfd->GetName().c_str(), logfile_number_); context->superversions_to_free_.push_back( - cfd->InstallSuperVersion(new_superversion, &mutex_)); + cfd->InstallSuperVersion(new_superversion, &mutex_, mutable_cf_options)); return s; } @@ -4452,7 +4300,7 @@ bool DBImpl::GetIntPropertyInternal(ColumnFamilyHandle* column_family, if (!need_out_of_mutex) { MutexLock l(&mutex_); - return cfd->internal_stats()->GetIntProperty(property_type, value); + return cfd->internal_stats()->GetIntProperty(property_type, value, this); } else { SuperVersion* sv = GetAndRefSuperVersion(cfd); @@ -4467,7 +4315,7 @@ bool DBImpl::GetIntPropertyInternal(ColumnFamilyHandle* column_family, SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) { // TODO(ljin): consider using GetReferencedSuperVersion() directly - if (LIKELY(options_.allow_thread_local)) { + if (LIKELY(db_options_.allow_thread_local)) { return cfd->GetThreadLocalSuperVersion(&mutex_); } else { MutexLock l(&mutex_); @@ -4478,7 +4326,7 @@ SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) { void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv) { bool unref_sv = true; - if (LIKELY(options_.allow_thread_local)) { + if (LIKELY(db_options_.allow_thread_local)) { unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv); } @@ -4523,13 +4371,6 @@ void 
DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, } } -inline void DBImpl::DelayLoggingAndReset() { - if (delayed_writes_ > 0) { - Log(options_.info_log, "delayed %d write...\n", delayed_writes_ ); - delayed_writes_ = 0; - } -} - #ifndef ROCKSDB_LITE Status DBImpl::GetUpdatesSince( SequenceNumber seq, unique_ptr* iter, @@ -4552,8 +4393,8 @@ Status DBImpl::GetUpdatesSince( if (!s.ok()) { return s; } - iter->reset(new TransactionLogIteratorImpl(options_.wal_dir, &options_, - read_options, storage_options_, + iter->reset(new TransactionLogIteratorImpl(db_options_.wal_dir, &db_options_, + read_options, env_options_, seq, std::move(wal_files), this)); return (*iter)->status(); } @@ -4564,7 +4405,7 @@ Status DBImpl::DeleteFile(std::string name) { WalFileType log_type; if (!ParseFileName(name, &number, &type, &log_type) || (type != kTableFile && type != kLogFile)) { - Log(options_.info_log, "DeleteFile %s failed.\n", name.c_str()); + Log(db_options_.info_log, "DeleteFile %s failed.\n", name.c_str()); return Status::InvalidArgument("Invalid file name"); } @@ -4572,13 +4413,13 @@ Status DBImpl::DeleteFile(std::string name) { if (type == kLogFile) { // Only allow deleting archived log files if (log_type != kArchivedLogFile) { - Log(options_.info_log, "DeleteFile %s failed - not archived log.\n", + Log(db_options_.info_log, "DeleteFile %s failed - not archived log.\n", name.c_str()); return Status::NotSupported("Delete only supported for archived logs"); } - status = env_->DeleteFile(options_.wal_dir + "/" + name.c_str()); + status = env_->DeleteFile(db_options_.wal_dir + "/" + name.c_str()); if (!status.ok()) { - Log(options_.info_log, "DeleteFile %s failed -- %s.\n", + Log(db_options_.info_log, "DeleteFile %s failed -- %s.\n", name.c_str(), status.ToString().c_str()); } return status; @@ -4593,7 +4434,7 @@ Status DBImpl::DeleteFile(std::string name) { MutexLock l(&mutex_); status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd); if 
(!status.ok()) { - Log(options_.info_log, "DeleteFile %s failed. File not found\n", + Log(db_options_.info_log, "DeleteFile %s failed. File not found\n", name.c_str()); return Status::InvalidArgument("File not found"); } @@ -4601,7 +4442,7 @@ Status DBImpl::DeleteFile(std::string name) { // If the file is being compacted no need to delete. if (metadata->being_compacted) { - Log(options_.info_log, + Log(db_options_.info_log, "DeleteFile %s Skipped. File about to be compacted\n", name.c_str()); return Status::OK(); } @@ -4611,7 +4452,7 @@ Status DBImpl::DeleteFile(std::string name) { // lost. Check that the level passed is the last level. for (int i = level + 1; i < cfd->NumberLevels(); i++) { if (cfd->current()->NumLevelFiles(i) != 0) { - Log(options_.info_log, + Log(db_options_.info_log, "DeleteFile %s FAILED. File not in last level\n", name.c_str()); return Status::InvalidArgument("File not in last level"); } @@ -4623,7 +4464,7 @@ Status DBImpl::DeleteFile(std::string name) { } FindObsoleteFiles(deletion_state, false); } // lock released here - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); // remove files outside the db-lock if (deletion_state.HaveSomethingToDelete()) { PurgeObsoleteFiles(deletion_state); @@ -4756,6 +4597,10 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { Status DB::Open(const DBOptions& db_options, const std::string& dbname, const std::vector& column_families, std::vector* handles, DB** dbptr) { + Status s = SanitizeDBOptionsByCFOptions(&db_options, column_families); + if (!s.ok()) { + return s; + } if (db_options.db_paths.size() > 1) { for (auto& cfd : column_families) { if (cfd.options.compaction_style != kCompactionStyleUniversal) { @@ -4778,16 +4623,12 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, for (auto cf : column_families) { max_write_buffer_size = std::max(max_write_buffer_size, cf.options.write_buffer_size); - if (cf.options.block_cache != nullptr && 
cf.options.no_block_cache) { - return Status::InvalidArgument( - "no_block_cache is true while block_cache is not nullptr"); - } } DBImpl* impl = new DBImpl(db_options, dbname); - Status s = impl->env_->CreateDirIfMissing(impl->options_.wal_dir); + s = impl->env_->CreateDirIfMissing(impl->db_options_.wal_dir); if (s.ok()) { - for (auto db_path : impl->options_.db_paths) { + for (auto db_path : impl->db_options_.db_paths) { s = impl->env_->CreateDirIfMissing(db_path.path); if (!s.ok()) { break; @@ -4812,9 +4653,9 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, uint64_t new_log_number = impl->versions_->NewFileNumber(); unique_ptr lfile; EnvOptions soptions(db_options); - s = impl->options_.env->NewWritableFile( - LogFileName(impl->options_.wal_dir, new_log_number), &lfile, - impl->options_.env->OptimizeForLogWrite(soptions)); + s = impl->db_options_.env->NewWritableFile( + LogFileName(impl->db_options_.wal_dir, new_log_number), &lfile, + impl->db_options_.env->OptimizeForLogWrite(soptions)); if (s.ok()) { lfile->SetPreallocationBlockSize(1.1 * max_write_buffer_size); impl->logfile_number_ = new_log_number; @@ -4848,6 +4689,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } if (s.ok()) { for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + // Use latest MutableCFOptions delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_); } impl->alive_log_files_.push_back( @@ -4860,8 +4702,8 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, if (s.ok()) { for (auto cfd : *impl->versions_->GetColumnFamilySet()) { - if (cfd->options()->compaction_style == kCompactionStyleUniversal || - cfd->options()->compaction_style == kCompactionStyleFIFO) { + if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { Version* current = cfd->current(); for (int i = 1; i < current->NumberLevels(); ++i) { int num_files = 
current->NumLevelFiles(i); @@ -4873,7 +4715,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } } } - if (cfd->options()->merge_operator != nullptr && + if (cfd->ioptions()->merge_operator != nullptr && !cfd->mem()->IsMergeOperatorSupported()) { s = Status::InvalidArgument( "The memtable of column family %s does not support merge operator " @@ -4911,9 +4753,7 @@ Snapshot::~Snapshot() { Status DestroyDB(const std::string& dbname, const Options& options) { const InternalKeyComparator comparator(options.comparator); - const InternalFilterPolicy filter_policy(options.filter_policy); - const Options& soptions(SanitizeOptions( - dbname, &comparator, &filter_policy, options)); + const Options& soptions(SanitizeOptions(dbname, &comparator, options)); Env* env = soptions.env; std::vector filenames; std::vector archiveFiles; @@ -4939,8 +4779,9 @@ Status DestroyDB(const std::string& dbname, const Options& options) { if (result.ok()) { uint64_t number; FileType type; + InfoLogPrefix info_log_prefix(!options.db_log_dir.empty(), dbname); for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type) && + if (ParseFileName(filenames[i], &number, info_log_prefix.prefix, &type) && type != kDBLockFile) { // Lock file will be deleted at end Status del; if (type == kMetaDatabase) { diff --git a/db/db_impl.h b/db/db_impl.h index 988e2d8d8ff..0bc2018b42c 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -30,7 +30,11 @@ #include "util/autovector.h" #include "util/stop_watch.h" #include "util/thread_local.h" +#include "util/scoped_arena_iterator.h" #include "db/internal_stats.h" +#include "db/write_controller.h" +#include "db/flush_scheduler.h" +#include "db/write_thread.h" namespace rocksdb { @@ -108,6 +112,10 @@ class DBImpl : public DB { bool reduce_level = false, int target_level = -1, uint32_t target_path_id = 0); + using DB::SetOptions; + bool SetOptions(ColumnFamilyHandle* column_family, + const std::unordered_map& 
options_map); + using DB::NumberLevels; virtual int NumberLevels(ColumnFamilyHandle* column_family); using DB::MaxMemCompactionLevel; @@ -127,6 +135,7 @@ class DBImpl : public DB { #ifndef ROCKSDB_LITE virtual Status DisableFileDeletions(); virtual Status EnableFileDeletions(bool force); + virtual int IsFileDeletionsEnabled() const; // All the returned filenames start with "/" virtual Status GetLiveFiles(std::vector&, uint64_t* manifest_file_size, @@ -172,8 +181,8 @@ class DBImpl : public DB { // Return an internal iterator over the current state of the database. // The keys of this iterator are internal keys (see format.h). // The returned iterator should be deleted when no longer needed. - Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family = - nullptr); + Iterator* TEST_NewInternalIterator( + Arena* arena, ColumnFamilyHandle* column_family = nullptr); // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. @@ -201,6 +210,17 @@ class DBImpl : public DB { SequenceNumber* sequence); Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence); + + void TEST_LockMutex(); + + void TEST_UnlockMutex(); + + // REQUIRES: mutex locked + void* TEST_BeginWrite(); + + // REQUIRES: mutex locked + // pass the pointer that you got from TEST_BeginWrite() + void TEST_EndWrite(void* w); #endif // NDEBUG // Structure to store information for candidate files to delete. @@ -274,7 +294,7 @@ class DBImpl : public DB { // Returns the list of live files in 'live' and the list // of all files in the filesystem in 'candidate_files'. 
// If force == false and the last call was less than - // options_.delete_obsolete_files_period_micros microseconds ago, + // db_options_.delete_obsolete_files_period_micros microseconds ago, // it will not fill up the deletion_state void FindObsoleteFiles(DeletionState& deletion_state, bool force, @@ -292,23 +312,21 @@ class DBImpl : public DB { Env* const env_; const std::string dbname_; unique_ptr versions_; - const DBOptions options_; + const DBOptions db_options_; Statistics* stats_; Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd, - SuperVersion* super_version, - Arena* arena = nullptr); + SuperVersion* super_version, Arena* arena); private: friend class DB; friend class InternalStats; #ifndef ROCKSDB_LITE - friend class TailingIterator; friend class ForwardIterator; #endif friend struct SuperVersion; struct CompactionState; - struct Writer; + struct WriteContext; Status NewDB(); @@ -332,8 +350,9 @@ class DBImpl : public DB { DeletionState& deletion_state, LogBuffer* log_buffer); - Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, - bool read_only); + // REQUIRES: log_numbers are sorted in ascending order + Status RecoverLogFiles(const std::vector& log_numbers, + SequenceNumber* max_sequence, bool read_only); // The following two methods are used to flush a memtable to // storage. The first one is used atdatabase RecoveryTime (when the @@ -346,43 +365,13 @@ class DBImpl : public DB { VersionEdit* edit, uint64_t* filenumber, LogBuffer* log_buffer); - uint64_t SlowdownAmount(int n, double bottom, double top); + void DelayWrite(uint64_t expiration_time); - // Before applying write operation (such as DBImpl::Write, DBImpl::Flush) - // thread should grab the mutex_ and be the first on writers queue. - // BeginWrite is used for it. - // Be aware! Writer's job can be done by other thread (see DBImpl::Write - // for examples), so check it via w.done before applying changes. 
- // - // Writer* w: writer to be placed in the queue - // uint64_t expiration_time: maximum time to be in the queue - // See also: EndWrite - Status BeginWrite(Writer* w, uint64_t expiration_time); - - // After doing write job, we need to remove already used writers from - // writers_ queue and notify head of the queue about it. - // EndWrite is used for this. - // - // Writer* w: Writer, that was added by BeginWrite function - // Writer* last_writer: Since we can join a few Writers (as DBImpl::Write - // does) - // we should pass last_writer as a parameter to - // EndWrite - // (if you don't touch other writers, just pass w) - // Status status: Status of write operation - // See also: BeginWrite - void EndWrite(Writer* w, Writer* last_writer, Status status); - - Status MakeRoomForWrite(ColumnFamilyData* cfd, - WriteContext* context, - uint64_t expiration_time); + Status ScheduleFlushes(WriteContext* context); Status SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, WriteContext* context); - void BuildBatchGroup(Writer** last_writer, - autovector* write_batch_group); - // Force current memtable contents to be flushed. Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options); @@ -527,10 +516,13 @@ class DBImpl : public DB { std::unique_ptr db_directory_; - // Queue of writers. - std::deque writers_; + WriteThread write_thread_; + WriteBatch tmp_batch_; + WriteController write_controller_; + FlushScheduler flush_scheduler_; + SnapshotList snapshots_; // cache for ReadFirstRecord() calls @@ -599,14 +591,10 @@ class DBImpl : public DB { bool flush_on_destroy_; // Used when disableWAL is true. 
static const int KEEP_LOG_FILE_NUM = 1000; - static const uint64_t kNoTimeOut = std::numeric_limits::max(); std::string db_absolute_path_; - // count of the number of contiguous delaying writes - int delayed_writes_; - // The options to access storage files - const EnvOptions storage_options_; + const EnvOptions env_options_; // A value of true temporarily disables scheduling of background work bool bg_work_gate_closed_; @@ -621,9 +609,6 @@ class DBImpl : public DB { DBImpl(const DBImpl&); void operator=(const DBImpl&); - // dump the delayed_writes_ to the log file and reset counter. - void DelayLoggingAndReset(); - // Return the earliest snapshot where seqno is visible. // Store the snapshot right before that, if any, in prev_snapshot inline SequenceNumber findEarliestVisibleSnapshot( @@ -669,7 +654,6 @@ class DBImpl : public DB { // it is not equal to src.info_log. extern Options SanitizeOptions(const std::string& db, const InternalKeyComparator* icmp, - const InternalFilterPolicy* ipolicy, const Options& src); extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src); diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index 8df66f6c6d9..6c073d4d5e4 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -20,7 +20,8 @@ uint64_t DBImpl::TEST_GetLevel0TotalSize() { return default_cf_handle_->cfd()->current()->NumLevelBytes(0); } -Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) { +Iterator* DBImpl::TEST_NewInternalIterator(Arena* arena, + ColumnFamilyHandle* column_family) { ColumnFamilyData* cfd; if (column_family == nullptr) { cfd = default_cf_handle_->cfd(); @@ -33,7 +34,7 @@ Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) { SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); mutex_.Unlock(); ReadOptions roptions; - return NewInternalIterator(roptions, cfd, super_version); + return NewInternalIterator(roptions, cfd, super_version, arena); } int64_t 
DBImpl::TEST_MaxNextLevelOverlappingBytes( @@ -129,5 +130,27 @@ Status DBImpl::TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence) { return ReadFirstLine(fname, sequence); } + +void DBImpl::TEST_LockMutex() { + mutex_.Lock(); +} + +void DBImpl::TEST_UnlockMutex() { + mutex_.Unlock(); +} + +void* DBImpl::TEST_BeginWrite() { + auto w = new WriteThread::Writer(&mutex_); + Status s = write_thread_.EnterWriteThread(w, 0); + assert(s.ok() && !w->done); // No timeout and nobody should do our job + return reinterpret_cast(w); +} + +void DBImpl::TEST_EndWrite(void* w) { + auto writer = reinterpret_cast(w); + write_thread_.ExitWriteThread(writer, writer, Status::OK()); + delete writer; +} + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index eae8379a9ee..8cea58736fd 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -16,7 +16,6 @@ #include #include #include -#include #include "db/db_iter.h" #include "db/dbformat.h" #include "db/filename.h" @@ -42,17 +41,17 @@ namespace rocksdb { -DBImplReadOnly::DBImplReadOnly(const DBOptions& options, +DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options, const std::string& dbname) - : DBImpl(options, dbname) { - Log(options_.info_log, "Opening the db in read only mode"); + : DBImpl(db_options, dbname) { + Log(db_options_.info_log, "Opening the db in read only mode"); } DBImplReadOnly::~DBImplReadOnly() { } // Implementations of the DB interface -Status DBImplReadOnly::Get(const ReadOptions& options, +Status DBImplReadOnly::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) { Status s; @@ -62,33 +61,34 @@ Status DBImplReadOnly::Get(const ReadOptions& options, SuperVersion* super_version = cfd->GetSuperVersion(); MergeContext merge_context; LookupKey lkey(key, snapshot); - if (super_version->mem->Get(lkey, value, &s, merge_context, - *cfd->options())) { + if 
(super_version->mem->Get(lkey, value, &s, &merge_context)) { } else { - super_version->current->Get(options, lkey, value, &s, &merge_context); + super_version->current->Get(read_options, lkey, value, &s, &merge_context); } return s; } -Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options, +Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); SequenceNumber latest_snapshot = versions_->LastSequence(); auto db_iter = NewArenaWrappedDbIterator( - env_, *cfd->options(), cfd->user_comparator(), - (options.snapshot != nullptr - ? reinterpret_cast(options.snapshot)->number_ - : latest_snapshot)); - auto internal_iter = - NewInternalIterator(options, cfd, super_version, db_iter->GetArena()); + env_, *cfd->ioptions(), cfd->user_comparator(), + (read_options.snapshot != nullptr + ? reinterpret_cast( + read_options.snapshot)->number_ + : latest_snapshot), + cfd->options()->max_sequential_skip_in_iterations); + auto internal_iter = NewInternalIterator( + read_options, cfd, super_version, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); return db_iter; } Status DBImplReadOnly::NewIterators( - const ReadOptions& options, + const ReadOptions& read_options, const std::vector& column_families, std::vector* iterators) { if (iterators == nullptr) { @@ -101,12 +101,14 @@ Status DBImplReadOnly::NewIterators( for (auto cfh : column_families) { auto cfd = reinterpret_cast(cfh)->cfd(); auto db_iter = NewArenaWrappedDbIterator( - env_, *cfd->options(), cfd->user_comparator(), - options.snapshot != nullptr - ? reinterpret_cast(options.snapshot)->number_ - : latest_snapshot); + env_, *cfd->ioptions(), cfd->user_comparator(), + (read_options.snapshot != nullptr + ? 
reinterpret_cast( + read_options.snapshot)->number_ + : latest_snapshot), + cfd->options()->max_sequential_skip_in_iterations); auto internal_iter = NewInternalIterator( - options, cfd, cfd->GetSuperVersion()->Ref(), db_iter->GetArena()); + read_options, cfd, cfd->GetSuperVersion()->Ref(), db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); iterators->push_back(db_iter); } diff --git a/db/db_impl_readonly.h b/db/db_impl_readonly.h index 47c8ab33db7..1dfdf422ef7 100644 --- a/db/db_impl_readonly.h +++ b/db/db_impl_readonly.h @@ -74,6 +74,8 @@ class DBImplReadOnly : public DBImpl { uint32_t target_path_id = 0) override { return Status::NotSupported("Not supported operation in read only mode."); } + +#ifndef ROCKSDB_LITE virtual Status DisableFileDeletions() override { return Status::NotSupported("Not supported operation in read only mode."); } @@ -85,6 +87,8 @@ class DBImplReadOnly : public DBImpl { bool flush_memtable = true) override { return Status::NotSupported("Not supported operation in read only mode."); } +#endif // ROCKSDB_LITE + using DBImpl::Flush; virtual Status Flush(const FlushOptions& options, ColumnFamilyHandle* column_family) override { diff --git a/db/db_iter.cc b/db/db_iter.cc index 370ffd8cb4e..db86ebc2c49 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -58,22 +58,25 @@ class DBIter: public Iterator { kReverse }; - DBIter(Env* env, const Options& options, const Comparator* cmp, - Iterator* iter, SequenceNumber s, bool arena_mode) + DBIter(Env* env, const ImmutableCFOptions& ioptions, + const Comparator* cmp, Iterator* iter, SequenceNumber s, + bool arena_mode, uint64_t max_sequential_skip_in_iterations, + const Slice* iterate_upper_bound = nullptr) : arena_mode_(arena_mode), env_(env), - logger_(options.info_log.get()), + logger_(ioptions.info_log), user_comparator_(cmp), - user_merge_operator_(options.merge_operator.get()), + user_merge_operator_(ioptions.merge_operator), iter_(iter), sequence_(s), direction_(kForward), 
valid_(false), current_entry_is_merged_(false), - statistics_(options.statistics.get()) { + statistics_(ioptions.statistics), + iterate_upper_bound_(iterate_upper_bound) { RecordTick(statistics_, NO_ITERATORS); - has_prefix_extractor_ = (options.prefix_extractor.get() != nullptr); - max_skip_ = options.max_sequential_skip_in_iterations; + prefix_extractor_ = ioptions.prefix_extractor; + max_skip_ = max_sequential_skip_in_iterations; } virtual ~DBIter() { RecordTick(statistics_, NO_ITERATORS, -1); @@ -132,7 +135,7 @@ class DBIter: public Iterator { } } - bool has_prefix_extractor_; + const SliceTransform* prefix_extractor_; bool arena_mode_; Env* const env_; Logger* logger_; @@ -149,6 +152,7 @@ class DBIter: public Iterator { bool current_entry_is_merged_; Statistics* statistics_; uint64_t max_skip_; + const Slice* iterate_upper_bound_; // No copying allowed DBIter(const DBIter&); @@ -194,9 +198,8 @@ void DBIter::Next() { // NOTE: In between, saved_key_ can point to a user key that has // a delete marker inline void DBIter::FindNextUserEntry(bool skipping) { - PERF_TIMER_AUTO(find_next_user_entry_time); + PERF_TIMER_GUARD(find_next_user_entry_time); FindNextUserEntryInternal(skipping); - PERF_TIMER_STOP(find_next_user_entry_time); } // Actual implementation of DBIter::FindNextUserEntry() @@ -208,36 +211,44 @@ void DBIter::FindNextUserEntryInternal(bool skipping) { uint64_t num_skipped = 0; do { ParsedInternalKey ikey; - if (ParseKey(&ikey) && ikey.sequence <= sequence_) { - if (skipping && - user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) { - num_skipped++; // skip this entry - PERF_COUNTER_ADD(internal_key_skipped_count, 1); - } else { - skipping = false; - switch (ikey.type) { - case kTypeDeletion: - // Arrange to skip all upcoming entries for this key since - // they are hidden by this deletion. 
- saved_key_.SetKey(ikey.user_key); - skipping = true; - num_skipped = 0; - PERF_COUNTER_ADD(internal_delete_skipped_count, 1); - break; - case kTypeValue: - valid_ = true; - saved_key_.SetKey(ikey.user_key); - return; - case kTypeMerge: - // By now, we are sure the current ikey is going to yield a value - saved_key_.SetKey(ikey.user_key); - current_entry_is_merged_ = true; - valid_ = true; - MergeValuesNewToOld(); // Go to a different state machine - return; - default: - assert(false); - break; + + if (ParseKey(&ikey)) { + if (iterate_upper_bound_ != nullptr && + ikey.user_key.compare(*iterate_upper_bound_) >= 0) { + break; + } + + if (ikey.sequence <= sequence_) { + if (skipping && + user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) { + num_skipped++; // skip this entry + PERF_COUNTER_ADD(internal_key_skipped_count, 1); + } else { + skipping = false; + switch (ikey.type) { + case kTypeDeletion: + // Arrange to skip all upcoming entries for this key since + // they are hidden by this deletion. 
+ saved_key_.SetKey(ikey.user_key); + skipping = true; + num_skipped = 0; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + break; + case kTypeValue: + valid_ = true; + saved_key_.SetKey(ikey.user_key); + return; + case kTypeMerge: + // By now, we are sure the current ikey is going to yield a value + saved_key_.SetKey(ikey.user_key); + current_entry_is_merged_ = true; + valid_ = true; + MergeValuesNewToOld(); // Go to a different state machine + return; + default: + assert(false); + break; + } } } } @@ -399,6 +410,7 @@ bool DBIter::FindValueForCurrentKey() { case kTypeDeletion: operands.clear(); last_not_merge_type = kTypeDeletion; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); break; case kTypeMerge: assert(user_merge_operator_ != nullptr); @@ -408,6 +420,7 @@ bool DBIter::FindValueForCurrentKey() { assert(false); } + PERF_COUNTER_ADD(internal_key_skipped_count, 1); assert(user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0); iter_->Prev(); ++num_skipped; @@ -554,12 +567,29 @@ void DBIter::FindParseableKey(ParsedInternalKey* ikey, Direction direction) { void DBIter::Seek(const Slice& target) { StopWatch sw(env_, statistics_, DB_SEEK); + // total ordering is not guaranteed if prefix_extractor is set + // hence prefix based seeks will not give correct results + if (iterate_upper_bound_ != nullptr && prefix_extractor_ != nullptr) { + if (!prefix_extractor_->InDomain(*iterate_upper_bound_) || + !prefix_extractor_->InDomain(target) || + prefix_extractor_->Transform(*iterate_upper_bound_).compare( + prefix_extractor_->Transform(target)) != 0) { + status_ = Status::InvalidArgument("read_options.iterate_*_bound " + " and seek target need to have the same prefix."); + valid_ = false; + return; + } + } + saved_key_.Clear(); // now savved_key is used to store internal key. 
saved_key_.SetInternalKey(target, sequence_); - PERF_TIMER_AUTO(seek_internal_seek_time); - iter_->Seek(saved_key_.GetKey()); - PERF_TIMER_STOP(seek_internal_seek_time); + + { + PERF_TIMER_GUARD(seek_internal_seek_time); + iter_->Seek(saved_key_.GetKey()); + } + if (iter_->Valid()) { direction_ = kForward; ClearSavedValue(); @@ -572,14 +602,17 @@ void DBIter::Seek(const Slice& target) { void DBIter::SeekToFirst() { // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek wiil be used. - if (has_prefix_extractor_) { + if (prefix_extractor_ != nullptr) { max_skip_ = std::numeric_limits::max(); } direction_ = kForward; ClearSavedValue(); - PERF_TIMER_AUTO(seek_internal_seek_time); - iter_->SeekToFirst(); - PERF_TIMER_STOP(seek_internal_seek_time); + + { + PERF_TIMER_GUARD(seek_internal_seek_time); + iter_->SeekToFirst(); + } + if (iter_->Valid()) { FindNextUserEntry(false /* not skipping */); } else { @@ -590,24 +623,29 @@ void DBIter::SeekToFirst() { void DBIter::SeekToLast() { // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek wiil be used. 
- if (has_prefix_extractor_) { + if (prefix_extractor_ != nullptr) { max_skip_ = std::numeric_limits::max(); } direction_ = kReverse; ClearSavedValue(); - PERF_TIMER_AUTO(seek_internal_seek_time); - iter_->SeekToLast(); - PERF_TIMER_STOP(seek_internal_seek_time); + + { + PERF_TIMER_GUARD(seek_internal_seek_time); + iter_->SeekToLast(); + } PrevInternal(); } -Iterator* NewDBIterator(Env* env, const Options& options, +Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& ioptions, const Comparator* user_key_comparator, Iterator* internal_iter, - const SequenceNumber& sequence) { - return new DBIter(env, options, user_key_comparator, internal_iter, sequence, - false); + const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, + const Slice* iterate_upper_bound) { + return new DBIter(env, ioptions, user_key_comparator, internal_iter, sequence, + false, max_sequential_skip_in_iterations, + iterate_upper_bound); } ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); } @@ -635,14 +673,20 @@ void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1, } ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const Options& options, const Comparator* user_key_comparator, - const SequenceNumber& sequence) { + Env* env, const ImmutableCFOptions& ioptions, + const Comparator* user_key_comparator, + const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, + const Slice* iterate_upper_bound) { ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); Arena* arena = iter->GetArena(); auto mem = arena->AllocateAligned(sizeof(DBIter)); - DBIter* db_iter = new (mem) - DBIter(env, options, user_key_comparator, nullptr, sequence, true); + DBIter* db_iter = new (mem) DBIter(env, ioptions, user_key_comparator, + nullptr, sequence, true, max_sequential_skip_in_iterations, + iterate_upper_bound); + iter->SetDBIter(db_iter); + return iter; } diff --git a/db/db_iter.h b/db/db_iter.h index cb9840324ff..c676d6cda13 
100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -24,10 +24,12 @@ class DBIter; // into appropriate user keys. extern Iterator* NewDBIterator( Env* env, - const Options& options, + const ImmutableCFOptions& options, const Comparator *user_key_comparator, Iterator* internal_iter, - const SequenceNumber& sequence); + const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, + const Slice* iterate_upper_bound = nullptr); // A wrapper iterator which wraps DB Iterator and the arena, with which the DB // iterator is supposed be allocated. This class is used as an entry point of @@ -67,7 +69,9 @@ class ArenaWrappedDBIter : public Iterator { // Generate the arena wrapped iterator class. extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const Options& options, const Comparator* user_key_comparator, - const SequenceNumber& sequence); + Env* env, const ImmutableCFOptions& options, + const Comparator* user_key_comparator, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, + const Slice* iterate_upper_bound = nullptr); } // namespace rocksdb diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index 4ce79da1bac..2aa30e327cb 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -158,7 +158,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -191,7 +193,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, + options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); 
ASSERT_TRUE(db_iter->Valid()); @@ -232,7 +236,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -262,7 +268,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -288,7 +296,9 @@ TEST(DBIteratorTest, DBIteratorEmpty) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 0, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -298,7 +308,9 @@ TEST(DBIteratorTest, DBIteratorEmpty) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 0, + options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(!db_iter->Valid()); } @@ -318,7 +330,9 @@ TEST(DBIteratorTest, DBIteratorUseSkipCountSkips) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); 
ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -357,7 +371,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { options.statistics = rocksdb::CreateDBStatistics(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, i + 2)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, i + 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -391,7 +407,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, i + 2)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, i + 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -418,7 +436,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, 202)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 202, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -449,7 +469,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->AddPut("c", "200"); internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, i)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, i, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); @@ -464,7 +486,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->AddPut("c", "200"); internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 200)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 200, + options.max_sequential_skip_in_iterations)); 
db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -497,7 +521,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, i + 2)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, i + 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -530,7 +556,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, i + 2)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, i + 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -570,7 +598,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 1, + options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -590,7 +620,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 0, + options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -609,7 +641,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations)); 
db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -628,7 +662,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 4, + options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -654,7 +690,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 0, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -675,7 +713,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 1, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -696,7 +736,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -717,7 +759,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 3)); + NewDBIterator(env_, ImmutableCFOptions(options), + 
BytewiseComparator(), internal_iter, 3, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -738,7 +782,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 4, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -759,7 +805,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 5, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -780,7 +828,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 6, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -803,7 +853,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 0, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -824,7 +876,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, 
BytewiseComparator(), internal_iter, 1)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 1, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -845,7 +899,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -866,7 +922,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 3)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 3, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -883,7 +941,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 4, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -904,7 +964,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 5, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -925,7 +987,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr 
db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 6, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -960,7 +1024,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 0, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -993,7 +1059,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1032,7 +1100,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 4, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1071,7 +1141,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 5, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1115,7 +1187,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, 
BytewiseComparator(), internal_iter, 6)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 6, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1160,7 +1234,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 7)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 7, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1199,7 +1275,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 9)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 9, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1244,7 +1322,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, 13)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 13, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1290,7 +1370,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, 14)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 14, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1316,7 +1398,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, + 
options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); diff --git a/db/db_test.cc b/db/db_test.cc index ee84ba975b2..7ad249d7fc9 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -30,6 +31,7 @@ #include "rocksdb/table.h" #include "rocksdb/options.h" #include "rocksdb/table_properties.h" +#include "rocksdb/utilities/write_batch_with_index.h" #include "table/block_based_table_factory.h" #include "table/plain_table_factory.h" #include "util/hash.h" @@ -40,6 +42,7 @@ #include "util/rate_limiter.h" #include "util/statistics.h" #include "util/testharness.h" +#include "util/scoped_arena_iterator.h" #include "util/sync_point.h" #include "util/testutil.h" @@ -101,6 +104,11 @@ class AtomicCounter { count_ = 0; } }; + +struct OptionsOverride { + std::shared_ptr filter_policy = nullptr; +}; + } // namespace anon static std::string Key(int i) { @@ -115,6 +123,9 @@ class SpecialEnv : public EnvWrapper { // sstable Sync() calls are blocked while this pointer is non-nullptr. port::AtomicPointer delay_sstable_sync_; + // Drop writes on the floor while this pointer is non-nullptr. + port::AtomicPointer drop_writes_; + // Simulate no-space errors while this pointer is non-nullptr. 
port::AtomicPointer no_space_; @@ -140,8 +151,11 @@ class SpecialEnv : public EnvWrapper { std::atomic bytes_written_; + std::atomic sync_counter_; + explicit SpecialEnv(Env* base) : EnvWrapper(base) { delay_sstable_sync_.Release_Store(nullptr); + drop_writes_.Release_Store(nullptr); no_space_.Release_Store(nullptr); non_writable_.Release_Store(nullptr); count_random_reads_ = false; @@ -150,6 +164,7 @@ class SpecialEnv : public EnvWrapper { manifest_write_error_.Release_Store(nullptr); log_write_error_.Release_Store(nullptr); bytes_written_ = 0; + sync_counter_ = 0; } Status NewWritableFile(const std::string& f, unique_ptr* r, @@ -165,9 +180,11 @@ class SpecialEnv : public EnvWrapper { base_(std::move(base)) { } Status Append(const Slice& data) { - if (env_->no_space_.Acquire_Load() != nullptr) { + if (env_->drop_writes_.Acquire_Load() != nullptr) { // Drop writes on the floor return Status::OK(); + } else if (env_->no_space_.Acquire_Load() != nullptr) { + return Status::IOError("No space left on device"); } else { env_->bytes_written_ += data.size(); return base_->Append(data); @@ -176,6 +193,7 @@ class SpecialEnv : public EnvWrapper { Status Close() { return base_->Close(); } Status Flush() { return base_->Flush(); } Status Sync() { + ++env_->sync_counter_; while (env_->delay_sstable_sync_.Acquire_Load() != nullptr) { env_->SleepForMicroseconds(100000); } @@ -202,6 +220,7 @@ class SpecialEnv : public EnvWrapper { Status Close() { return base_->Close(); } Status Flush() { return base_->Flush(); } Status Sync() { + ++env_->sync_counter_; if (env_->manifest_sync_error_.Acquire_Load() != nullptr) { return Status::IOError("simulated sync error"); } else { @@ -225,7 +244,10 @@ class SpecialEnv : public EnvWrapper { } Status Close() { return base_->Close(); } Status Flush() { return base_->Flush(); } - Status Sync() { return base_->Sync(); } + Status Sync() { + ++env_->sync_counter_; + return base_->Sync(); + } }; if (non_writable_.Acquire_Load() != nullptr) { @@ -303,9 
+325,6 @@ class SpecialEnv : public EnvWrapper { }; class DBTest { - private: - const FilterPolicy* filter_policy_; - protected: // Sequence of option configurations to try enum OptionConfig { @@ -319,21 +338,22 @@ class DBTest { kHashCuckoo = 7, kMergePut = 8, kFilter = 9, - kUncompressed = 10, - kNumLevel_3 = 11, - kDBLogDir = 12, - kWalDir = 13, - kManifestFileSize = 14, - kCompactOnFlush = 15, - kPerfOptions = 16, - kDeletesFilterFirst = 17, - kHashSkipList = 18, - kUniversalCompaction = 19, - kCompressedBlockCache = 20, - kInfiniteMaxOpenFiles = 21, - kxxHashChecksum = 22, - kFIFOCompaction = 23, - kEnd = 24 + kFullFilter = 10, + kUncompressed = 11, + kNumLevel_3 = 12, + kDBLogDir = 13, + kWalDirAndMmapReads = 14, + kManifestFileSize = 15, + kCompactOnFlush = 16, + kPerfOptions = 17, + kDeletesFilterFirst = 18, + kHashSkipList = 19, + kUniversalCompaction = 20, + kCompressedBlockCache = 21, + kInfiniteMaxOpenFiles = 22, + kxxHashChecksum = 23, + kFIFOCompaction = 24, + kEnd = 25 }; int option_config_; @@ -357,11 +377,12 @@ class DBTest { kSkipNoSeekToLast = 32, kSkipHashCuckoo = 64, kSkipFIFOCompaction = 128, + kSkipMmapReads = 256, }; + DBTest() : option_config_(kDefault), env_(new SpecialEnv(Env::Default())) { - filter_policy_ = NewBloomFilterPolicy(10); dbname_ = test::TmpDir() + "/db_test"; ASSERT_OK(DestroyDB(dbname_, Options())); db_ = nullptr; @@ -377,7 +398,6 @@ class DBTest { options.db_paths.emplace_back(dbname_ + "_4", 0); ASSERT_OK(DestroyDB(dbname_, options)); delete env_; - delete filter_policy_; } // Switch to a fresh database with the next option configuration to @@ -417,6 +437,10 @@ class DBTest { option_config_ == kFIFOCompaction) { continue; } + if ((skip_mask & kSkipMmapReads) && + option_config_ == kWalDirAndMmapReads) { + continue; + } break; } @@ -444,15 +468,44 @@ class DBTest { } } + // Switch between different filter policy + // Jump from kDefault to kFilter to kFullFilter + bool ChangeFilterOptions(Options* prev_options = nullptr) { 
+ if (option_config_ == kDefault) { + option_config_ = kFilter; + if (prev_options == nullptr) { + prev_options = &last_options_; + } + Destroy(prev_options); + TryReopen(); + return true; + } else if (option_config_ == kFilter) { + option_config_ = kFullFilter; + if (prev_options == nullptr) { + prev_options = &last_options_; + } + Destroy(prev_options); + TryReopen(); + return true; + } else { + return false; + } + } + // Return the current option configuration. - Options CurrentOptions() { + Options CurrentOptions( + const anon::OptionsOverride& options_override = anon::OptionsOverride()) { Options options; - return CurrentOptions(options); + return CurrentOptions(options, options_override); } - Options CurrentOptions(const Options& defaultOptions) { + Options CurrentOptions( + const Options& defaultOptions, + const anon::OptionsOverride& options_override = anon::OptionsOverride()) { // this redudant copy is to minimize code change w/o having lint error. Options options = defaultOptions; + BlockBasedTableOptions table_options; + bool set_block_based_table_factory = true; switch (option_config_) { case kHashSkipList: options.prefix_extractor.reset(NewFixedPrefixTransform(1)); @@ -464,18 +517,23 @@ class DBTest { options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.allow_mmap_reads = true; options.max_sequential_skip_in_iterations = 999999; + set_block_based_table_factory = false; break; case kPlainTableAllBytesPrefix: options.table_factory.reset(new PlainTableFactory()); options.prefix_extractor.reset(NewNoopTransform()); options.allow_mmap_reads = true; options.max_sequential_skip_in_iterations = 999999; + set_block_based_table_factory = false; break; case kMergePut: options.merge_operator = MergeOperators::CreatePutOperator(); break; case kFilter: - options.filter_policy = filter_policy_; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + break; + case kFullFilter: + table_options.filter_policy.reset(NewBloomFilterPolicy(10, 
false)); break; case kUncompressed: options.compression = kNoCompression; @@ -486,8 +544,11 @@ class DBTest { case kDBLogDir: options.db_log_dir = test::TmpDir(); break; - case kWalDir: + case kWalDirAndMmapReads: options.wal_dir = test::TmpDir() + "/wal"; + // mmap reads should be orthogonal to WalDir setting, so we piggyback to + // this option config to test mmap reads as well + options.allow_mmap_reads = true; break; case kManifestFileSize: options.max_manifest_file_size = 50; // 50 bytes @@ -520,15 +581,13 @@ class DBTest { break; case kCompressedBlockCache: options.allow_mmap_writes = true; - options.block_cache_compressed = NewLRUCache(8*1024*1024); + table_options.block_cache_compressed = NewLRUCache(8*1024*1024); break; case kInfiniteMaxOpenFiles: options.max_open_files = -1; break; case kxxHashChecksum: { - BlockBasedTableOptions table_options; table_options.checksum = kxxHash; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); break; } case kFIFOCompaction: { @@ -536,22 +595,25 @@ class DBTest { break; } case kBlockBasedTableWithPrefixHashIndex: { - BlockBasedTableOptions table_options; table_options.index_type = BlockBasedTableOptions::kHashSearch; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.prefix_extractor.reset(NewFixedPrefixTransform(1)); break; } case kBlockBasedTableWithWholeKeyHashIndex: { - BlockBasedTableOptions table_options; table_options.index_type = BlockBasedTableOptions::kHashSearch; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.prefix_extractor.reset(NewNoopTransform()); break; } default: break; } + + if (options_override.filter_policy) { + table_options.filter_policy = options_override.filter_policy; + } + if (set_block_based_table_factory) { + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + } return options; } @@ -651,7 +713,6 @@ class DBTest { opts.create_if_missing = true; } last_options_ = opts; - return 
DB::Open(opts, dbname_, &db_); } @@ -746,11 +807,12 @@ class DBTest { } std::string AllEntriesFor(const Slice& user_key, int cf = 0) { - Iterator* iter; + ScopedArenaIterator iter; + Arena arena; if (cf == 0) { - iter = dbfull()->TEST_NewInternalIterator(); + iter.set(dbfull()->TEST_NewInternalIterator(&arena)); } else { - iter = dbfull()->TEST_NewInternalIterator(handles_[cf]); + iter.set(dbfull()->TEST_NewInternalIterator(&arena, handles_[cf])); } InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); iter->Seek(target.Encode()); @@ -795,7 +857,6 @@ class DBTest { } result += "]"; } - delete iter; return result; } @@ -1033,11 +1094,12 @@ class DBTest { // Utility method to test InplaceUpdate void validateNumberOfEntries(int numValues, int cf = 0) { - Iterator* iter; + ScopedArenaIterator iter; + Arena arena; if (cf != 0) { - iter = dbfull()->TEST_NewInternalIterator(handles_[cf]); + iter.set(dbfull()->TEST_NewInternalIterator(&arena, handles_[cf])); } else { - iter = dbfull()->TEST_NewInternalIterator(); + iter.set(dbfull()->TEST_NewInternalIterator(&arena)); } iter->SeekToFirst(); ASSERT_EQ(iter->status().ok(), true); @@ -1051,7 +1113,6 @@ class DBTest { ASSERT_EQ(ikey.sequence, (unsigned)seq--); iter->Next(); } - delete iter; ASSERT_EQ(0, seq); } @@ -1106,6 +1167,17 @@ void VerifyTableProperties(DB* db, uint64_t expected_entries_size) { ASSERT_EQ(props.size(), unique_entries.size()); ASSERT_EQ(expected_entries_size, sum); } + +uint64_t GetNumberOfSstFilesForColumnFamily(DB* db, + std::string column_family_name) { + std::vector metadata; + db->GetLiveFilesMetaData(&metadata); + uint64_t result = 0; + for (auto& fileMetadata : metadata) { + result += (fileMetadata.column_family_name == column_family_name); + } + return result; +} } // namespace TEST(DBTest, Empty) { @@ -1140,6 +1212,31 @@ TEST(DBTest, Empty) { ASSERT_EQ("v1", Get(1, "foo")); env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls + + ASSERT_OK(db_->DisableFileDeletions()); 
+ ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("1", num); + + ASSERT_OK(db_->DisableFileDeletions()); + ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("2", num); + + ASSERT_OK(db_->DisableFileDeletions()); + ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("3", num); + + ASSERT_OK(db_->EnableFileDeletions(false)); + ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("2", num); + + ASSERT_OK(db_->EnableFileDeletions()); + ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("0", num); } while (ChangeOptions()); } @@ -1161,18 +1258,27 @@ TEST(DBTest, ReadOnlyDB) { } ASSERT_EQ(count, 2); delete iter; + Close(); + + // Reopen and flush memtable. + Reopen(); + Flush(); + Close(); + // Now check keys in read only mode. + ASSERT_OK(ReadOnlyReopen(&options)); + ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v2", Get("bar")); } // Make sure that when options.block_cache is set, after a new table is // created its index/filter blocks are added to block cache. 
TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { Options options = CurrentOptions(); - std::unique_ptr filter_policy(NewBloomFilterPolicy(20)); - options.filter_policy = filter_policy.get(); options.create_if_missing = true; options.statistics = rocksdb::CreateDBStatistics(); BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, &options); @@ -1219,6 +1325,7 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { TEST(DBTest, GetPropertiesOfAllTablesTest) { Options options = CurrentOptions(); + options.max_background_flushes = 0; Reopen(&options); // Create 4 tables for (int table = 0; table < 4; ++table) { @@ -1414,7 +1521,10 @@ TEST(DBTest, GetPicksCorrectFile) { TEST(DBTest, GetEncountersEmptyLevel) { do { - CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(); + options.max_background_flushes = 0; + options.disableDataSync = true; + CreateAndReopenWithCF({"pikachu"}, &options); // Arrange for the following to happen: // * sstable A in level 0 // * nothing in level 1 @@ -1458,8 +1568,9 @@ TEST(DBTest, KeyMayExist) { do { ReadOptions ropts; std::string value; - Options options = CurrentOptions(); - options.filter_policy = NewBloomFilterPolicy(20); + anon::OptionsOverride options_override; + options_override.filter_policy.reset(NewBloomFilterPolicy(20)); + Options options = CurrentOptions(options_override); options.statistics = rocksdb::CreateDBStatistics(); CreateAndReopenWithCF({"pikachu"}, &options); @@ -1510,8 +1621,6 @@ TEST(DBTest, KeyMayExist) { ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - delete options.filter_policy; - // KeyMayExist function only checks data in block caches, which is not used // by plain table format. 
} while ( @@ -1578,16 +1687,17 @@ TEST(DBTest, NonBlockingIteration) { // This test verifies block cache behaviors, which is not used by plain // table format. // Exclude kHashCuckoo as it does not support iteration currently - } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | - kSkipHashCuckoo)); + } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo | + kSkipMmapReads)); } // A delete is skipped for key if KeyMayExist(key) returns False // Tests Writebatch consistency and proper delete behaviour TEST(DBTest, FilterDeletes) { do { - Options options = CurrentOptions(); - options.filter_policy = NewBloomFilterPolicy(20); + anon::OptionsOverride options_override; + options_override.filter_policy.reset(NewBloomFilterPolicy(20)); + Options options = CurrentOptions(options_override); options.filter_deletes = true; CreateAndReopenWithCF({"pikachu"}, &options); WriteBatch batch; @@ -1617,8 +1727,6 @@ TEST(DBTest, FilterDeletes) { dbfull()->Write(WriteOptions(), &batch); ASSERT_EQ(AllEntriesFor("c", 1), "[ DEL, d ]"); // Delete issued batch.Clear(); - - delete options.filter_policy; } while (ChangeCompactOptions()); } @@ -2464,6 +2572,49 @@ class SleepingBackgroundTask { bool done_with_sleep_; }; +TEST(DBTest, FlushEmptyColumnFamily) { + // Block flush thread and disable compaction thread + env_->SetBackgroundThreads(1, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + SleepingBackgroundTask sleeping_task_high; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_high, + Env::Priority::HIGH); + + Options options = CurrentOptions(); + // disable compaction + options.disable_auto_compactions = true; + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 2; + options.min_write_buffer_number_to_merge = 1; + 
CreateAndReopenWithCF({"pikachu"}, &options); + + // Compaction can still go through even if no thread can flush the + // mem table. + ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); + + // Insert can go through + ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + + ASSERT_EQ("v1", Get(0, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + + sleeping_task_high.WakeUp(); + sleeping_task_high.WaitUntilDone(); + + // Flush can still go through. + ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); +} + TEST(DBTest, GetProperty) { // Set sizes to both background thread pool to be 1 and block them. env_->SetBackgroundThreads(1, Env::HIGH); @@ -2657,6 +2808,44 @@ TEST(DBTest, RecoverDuringMemtableCompaction) { } while (ChangeOptions()); } +TEST(DBTest, FlushSchedule) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.level0_stop_writes_trigger = 1 << 10; + options.level0_slowdown_writes_trigger = 1 << 10; + options.min_write_buffer_number_to_merge = 1; + options.max_write_buffer_number = 2; + options.write_buffer_size = 100 * 1000; + CreateAndReopenWithCF({"pikachu"}, &options); + std::vector threads; + + std::atomic thread_num(0); + // each column family will have 5 thread, each thread generating 2 memtables. 
+ // each column family should end up with 10 table files + for (int i = 0; i < 10; ++i) { + threads.emplace_back([&]() { + int a = thread_num.fetch_add(1); + Random rnd(a); + WriteOptions wo; + // this should fill up 2 memtables + for (int k = 0; k < 5000; ++k) { + ASSERT_OK(db_->Put(wo, handles_[a & 1], RandomString(&rnd, 13), "")); + } + }); + } + + for (auto& t : threads) { + t.join(); + } + + auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default"); + auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu"); + ASSERT_LE(default_tables, static_cast(10)); + ASSERT_GT(default_tables, static_cast(0)); + ASSERT_LE(pikachu_tables, static_cast(10)); + ASSERT_GT(pikachu_tables, static_cast(0)); +} + TEST(DBTest, MinorCompactionsHappen) { do { Options options; @@ -3332,32 +3521,37 @@ TEST(DBTest, CompressedCache) { // Iteration 4: both block cache and compressed cache, but DB is not // compressed for (int iter = 0; iter < 4; iter++) { - Options options = CurrentOptions(); + Options options; options.write_buffer_size = 64*1024; // small write buffer options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; switch (iter) { case 0: // only uncompressed block cache - options.block_cache = NewLRUCache(8*1024); - options.block_cache_compressed = nullptr; + table_options.block_cache = NewLRUCache(8*1024); + table_options.block_cache_compressed = nullptr; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); break; case 1: // no block cache, only compressed cache - options.no_block_cache = true; - options.block_cache = nullptr; - options.block_cache_compressed = NewLRUCache(8*1024); + table_options.no_block_cache = true; + table_options.block_cache = nullptr; + table_options.block_cache_compressed = NewLRUCache(8*1024); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); break; case 2: // both compressed and uncompressed block cache - options.block_cache = 
NewLRUCache(1024); - options.block_cache_compressed = NewLRUCache(8*1024); + table_options.block_cache = NewLRUCache(1024); + table_options.block_cache_compressed = NewLRUCache(8*1024); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); break; case 3: // both block cache and compressed cache, but DB is not compressed // also, make block cache sizes bigger, to trigger block cache hits - options.block_cache = NewLRUCache(1024 * 1024); - options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024); + table_options.block_cache = NewLRUCache(1024 * 1024); + table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.compression = kNoCompression; break; default: @@ -3366,9 +3560,11 @@ TEST(DBTest, CompressedCache) { CreateAndReopenWithCF({"pikachu"}, &options); // default column family doesn't have block cache Options no_block_cache_opts; - no_block_cache_opts.no_block_cache = true; no_block_cache_opts.statistics = options.statistics; - options = CurrentOptions(options); + BlockBasedTableOptions table_options_no_bc; + table_options_no_bc.no_block_cache = true; + no_block_cache_opts.table_factory.reset( + NewBlockBasedTableFactory(table_options_no_bc)); ReopenWithColumnFamilies({"default", "pikachu"}, {&no_block_cache_opts, &options}); @@ -4162,22 +4358,25 @@ TEST(DBTest, CompactionFilter) { // TODO: figure out sequence number squashtoo int count = 0; int total = 0; - Iterator* iter = dbfull()->TEST_NewInternalIterator(handles_[1]); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - total++; - if (ikey.sequence != 0) { - count++; + Arena arena; + { + ScopedArenaIterator iter( + dbfull()->TEST_NewInternalIterator(&arena, handles_[1])); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while 
(iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; + } + iter->Next(); } - iter->Next(); } ASSERT_EQ(total, 100000); ASSERT_EQ(count, 1); - delete iter; // overwrite all the 100K keys once again. for (int i = 0; i < 100000; i++) { @@ -4232,7 +4431,7 @@ TEST(DBTest, CompactionFilter) { ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); // Scan the entire database to ensure that nothing is left - iter = db_->NewIterator(ReadOptions(), handles_[1]); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); count = 0; while (iter->Valid()) { @@ -4248,18 +4447,20 @@ TEST(DBTest, CompactionFilter) { // TODO: remove the following or design a different // test count = 0; - iter = dbfull()->TEST_NewInternalIterator(handles_[1]); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - ASSERT_NE(ikey.sequence, (unsigned)0); - count++; - iter->Next(); + { + ScopedArenaIterator iter( + dbfull()->TEST_NewInternalIterator(&arena, handles_[1])); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + ASSERT_NE(ikey.sequence, (unsigned)0); + count++; + iter->Next(); + } + ASSERT_EQ(count, 0); } - ASSERT_EQ(count, 0); - delete iter; } // Tests the edge case where compaction does not produce any output -- all @@ -4381,22 +4582,24 @@ TEST(DBTest, CompactionFilterContextManual) { // Verify total number of keys is correct after manual compaction. 
int count = 0; int total = 0; - Iterator* iter = dbfull()->TEST_NewInternalIterator(); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - total++; - if (ikey.sequence != 0) { - count++; + { + Arena arena; + ScopedArenaIterator iter(dbfull()->TEST_NewInternalIterator(&arena)); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; + } + iter->Next(); } - iter->Next(); + ASSERT_EQ(total, 700); + ASSERT_EQ(count, 1); } - ASSERT_EQ(total, 700); - ASSERT_EQ(count, 1); - delete iter; } class KeepFilterV2 : public CompactionFilterV2 { @@ -4553,25 +4756,27 @@ TEST(DBTest, CompactionFilterV2) { // All the files are in the lowest level. int count = 0; int total = 0; - Iterator* iter = dbfull()->TEST_NewInternalIterator(); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - total++; - if (ikey.sequence != 0) { - count++; + { + Arena arena; + ScopedArenaIterator iter(dbfull()->TEST_NewInternalIterator(&arena)); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; + } + iter->Next(); } - iter->Next(); } ASSERT_EQ(total, 100000); // 1 snapshot only. 
Since we are using universal compacton, // the sequence no is cleared for better compression ASSERT_EQ(count, 1); - delete iter; // create a new database with the compaction // filter in such a way that it deletes all keys @@ -4595,7 +4800,7 @@ TEST(DBTest, CompactionFilterV2) { ASSERT_EQ(NumTableFilesAtLevel(1), 0); // Scan the entire database to ensure that nothing is left - iter = db_->NewIterator(ReadOptions()); + Iterator* iter = db_->NewIterator(ReadOptions()); iter->SeekToFirst(); count = 0; while (iter->Valid()) { @@ -4923,7 +5128,9 @@ TEST(DBTest, Snapshot) { TEST(DBTest, HiddenValuesAreRemoved) { do { - CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, &options); Random rnd(301); FillLevels("a", "z", 1); @@ -5014,7 +5221,9 @@ TEST(DBTest, CompactBetweenSnapshots) { } TEST(DBTest, DeletionMarkers1) { - CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, &options); Put(1, "foo", "v1"); ASSERT_OK(Flush(1)); const int last = CurrentOptions().max_mem_compaction_level; @@ -5049,7 +5258,9 @@ TEST(DBTest, DeletionMarkers1) { } TEST(DBTest, DeletionMarkers2) { - CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, &options); Put(1, "foo", "v1"); ASSERT_OK(Flush(1)); const int last = CurrentOptions().max_mem_compaction_level; @@ -5078,7 +5289,9 @@ TEST(DBTest, DeletionMarkers2) { TEST(DBTest, OverlapInLevel0) { do { - CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, &options); int tmp = CurrentOptions().max_mem_compaction_level; ASSERT_EQ(tmp, 2) << "Fix test to match config"; @@ -5228,7 +5441,6 @@ TEST(DBTest, CustomComparator) { new_options = CurrentOptions(); 
new_options.create_if_missing = true; new_options.comparator = &cmp; - new_options.filter_policy = nullptr; // Cannot use bloom filters new_options.write_buffer_size = 1000; // Compact more often new_options = CurrentOptions(new_options); DestroyAndReopen(&new_options); @@ -5257,7 +5469,9 @@ TEST(DBTest, CustomComparator) { } TEST(DBTest, ManualCompaction) { - CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, &options); ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2) << "Need to update this test to match kMaxMemCompactLevel"; @@ -5295,6 +5509,7 @@ TEST(DBTest, ManualCompaction) { if (iter == 0) { Options options = CurrentOptions(); + options.max_background_flushes = 0; options.num_levels = 3; options.create_if_missing = true; DestroyAndReopen(&options); @@ -5394,6 +5609,7 @@ TEST(DBTest, DBOpen_Options) { TEST(DBTest, DBOpen_Change_NumLevels) { Options opts; opts.create_if_missing = true; + opts.max_background_flushes = 0; DestroyAndReopen(&opts); ASSERT_TRUE(db_ != nullptr); CreateAndReopenWithCF({"pikachu"}, &opts); @@ -5444,8 +5660,8 @@ TEST(DBTest, DestroyDBMetaDatabase) { ASSERT_TRUE(!(DB::Open(opts, metametadbname, &db)).ok()); } -// Check that number of files does not grow when we are out of space -TEST(DBTest, NoSpace) { +// Check that number of files does not grow when writes are dropped +TEST(DBTest, DropWrites) { do { Options options = CurrentOptions(); options.env = env_; @@ -5456,7 +5672,7 @@ TEST(DBTest, NoSpace) { ASSERT_EQ("v1", Get("foo")); Compact("a", "z"); const int num_files = CountFiles(); - env_->no_space_.Release_Store(env_); // Force out-of-space errors + env_->drop_writes_.Release_Store(env_); // Force out-of-space errors env_->sleep_counter_.Reset(); for (int i = 0; i < 5; i++) { for (int level = 0; level < dbfull()->NumberLevels()-1; level++) { @@ -5468,7 +5684,7 @@ TEST(DBTest, NoSpace) { 
ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); ASSERT_EQ("5", property_value); - env_->no_space_.Release_Store(nullptr); + env_->drop_writes_.Release_Store(nullptr); ASSERT_LT(CountFiles(), num_files + 3); // Check that compaction attempts slept after errors @@ -5477,7 +5693,7 @@ TEST(DBTest, NoSpace) { } // Check background error counter bumped on flush failures. -TEST(DBTest, NoSpaceFlush) { +TEST(DBTest, DropWritesFlush) { do { Options options = CurrentOptions(); options.env = env_; @@ -5485,7 +5701,7 @@ TEST(DBTest, NoSpaceFlush) { Reopen(&options); ASSERT_OK(Put("foo", "v1")); - env_->no_space_.Release_Store(env_); // Force out-of-space errors + env_->drop_writes_.Release_Store(env_); // Force out-of-space errors std::string property_value; // Background error count is 0 now. @@ -5509,6 +5725,30 @@ TEST(DBTest, NoSpaceFlush) { } ASSERT_EQ("1", property_value); + env_->drop_writes_.Release_Store(nullptr); + } while (ChangeCompactOptions()); +} + +// Check that CompactRange() returns failure if there is not enough space left +// on device +TEST(DBTest, NoSpaceCompactRange) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.disable_auto_compactions = true; + Reopen(&options); + + // generate 5 tables + for (int i = 0; i < 5; ++i) { + ASSERT_OK(Put(Key(i), Key(i) + "v")); + ASSERT_OK(Flush()); + } + + env_->no_space_.Release_Store(env_); // Force out-of-space errors + + Status s = db_->CompactRange(nullptr, nullptr); + ASSERT_TRUE(s.IsIOError()); + env_->no_space_.Release_Store(nullptr); } while (ChangeCompactOptions()); } @@ -5553,6 +5793,7 @@ TEST(DBTest, ManifestWriteError) { options.env = env_; options.create_if_missing = true; options.error_if_exists = false; + options.max_background_flushes = 0; DestroyAndReopen(&options); ASSERT_OK(Put("foo", "bar")); ASSERT_EQ("bar", Get("foo")); @@ -5636,11 +5877,16 @@ TEST(DBTest, FilesDeletedAfterCompaction) { TEST(DBTest, BloomFilter) { do { - 
env_->count_random_reads_ = true; Options options = CurrentOptions(); + env_->count_random_reads_ = true; options.env = env_; - options.no_block_cache = true; - options.filter_policy = NewBloomFilterPolicy(10); + // ChangeCompactOptions() only changes compaction style, which does not + // trigger reset of table_factory + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, &options); // Populate multiple layers @@ -5678,10 +5924,169 @@ TEST(DBTest, BloomFilter) { env_->delay_sstable_sync_.Release_Store(nullptr); Close(); - delete options.filter_policy; } while (ChangeCompactOptions()); } +TEST(DBTest, BloomFilterRate) { + while (ChangeFilterOptions()) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, &options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + // Add a large key to make the file contain wide range + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + + // Check if filter is useful + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333))); + } + ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98); + } +} + +TEST(DBTest, BloomFilterCompatibility) { + Options options; + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // Create with block based filter + CreateAndReopenWithCF({"pikachu"}, 
&options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + + // Check db with full filter + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); +} + +TEST(DBTest, BloomFilterReverseCompatibility) { + Options options; + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // Create with full filter + CreateAndReopenWithCF({"pikachu"}, &options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + + // Check db with block_based filter + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); +} + +namespace { +// A wrapped bloom over default FilterPolicy +class WrappedBloom : public FilterPolicy { + public: + explicit WrappedBloom(int bits_per_key) : + filter_(NewBloomFilterPolicy(bits_per_key)), + counter_(0) {} + + ~WrappedBloom() { delete filter_; } + + const char* Name() const override { return "WrappedRocksDbFilterPolicy"; } + + void CreateFilter(const rocksdb::Slice* keys, 
int n, std::string* dst) + const override { + std::unique_ptr user_keys(new rocksdb::Slice[n]); + for (int i = 0; i < n; ++i) { + user_keys[i] = convertKey(keys[i]); + } + return filter_->CreateFilter(user_keys.get(), n, dst); + } + + bool KeyMayMatch(const rocksdb::Slice& key, const rocksdb::Slice& filter) + const override { + counter_++; + return filter_->KeyMayMatch(convertKey(key), filter); + } + + uint32_t GetCounter() { return counter_; } + + private: + const FilterPolicy* filter_; + mutable uint32_t counter_; + + rocksdb::Slice convertKey(const rocksdb::Slice key) const { + return key; + } +}; +} // namespace + +TEST(DBTest, BloomFilterWrapper) { + Options options; + options.statistics = rocksdb::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + WrappedBloom* policy = new WrappedBloom(10); + table_options.filter_policy.reset(policy); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + CreateAndReopenWithCF({"pikachu"}, &options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + // Add a large key to make the file contain wide range + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + ASSERT_EQ(0U, policy->GetCounter()); + Flush(1); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ(1U * maxKey, policy->GetCounter()); + + // Check if filter is useful + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333))); + } + ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98); + ASSERT_EQ(2U * maxKey, policy->GetCounter()); +} + TEST(DBTest, SnapshotFiles) { do { Options options = CurrentOptions(); @@ -5901,18 +6306,18 @@ namespace { std::vector ListSpecificFiles( Env* env, const std::string& path, const FileType expected_file_type) { std::vector files; - std::vector 
log_files; + std::vector file_numbers; env->GetChildren(path, &files); uint64_t number; FileType type; for (size_t i = 0; i < files.size(); ++i) { if (ParseFileName(files[i], &number, &type)) { if (type == expected_file_type) { - log_files.push_back(number); + file_numbers.push_back(number); } } } - return std::move(log_files); + return std::move(file_numbers); } std::vector ListLogFiles(Env* env, const std::string& path) { @@ -5946,6 +6351,130 @@ TEST(DBTest, FlushOneColumnFamily) { } } +// In https://reviews.facebook.net/D20661 we change +// recovery behavior: previously for each log file each column family +// memtable was flushed, even it was empty. Now it's changed: +// we try to create the smallest number of table files by merging +// updates from multiple logs +TEST(DBTest, RecoverCheckFileAmountWithSmallWriteBuffer) { + Options options; + options.write_buffer_size = 5000000; + CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, &options); + + // Since we will reopen DB with smaller write_buffer_size, + // each key will go to new SST file + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + + ASSERT_OK(Put(3, Key(10), DummyString(1))); + // Make 'dobrynia' to be flushed and new WAL file to be created + ASSERT_OK(Put(2, Key(10), DummyString(7500000))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + { + auto tables = ListTableFiles(env_, dbname_); + ASSERT_EQ(tables.size(), static_cast(1)); + // Make sure 'dobrynia' was flushed: check sst files amount + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + } + // New WAL file + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(3, Key(10), DummyString(1))); + ASSERT_OK(Put(3, Key(10), DummyString(1))); + 
ASSERT_OK(Put(3, Key(10), DummyString(1))); + + options.write_buffer_size = 10; + ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, + &options); + { + // No inserts => default is empty + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + // First 4 keys goes to separate SSTs + 1 more SST for 2 smaller keys + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(5)); + // 1 SST for big key + 1 SST for small one + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(2)); + // 1 SST for all keys + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(1)); + } +} + +// In https://reviews.facebook.net/D20661 we change +// recovery behavior: previously for each log file each column family +// memtable was flushed, even it wasn't empty. Now it's changed: +// we try to create the smallest number of table files by merging +// updates from multiple logs +TEST(DBTest, RecoverCheckFileAmount) { + Options options; + options.write_buffer_size = 100000; + CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, &options); + + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + + // Make 'nikitich' memtable to be flushed + ASSERT_OK(Put(3, Key(10), DummyString(1002400))); + ASSERT_OK(Put(3, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + // 4 memtable are not flushed, 1 sst file + { + auto tables = ListTableFiles(env_, dbname_); + ASSERT_EQ(tables.size(), static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(1)); + } + // Memtable for 'nikitich' has flushed, new WAL file has opened + // 4 memtable still not flushed + + // Write to new WAL file + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + + // Fill up 'nikitich' 
one more time + ASSERT_OK(Put(3, Key(10), DummyString(1002400))); + // make it flush + ASSERT_OK(Put(3, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + // There are still 4 memtable not flushed, and 2 sst tables + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + + { + auto tables = ListTableFiles(env_, dbname_); + ASSERT_EQ(tables.size(), static_cast(2)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(2)); + } + + ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, + &options); + { + std::vector table_files = ListTableFiles(env_, dbname_); + // Check, that records for 'default', 'dobrynia' and 'pikachu' from + // first, second and third WALs went to the same SST. + // So, there is 6 SSTs: three for 'nikitich', one for 'default', one for + // 'dobrynia', one for 'pikachu' + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(3)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(1)); + } +} + TEST(DBTest, WALArchivalTtl) { do { Options options = CurrentOptions(); @@ -6075,13 +6604,28 @@ TEST(DBTest, PurgeInfoLogs) { int info_log_count = 0; for (std::string file : files) { if (file.find("LOG") != std::string::npos) { - if (mode == 1) { - env_->DeleteFile(options.db_log_dir + "/" + file); - } info_log_count++; } } ASSERT_EQ(5, info_log_count); + + Destroy(&options); + // For mode (1), test DestroyDB() to delete all the logs under DB dir. + // For mode (2), no info log file should have been put under DB dir. 
+ std::vector db_files; + env_->GetChildren(dbname_, &db_files); + for (std::string file : db_files) { + ASSERT_TRUE(file.find("LOG") == std::string::npos); + } + + if (mode == 1) { + // Cleaning up + env_->GetChildren(options.db_log_dir, &files); + for (std::string file : files) { + env_->DeleteFile(options.db_log_dir + "/" + file); + } + env_->DeleteDir(options.db_log_dir); + } } } @@ -6429,13 +6973,26 @@ static void MTThreadBody(void* arg) { // into each of the CFs // We add some padding for force compactions. int unique_id = rnd.Uniform(1000000); - WriteBatch batch; - for (int cf = 0; cf < kColumnFamilies; ++cf) { - snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id, - static_cast(counter), cf, unique_id); - batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf)); + + // Half of the time directly use WriteBatch. Half of the time use + // WriteBatchWithIndex. + if (rnd.OneIn(2)) { + WriteBatch batch; + for (int cf = 0; cf < kColumnFamilies; ++cf) { + snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id, + static_cast(counter), cf, unique_id); + batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf)); + } + ASSERT_OK(db->Write(WriteOptions(), &batch)); + } else { + WriteBatchWithIndex batch(db->GetOptions().comparator); + for (int cf = 0; cf < kColumnFamilies; ++cf) { + snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id, + static_cast(counter), cf, unique_id); + batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf)); + } + ASSERT_OK(db->Write(WriteOptions(), batch.GetWriteBatch())); } - ASSERT_OK(db->Write(WriteOptions(), &batch)); } else { // Read a value and verify that it matches the pattern written above // and that writes to all column families were atomic (unique_id is the @@ -7104,45 +7661,49 @@ void PrefixScanInit(DBTest *dbtest) { } // namespace TEST(DBTest, PrefixScan) { - int count; - Slice prefix; - Slice key; - char buf[100]; - Iterator* iter; - snprintf(buf, sizeof(buf), 
"03______:"); - prefix = Slice(buf, 8); - key = Slice(buf, 9); - // db configs - env_->count_random_reads_ = true; - Options options = CurrentOptions(); - options.env = env_; - options.no_block_cache = true; - options.filter_policy = NewBloomFilterPolicy(10); - options.prefix_extractor.reset(NewFixedPrefixTransform(8)); - options.whole_key_filtering = false; - options.disable_auto_compactions = true; - options.max_background_compactions = 2; - options.create_if_missing = true; - options.memtable_factory.reset(NewHashSkipListRepFactory(16)); + while (ChangeFilterOptions()) { + int count; + Slice prefix; + Slice key; + char buf[100]; + Iterator* iter; + snprintf(buf, sizeof(buf), "03______:"); + prefix = Slice(buf, 8); + key = Slice(buf, 9); + // db configs + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.disable_auto_compactions = true; + options.max_background_compactions = 2; + options.create_if_missing = true; + options.memtable_factory.reset(NewHashSkipListRepFactory(16)); - // 11 RAND I/Os - DestroyAndReopen(&options); - PrefixScanInit(this); - count = 0; - env_->random_read_counter_.Reset(); - iter = db_->NewIterator(ReadOptions()); - for (iter->Seek(prefix); iter->Valid(); iter->Next()) { - if (! iter->key().starts_with(prefix)) { - break; + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + table_options.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // 11 RAND I/Os + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + iter = db_->NewIterator(ReadOptions()); + for (iter->Seek(prefix); iter->Valid(); iter->Next()) { + if (! 
iter->key().starts_with(prefix)) { + break; + } + count++; } - count++; - } - ASSERT_OK(iter->status()); - delete iter; - ASSERT_EQ(count, 2); - ASSERT_EQ(env_->random_read_counter_.Read(), 2); - Close(); - delete options.filter_policy; + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + Close(); + } // end of while } TEST(DBTest, TailingIteratorSingle) { @@ -7457,25 +8018,35 @@ TEST(DBTest, FIFOCompactionTest) { } TEST(DBTest, SimpleWriteTimeoutTest) { + // Block compaction thread, which will also block the flushes because + // max_background_flushes == 0, so flushes are getting executed by the + // compaction thread + env_->SetBackgroundThreads(1, Env::LOW); + SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + Options options; options.env = env_; options.create_if_missing = true; options.write_buffer_size = 100000; options.max_background_flushes = 0; options.max_write_buffer_number = 2; - options.min_write_buffer_number_to_merge = 3; options.max_total_wal_size = std::numeric_limits::max(); WriteOptions write_opt = WriteOptions(); write_opt.timeout_hint_us = 0; DestroyAndReopen(&options); - // fill the two write buffer + // fill the two write buffers ASSERT_OK(Put(Key(1), Key(1) + std::string(100000, 'v'), write_opt)); ASSERT_OK(Put(Key(2), Key(2) + std::string(100000, 'v'), write_opt)); // As the only two write buffers are full in this moment, the third // Put is expected to be timed-out. 
- write_opt.timeout_hint_us = 300; + write_opt.timeout_hint_us = 50; ASSERT_TRUE( Put(Key(3), Key(3) + std::string(100000, 'v'), write_opt).IsTimedOut()); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); } // Multi-threaded Timeout Test @@ -7576,6 +8147,26 @@ TEST(DBTest, MTRandomTimeoutTest) { } } +TEST(DBTest, Level0StopWritesTest) { + Options options = CurrentOptions(); + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.disable_auto_compactions = 4; + options.max_mem_compaction_level = 0; + Reopen(&options); + + // create 4 level0 tables + for (int i = 0; i < 4; ++i) { + Put("a", "b"); + Flush(); + } + + WriteOptions woptions; + woptions.timeout_hint_us = 30 * 1000; // 30 ms + Status s = Put("a", "b", woptions); + ASSERT_TRUE(s.IsTimedOut()); +} + } // anonymous namespace /* @@ -7650,6 +8241,201 @@ TEST(DBTest, RateLimitingTest) { ASSERT_TRUE(ratio < 0.6); } +TEST(DBTest, TableOptionsSanitizeTest) { + Options options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(&options); + ASSERT_EQ(db_->GetOptions().allow_mmap_reads, false); + + options.table_factory.reset(new PlainTableFactory()); + options.prefix_extractor.reset(NewNoopTransform()); + Destroy(&options); + ASSERT_TRUE(TryReopen(&options).IsNotSupported()); +} + +TEST(DBTest, DBIteratorBoundTest) { + Options options; + options.env = env_; + options.create_if_missing = true; + + options.prefix_extractor = nullptr; + DestroyAndReopen(&options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("g1", "0")); + + // testing basic case with no iterate_upper_bound and no prefix_extractor + { + ReadOptions ro; + ro.iterate_upper_bound = nullptr; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("foo"); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + 
ASSERT_EQ(iter->key().compare(Slice("foo1")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("g1")), 0); + } + + // testing iterate_upper_bound and forward iterator + // to make sure it stops at bound + { + ReadOptions ro; + // iterate_upper_bound points beyond the last expected entry + Slice prefix("foo2"); + ro.iterate_upper_bound = &prefix; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("foo"); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(("foo1")), 0); + + iter->Next(); + // should stop here... + ASSERT_TRUE(!iter->Valid()); + } + + // prefix is the first letter of the key + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + + DestroyAndReopen(&options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("g1", "0")); + + // testing with iterate_upper_bound and prefix_extractor + // Seek target and iterate_upper_bound are not is same prefix + // This should be an error + { + ReadOptions ro; + Slice prefix("g1"); + ro.iterate_upper_bound = &prefix; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("foo"); + + ASSERT_TRUE(!iter->Valid()); + ASSERT_TRUE(iter->status().IsInvalidArgument()); + } + + // testing that iterate_upper_bound prevents iterating over deleted items + // if the bound has already reached + { + options.prefix_extractor = nullptr; + DestroyAndReopen(&options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("b", "0")); + ASSERT_OK(Put("b1", "0")); + ASSERT_OK(Put("c", "0")); + ASSERT_OK(Put("d", "0")); + ASSERT_OK(Put("e", "0")); + ASSERT_OK(Delete("c")); + ASSERT_OK(Delete("d")); + + // base case with no bound + ReadOptions ro; + ro.iterate_upper_bound = nullptr; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + 
ASSERT_EQ(iter->key().compare(Slice("b")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(("b1")), 0); + + perf_context.Reset(); + iter->Next(); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 2); + + // now testing with iterate_bound + Slice prefix("c"); + ro.iterate_upper_bound = &prefix; + + iter.reset(db_->NewIterator(ro)); + + perf_context.Reset(); + + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("b")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(("b1")), 0); + + iter->Next(); + // the iteration should stop as soon as the the bound key is reached + // even though the key is deleted + // hence internal_delete_skipped_count should be 0 + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 0); + } +} + +TEST(DBTest, WriteSingleThreadEntry) { + std::vector threads; + dbfull()->TEST_LockMutex(); + auto w = dbfull()->TEST_BeginWrite(); + threads.emplace_back([&] { Put("a", "b"); }); + env_->SleepForMicroseconds(10000); + threads.emplace_back([&] { Flush(); }); + env_->SleepForMicroseconds(10000); + dbfull()->TEST_UnlockMutex(); + dbfull()->TEST_LockMutex(); + dbfull()->TEST_EndWrite(w); + dbfull()->TEST_UnlockMutex(); + + for (auto& t : threads) { + t.join(); + } +} + +TEST(DBTest, DisableDataSyncTest) { + // iter 0 -- no sync + // iter 1 -- sync + for (int iter = 0; iter < 2; ++iter) { + Options options = CurrentOptions(); + options.disableDataSync = iter == 0; + options.create_if_missing = true; + options.env = env_; + Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); + + MakeTables(10, "a", "z"); + Compact("a", "z"); + + if (iter == 0) { + ASSERT_EQ(env_->sync_counter_.load(), 0); + } else { + ASSERT_GT(env_->sync_counter_.load(), 0); + } + Destroy(&options); + } +} + + } // namespace rocksdb int main(int argc, char** argv) { diff 
--git a/db/dbformat.cc b/db/dbformat.cc index e53d16dc1bc..baeb868027e 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -127,26 +127,6 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const { } } -const char* InternalFilterPolicy::Name() const { - return user_policy_->Name(); -} - -void InternalFilterPolicy::CreateFilter(const Slice* keys, int n, - std::string* dst) const { - // We rely on the fact that the code in table.cc does not mind us - // adjusting keys[]. - Slice* mkey = const_cast(keys); - for (int i = 0; i < n; i++) { - mkey[i] = ExtractUserKey(keys[i]); - // TODO(sanjay): Suppress dups? - } - user_policy_->CreateFilter(keys, n, dst); -} - -bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const { - return user_policy_->KeyMayMatch(ExtractUserKey(key), f); -} - LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) { size_t usize = user_key.size(); size_t needed = usize + 13; // A conservative estimate diff --git a/db/dbformat.h b/db/dbformat.h index b6a6c7a35a2..516a4693b33 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -124,17 +124,6 @@ class InternalKeyComparator : public Comparator { int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const; }; -// Filter policy wrapper that converts from internal keys to user keys -class InternalFilterPolicy : public FilterPolicy { - private: - const FilterPolicy* const user_policy_; - public: - explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { } - virtual const char* Name() const; - virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const; - virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const; -}; - // Modules in this directory should keep internal keys wrapped inside // the following class instead of plain strings so that we do not // incorrectly use string comparisons instead of an InternalKeyComparator. 
@@ -255,7 +244,7 @@ class IterKey { Slice GetKey() const { return Slice(key_, key_size_); } - const size_t Size() { return key_size_; } + size_t Size() { return key_size_; } void Clear() { key_size_ = 0; } @@ -401,4 +390,12 @@ class InternalKeySliceTransform : public SliceTransform { const SliceTransform* const transform_; }; +// Read record from a write batch piece from input. +// tag, column_family, key, value and blob are return values. Callers own the +// Slice they point to. +// Tag is defined as ValueType. +// input will be advanced to after the record. +extern Status ReadRecordFromWriteBatch(Slice* input, char* tag, + uint32_t* column_family, Slice* key, + Slice* value, Slice* blob); } // namespace rocksdb diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 14f0324c172..a5af3128484 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -34,6 +34,7 @@ class DeleteFileTest { DeleteFileTest() { db_ = nullptr; env_ = Env::Default(); + options_.max_background_flushes = 0; options_.write_buffer_size = 1024*1024*1000; options_.target_file_size_base = 1024*1024*1000; options_.max_bytes_for_level_base = 1024*1024*1000; diff --git a/db/filename.cc b/db/filename.cc index 42c7efb781b..a8f6852968c 100644 --- a/db/filename.cc +++ b/db/filename.cc @@ -6,7 +6,10 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include "db/filename.h" #include diff --git a/db/flush_scheduler.cc b/db/flush_scheduler.cc new file mode 100644 index 00000000000..636ff5a98ee --- /dev/null +++ b/db/flush_scheduler.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "db/flush_scheduler.h" + +#include + +#include "db/column_family.h" + +namespace rocksdb { + +void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) { +#ifndef NDEBUG + assert(column_families_set_.find(cfd) == column_families_set_.end()); + column_families_set_.insert(cfd); +#endif // NDEBUG + cfd->Ref(); + column_families_.push_back(cfd); +} + +ColumnFamilyData* FlushScheduler::GetNextColumnFamily() { + ColumnFamilyData* cfd = nullptr; + while (column_families_.size() > 0) { + cfd = column_families_.front(); + column_families_.pop_front(); + if (cfd->IsDropped()) { + if (cfd->Unref()) { + delete cfd; + } + } else { + break; + } + } +#ifndef NDEBUG + if (cfd != nullptr) { + auto itr = column_families_set_.find(cfd); + assert(itr != column_families_set_.end()); + column_families_set_.erase(itr); + } +#endif // NDEBUG + return cfd; +} + +bool FlushScheduler::Empty() { return column_families_.empty(); } + +void FlushScheduler::Clear() { + for (auto cfd : column_families_) { +#ifndef NDEBUG + auto itr = column_families_set_.find(cfd); + assert(itr != column_families_set_.end()); + column_families_set_.erase(itr); +#endif // NDEBUG + if (cfd->Unref()) { + delete cfd; + } + } + column_families_.clear(); +} + +} // namespace rocksdb diff --git a/db/flush_scheduler.h b/db/flush_scheduler.h new file mode 100644 index 00000000000..201e4a13c75 --- /dev/null +++ b/db/flush_scheduler.h @@ -0,0 +1,39 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include +#include +#include + +namespace rocksdb { + +class ColumnFamilyData; + +// This class is thread-compatible. 
It's should only be accessed from single +// write thread (between BeginWrite() and EndWrite()) +class FlushScheduler { + public: + FlushScheduler() = default; + ~FlushScheduler() = default; + + void ScheduleFlush(ColumnFamilyData* cfd); + // Returns Ref()-ed column family. Client needs to Unref() + ColumnFamilyData* GetNextColumnFamily(); + + bool Empty(); + + void Clear(); + + private: + std::deque column_families_; +#ifndef NDEBUG + std::set column_families_set_; +#endif // NDEBUG +}; + +} // namespace rocksdb diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 79cc953cf79..6b78c4037bb 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -6,9 +6,10 @@ #ifndef ROCKSDB_LITE #include "db/forward_iterator.h" +#include #include #include -#include + #include "db/db_impl.h" #include "db/db_iter.h" #include "db/column_family.h" @@ -37,12 +38,16 @@ class LevelIterator : public Iterator { assert(file_index < files_.size()); if (file_index != file_index_) { file_index_ = file_index; - file_iter_.reset(cfd_->table_cache()->NewIterator( - read_options_, *(cfd_->soptions()), cfd_->internal_comparator(), - files_[file_index_]->fd, nullptr /* table_reader_ptr */, false)); + Reset(); } valid_ = false; } + void Reset() { + assert(file_index_ < files_.size()); + file_iter_.reset(cfd_->table_cache()->NewIterator( + read_options_, *(cfd_->soptions()), cfd_->internal_comparator(), + files_[file_index_]->fd, nullptr /* table_reader_ptr */, false)); + } void SeekToLast() override { status_ = Status::NotSupported("LevelIterator::SeekToLast()"); valid_ = false; @@ -63,12 +68,15 @@ class LevelIterator : public Iterator { assert(file_iter_ != nullptr); file_iter_->Seek(internal_key); valid_ = file_iter_->Valid(); - assert(valid_); } void Next() override { assert(valid_); file_iter_->Next(); - while (!file_iter_->Valid()) { + for (;;) { + if (file_iter_->status().IsIncomplete() || file_iter_->Valid()) { + valid_ = !file_iter_->status().IsIncomplete(); + return; 
+ } if (file_index_ + 1 >= files_.size()) { valid_ = false; return; @@ -76,7 +84,6 @@ class LevelIterator : public Iterator { SetFileIndex(file_index_ + 1); file_iter_->SeekToFirst(); } - valid_ = file_iter_->Valid(); } Slice key() const override { assert(valid_); @@ -125,9 +132,11 @@ ForwardIterator::~ForwardIterator() { } void ForwardIterator::Cleanup() { - delete mutable_iter_; + if (mutable_iter_ != nullptr) { + mutable_iter_->~Iterator(); + } for (auto* m : imm_iters_) { - delete m; + m->~Iterator(); } imm_iters_.clear(); for (auto* f : l0_iters_) { @@ -160,6 +169,8 @@ void ForwardIterator::SeekToFirst() { if (sv_ == nullptr || sv_ ->version_number != cfd_->GetSuperVersionNumber()) { RebuildIterators(); + } else if (status_.IsIncomplete()) { + ResetIncompleteIterators(); } SeekInternal(Slice(), true); } @@ -168,6 +179,8 @@ void ForwardIterator::Seek(const Slice& internal_key) { if (sv_ == nullptr || sv_ ->version_number != cfd_->GetSuperVersionNumber()) { RebuildIterators(); + } else if (status_.IsIncomplete()) { + ResetIncompleteIterators(); } SeekInternal(internal_key, false); } @@ -211,7 +224,15 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, } l0_iters_[i]->Seek(internal_key); } - if (l0_iters_[i]->Valid()) { + + if (l0_iters_[i]->status().IsIncomplete()) { + // if any of the immutable iterators is incomplete (no-io option was + // used), we are unable to reliably find the smallest key + assert(read_options_.read_tier == kBlockCacheTier); + status_ = l0_iters_[i]->status(); + valid_ = false; + return; + } else if (l0_iters_[i]->Valid()) { immutable_min_heap_.push(l0_iters_[i]); } } @@ -280,7 +301,14 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, level_iters_[level - 1]->SetFileIndex(f_idx); seek_to_first ? 
level_iters_[level - 1]->SeekToFirst() : level_iters_[level - 1]->Seek(internal_key); - if (level_iters_[level - 1]->Valid()) { + + if (level_iters_[level - 1]->status().IsIncomplete()) { + // see above + assert(read_options_.read_tier == kBlockCacheTier); + status_ = level_iters_[level - 1]->status(); + valid_ = false; + return; + } else if (level_iters_[level - 1]->Valid()) { immutable_min_heap_.push(level_iters_[level - 1]); } } @@ -304,7 +332,7 @@ void ForwardIterator::Next() { assert(valid_); if (sv_ == nullptr || - sv_ ->version_number != cfd_->GetSuperVersionNumber()) { + sv_->version_number != cfd_->GetSuperVersionNumber()) { std::string current_key = key().ToString(); Slice old_key(current_key.data(), current_key.size()); @@ -320,9 +348,17 @@ void ForwardIterator::Next() { } current_->Next(); - if (current_->Valid() && current_ != mutable_iter_) { - immutable_min_heap_.push(current_); + if (current_ != mutable_iter_) { + if (current_->status().IsIncomplete()) { + assert(read_options_.read_tier == kBlockCacheTier); + status_ = current_->status(); + valid_ = false; + return; + } else if (current_->Valid()) { + immutable_min_heap_.push(current_); + } } + UpdateCurrent(); } @@ -367,8 +403,8 @@ void ForwardIterator::RebuildIterators() { Cleanup(); // New sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_)); - mutable_iter_ = sv_->mem->NewIterator(read_options_); - sv_->imm->AddIterators(read_options_, &imm_iters_); + mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_); + sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_); const auto& l0_files = sv_->current->files_[0]; l0_iters_.reserve(l0_files.size()); for (const auto* l0 : l0_files) { @@ -389,6 +425,29 @@ void ForwardIterator::RebuildIterators() { is_prev_set_ = false; } +void ForwardIterator::ResetIncompleteIterators() { + const auto& l0_files = sv_->current->files_[0]; + for (uint32_t i = 0; i < l0_iters_.size(); ++i) { + assert(i < l0_files.size()); + if 
(!l0_iters_[i]->status().IsIncomplete()) { + continue; + } + delete l0_iters_[i]; + l0_iters_[i] = cfd_->table_cache()->NewIterator( + read_options_, *cfd_->soptions(), cfd_->internal_comparator(), + l0_files[i]->fd); + } + + for (auto* level_iter : level_iters_) { + if (level_iter && level_iter->status().IsIncomplete()) { + level_iter->Reset(); + } + } + + current_ = nullptr; + is_prev_set_ = false; +} + void ForwardIterator::UpdateCurrent() { if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) { current_ = nullptr; @@ -417,7 +476,7 @@ void ForwardIterator::UpdateCurrent() { } bool ForwardIterator::NeedToSeekImmutable(const Slice& target) { - if (!is_prev_set_) { + if (!valid_ || !is_prev_set_) { return true; } Slice prev_key = prev_key_.GetKey(); diff --git a/db/forward_iterator.h b/db/forward_iterator.h index d539ae3c704..653a0ac0cae 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -14,6 +14,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "db/dbformat.h" +#include "util/arena.h" namespace rocksdb { @@ -73,6 +74,7 @@ class ForwardIterator : public Iterator { private: void Cleanup(); void RebuildIterators(); + void ResetIncompleteIterators(); void SeekInternal(const Slice& internal_key, bool seek_to_first); void UpdateCurrent(); bool NeedToSeekImmutable(const Slice& internal_key); @@ -99,6 +101,7 @@ class ForwardIterator : public Iterator { IterKey prev_key_; bool is_prev_set_; + Arena arena_; }; } // namespace rocksdb diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 839e3d61d06..c9f9306e296 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -7,10 +7,15 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/internal_stats.h" + +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include "db/column_family.h" +#include "db/db_impl.h" namespace rocksdb { @@ -133,6 +138,8 @@ DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property, } else if (in == "estimate-table-readers-mem") { *need_out_of_mutex = true; return kEstimatedUsageByTableReaders; + } else if (in == "is-file-deletions-enabled") { + return kIsFileDeletionEnabled; } return kUnknown; } @@ -215,7 +222,7 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type, } bool InternalStats::GetIntProperty(DBPropertyType property_type, - uint64_t* value) const { + uint64_t* value, DBImpl* db) const { Version* current = cfd_->current(); switch (property_type) { @@ -254,6 +261,11 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type, cfd_->imm()->current()->GetTotalNumEntries() + current->GetEstimatedActiveKeys(); return true; +#ifndef ROCKSDB_LITE + case kIsFileDeletionEnabled: + *value = db->IsFileDeletionsEnabled(); + return true; +#endif default: return false; } diff --git a/db/internal_stats.h b/db/internal_stats.h index 3c1bc299586..2e04f24e719 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -42,6 +42,8 @@ enum DBPropertyType : uint32_t { // the immutable mem tables. kEstimatedNumKeys, // Estimated total number of keys in the database. kEstimatedUsageByTableReaders, // Estimated memory by table readers. 
+ kIsFileDeletionEnabled, // Equals disable_delete_obsolete_files_, + // 0 means file deletions enabled }; extern DBPropertyType GetPropertyType(const Slice& property, @@ -197,7 +199,8 @@ class InternalStats { bool GetStringProperty(DBPropertyType property_type, const Slice& property, std::string* value); - bool GetIntProperty(DBPropertyType property_type, uint64_t* value) const; + bool GetIntProperty(DBPropertyType property_type, uint64_t* value, + DBImpl* db) const; bool GetIntPropertyOutOfMutex(DBPropertyType property_type, Version* version, uint64_t* value) const; diff --git a/db/log_and_apply_bench.cc b/db/log_and_apply_bench.cc index a5aa950173f..3a5535d2d77 100644 --- a/db/log_and_apply_bench.cc +++ b/db/log_and_apply_bench.cc @@ -9,6 +9,7 @@ #include "util/testharness.h" #include "util/benchharness.h" #include "db/version_set.h" +#include "db/write_controller.h" #include "util/mutexlock.h" namespace rocksdb { @@ -21,6 +22,7 @@ std::string MakeKey(unsigned int num) { void BM_LogAndApply(int iters, int num_base_files) { VersionSet* vset; + WriteController wc; ColumnFamilyData* default_cfd; uint64_t fnum = 1; port::Mutex mu; @@ -47,7 +49,7 @@ void BM_LogAndApply(int iters, int num_base_files) { options.db_paths.emplace_back(dbname, 0); // The parameter of table cache is passed in as null, so any file I/O // operation is likely to fail. 
- vset = new VersionSet(dbname, &options, sopt, nullptr); + vset = new VersionSet(dbname, &options, sopt, nullptr, &wc); std::vector dummy; dummy.push_back(ColumnFamilyDescriptor()); ASSERT_OK(vset->Recover(dummy)); @@ -69,6 +71,7 @@ void BM_LogAndApply(int iters, int num_base_files) { vedit.AddFile(2, ++fnum, 0, 1 /* file size */, start, limit, 1, 1); vset->LogAndApply(default_cfd, &vedit, &mu); } + delete vset; } BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_1_file, 1000, 1) diff --git a/db/memtable.cc b/db/memtable.cc index 4ddcb37471d..bdfbc805fb0 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -31,41 +31,57 @@ namespace rocksdb { -MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) +MemTableOptions::MemTableOptions( + const MutableCFOptions& mutable_cf_options, const Options& options) + : write_buffer_size(mutable_cf_options.write_buffer_size), + arena_block_size(mutable_cf_options.arena_block_size), + memtable_prefix_bloom_bits(mutable_cf_options.memtable_prefix_bloom_bits), + memtable_prefix_bloom_probes( + mutable_cf_options.memtable_prefix_bloom_probes), + memtable_prefix_bloom_huge_page_tlb_size( + mutable_cf_options.memtable_prefix_bloom_huge_page_tlb_size), + inplace_update_support(options.inplace_update_support), + inplace_update_num_locks(options.inplace_update_num_locks), + inplace_callback(options.inplace_callback), + max_successive_merges(mutable_cf_options.max_successive_merges), + filter_deletes(mutable_cf_options.filter_deletes) {} + +MemTable::MemTable(const InternalKeyComparator& cmp, + const ImmutableCFOptions& ioptions, + const MemTableOptions& moptions) : comparator_(cmp), + ioptions_(ioptions), + moptions_(moptions), refs_(0), - kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)), - kWriteBufferSize(options.write_buffer_size), - arena_(options.arena_block_size), - table_(options.memtable_factory->CreateMemTableRep( - comparator_, &arena_, options.prefix_extractor.get(), - options.info_log.get())), + 
kArenaBlockSize(OptimizeBlockSize(moptions.arena_block_size)), + arena_(moptions.arena_block_size), + table_(ioptions.memtable_factory->CreateMemTableRep( + comparator_, &arena_, ioptions.prefix_extractor, ioptions.info_log)), num_entries_(0), flush_in_progress_(false), flush_completed_(false), file_number_(0), first_seqno_(0), mem_next_logfile_number_(0), - locks_(options.inplace_update_support ? options.inplace_update_num_locks - : 0), - prefix_extractor_(options.prefix_extractor.get()), - should_flush_(ShouldFlushNow()) { + locks_(moptions.inplace_update_support ? moptions.inplace_update_num_locks + : 0), + prefix_extractor_(ioptions.prefix_extractor), + should_flush_(ShouldFlushNow()), + flush_scheduled_(false) { // if should_flush_ == true without an entry inserted, something must have // gone wrong already. assert(!should_flush_); - if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) { + if (prefix_extractor_ && moptions.memtable_prefix_bloom_bits > 0) { prefix_bloom_.reset(new DynamicBloom( &arena_, - options.memtable_prefix_bloom_bits, options.bloom_locality, - options.memtable_prefix_bloom_probes, nullptr, - options.memtable_prefix_bloom_huge_page_tlb_size, - options.info_log.get())); + moptions.memtable_prefix_bloom_bits, ioptions.bloom_locality, + moptions.memtable_prefix_bloom_probes, nullptr, + moptions.memtable_prefix_bloom_huge_page_tlb_size, + ioptions.info_log)); } } -MemTable::~MemTable() { - assert(refs_ == 0); -} +MemTable::~MemTable() { assert(refs_ == 0); } size_t MemTable::ApproximateMemoryUsage() { size_t arena_usage = arena_.ApproximateMemoryUsage(); @@ -97,14 +113,16 @@ bool MemTable::ShouldFlushNow() const { // if we can still allocate one more block without exceeding the // over-allocation ratio, then we should not flush. 
if (allocated_memory + kArenaBlockSize < - kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) { + moptions_.write_buffer_size + + kArenaBlockSize * kAllowOverAllocationRatio) { return false; } - // if user keeps adding entries that exceeds kWriteBufferSize, we need to - // flush earlier even though we still have much available memory left. - if (allocated_memory > - kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) { + // if user keeps adding entries that exceeds moptions.write_buffer_size, + // we need to flush earlier even though we still have much available + // memory left. + if (allocated_memory > moptions_.write_buffer_size + + kArenaBlockSize * kAllowOverAllocationRatio) { return true; } @@ -174,13 +192,13 @@ const char* EncodeKey(std::string* scratch, const Slice& target) { class MemTableIterator: public Iterator { public: - MemTableIterator(const MemTable& mem, const ReadOptions& options, - bool enforce_total_order, Arena* arena) + MemTableIterator( + const MemTable& mem, const ReadOptions& read_options, Arena* arena) : bloom_(nullptr), prefix_extractor_(mem.prefix_extractor_), valid_(false), arena_mode_(arena != nullptr) { - if (prefix_extractor_ != nullptr && !enforce_total_order) { + if (prefix_extractor_ != nullptr && !read_options.total_order_seek) { bloom_ = mem.prefix_bloom_.get(); iter_ = mem.table_->GetDynamicPrefixIterator(arena); } else { @@ -248,15 +266,10 @@ class MemTableIterator: public Iterator { void operator=(const MemTableIterator&); }; -Iterator* MemTable::NewIterator(const ReadOptions& options, - bool enforce_total_order, Arena* arena) { - if (arena == nullptr) { - return new MemTableIterator(*this, options, enforce_total_order, nullptr); - } else { - auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); - return new (mem) - MemTableIterator(*this, options, enforce_total_order, arena); - } +Iterator* MemTable::NewIterator(const ReadOptions& read_options, Arena* arena) { + assert(arena != nullptr); + 
auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); + return new (mem) MemTableIterator(*this, read_options, arena); } port::RWMutex* MemTable::GetLock(const Slice& key) { @@ -417,8 +430,13 @@ static bool SaveValue(void* arg, const char* entry) { } bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, - MergeContext& merge_context, const Options& options) { - PERF_TIMER_AUTO(get_from_memtable_time); + MergeContext* merge_context) { + // The sequence number is updated synchronously in version_set.h + if (IsEmpty()) { + // Avoiding recording stats for speed. + return false; + } + PERF_TIMER_GUARD(get_from_memtable_time); Slice user_key = key.user_key(); bool found_final_value = false; @@ -436,11 +454,11 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, saver.value = value; saver.status = s; saver.mem = this; - saver.merge_context = &merge_context; - saver.merge_operator = options.merge_operator.get(); - saver.logger = options.info_log.get(); - saver.inplace_update_support = options.inplace_update_support; - saver.statistics = options.statistics.get(); + saver.merge_context = merge_context; + saver.merge_operator = ioptions_.merge_operator; + saver.logger = ioptions_.info_log; + saver.inplace_update_support = moptions_.inplace_update_support; + saver.statistics = ioptions_.statistics; table_->Get(key, &saver, SaveValue); } @@ -448,7 +466,6 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, if (!found_final_value && merge_in_progress) { *s = Status::MergeInProgress(""); } - PERF_TIMER_STOP(get_from_memtable_time); PERF_COUNTER_ADD(get_from_memtable_count, 1); return found_final_value; } @@ -513,8 +530,7 @@ void MemTable::Update(SequenceNumber seq, bool MemTable::UpdateCallback(SequenceNumber seq, const Slice& key, - const Slice& delta, - const Options& options) { + const Slice& delta) { LookupKey lkey(key, seq); Slice memkey = lkey.memtable_key(); @@ -549,8 +565,8 @@ bool 
MemTable::UpdateCallback(SequenceNumber seq, std::string str_value; WriteLock wl(GetLock(lkey.user_key())); - auto status = options.inplace_callback(prev_buffer, &new_prev_size, - delta, &str_value); + auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size, + delta, &str_value); if (status == UpdateStatus::UPDATED_INPLACE) { // Value already updated by callback. assert(new_prev_size <= prev_size); @@ -563,12 +579,12 @@ bool MemTable::UpdateCallback(SequenceNumber seq, memcpy(p, prev_buffer, new_prev_size); } } - RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED); + RecordTick(ioptions_.statistics, NUMBER_KEYS_UPDATED); should_flush_ = ShouldFlushNow(); return true; } else if (status == UpdateStatus::UPDATED) { Add(seq, kTypeValue, key, Slice(str_value)); - RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN); + RecordTick(ioptions_.statistics, NUMBER_KEYS_WRITTEN); should_flush_ = ShouldFlushNow(); return true; } else if (status == UpdateStatus::UPDATE_FAILED) { diff --git a/db/memtable.h b/db/memtable.h index 8bad2773a33..ce6cce7f6e5 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -10,14 +10,18 @@ #pragma once #include #include +#include #include +#include #include "db/dbformat.h" #include "db/skiplist.h" #include "db/version_edit.h" #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" +#include "rocksdb/immutable_options.h" #include "util/arena.h" #include "util/dynamic_bloom.h" +#include "util/mutable_cf_options.h" namespace rocksdb { @@ -26,6 +30,25 @@ class Mutex; class MemTableIterator; class MergeContext; +struct MemTableOptions { + explicit MemTableOptions( + const MutableCFOptions& mutable_cf_options, + const Options& options); + size_t write_buffer_size; + size_t arena_block_size; + uint32_t memtable_prefix_bloom_bits; + uint32_t memtable_prefix_bloom_probes; + size_t memtable_prefix_bloom_huge_page_tlb_size; + bool inplace_update_support; + size_t inplace_update_num_locks; + UpdateStatus (*inplace_callback)(char* 
existing_value, + uint32_t* existing_value_size, + Slice delta_value, + std::string* merged_value); + size_t max_successive_merges; + bool filter_deletes; +}; + class MemTable { public: struct KeyComparator : public MemTableRep::KeyComparator { @@ -40,7 +63,8 @@ class MemTable { // MemTables are reference counted. The initial reference count // is zero and the caller must call Ref() at least once. explicit MemTable(const InternalKeyComparator& comparator, - const Options& options); + const ImmutableCFOptions& ioptions, + const MemTableOptions& moptions); ~MemTable(); @@ -67,7 +91,11 @@ class MemTable { // This method heuristically determines if the memtable should continue to // host more data. - bool ShouldFlush() const { return should_flush_; } + bool ShouldScheduleFlush() const { + return flush_scheduled_ == false && should_flush_; + } + + void MarkFlushScheduled() { flush_scheduled_ = true; } // Return an iterator that yields the contents of the memtable. // @@ -81,9 +109,7 @@ class MemTable { // arena: If not null, the arena needs to be used to allocate the Iterator. // Calling ~Iterator of the iterator will destroy all the states but // those allocated in arena. - Iterator* NewIterator(const ReadOptions& options, - bool enforce_total_order = false, - Arena* arena = nullptr); + Iterator* NewIterator(const ReadOptions& read_options, Arena* arena); // Add an entry into memtable that maps key to value at the // specified sequence number and with the specified type. @@ -101,7 +127,7 @@ class MemTable { // store MergeInProgress in s, and return false. // Else, return false. 
bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext& merge_context, const Options& options); + MergeContext* merge_context); // Attempts to update the new_value inplace, else does normal Add // Pseudocode @@ -125,8 +151,7 @@ class MemTable { // else return false bool UpdateCallback(SequenceNumber seq, const Slice& key, - const Slice& delta, - const Options& options); + const Slice& delta); // Returns the number of successive merge entries starting from the newest // entry for the key up to the last non-merge entry or last entry for the @@ -139,6 +164,9 @@ class MemTable { // Returns the edits area that is needed for flushing the memtable VersionEdit* GetEdits() { return &edit_; } + // Returns if there is no entry inserted to the mem table. + bool IsEmpty() const { return first_seqno_ == 0; } + // Returns the sequence number of the first element that was inserted // into the memtable SequenceNumber GetFirstSequenceNumber() { return first_seqno_; } @@ -171,8 +199,11 @@ class MemTable { const Arena& TEST_GetArena() const { return arena_; } + const ImmutableCFOptions* GetImmutableOptions() const { return &ioptions_; } + const MemTableOptions* GetMemTableOptions() const { return &moptions_; } + private: - // Dynamically check if we can add more incoming entries. 
+ // Dynamically check if we can add more incoming entries bool ShouldFlushNow() const; friend class MemTableIterator; @@ -180,9 +211,10 @@ class MemTable { friend class MemTableList; KeyComparator comparator_; + const ImmutableCFOptions& ioptions_; + const MemTableOptions moptions_; int refs_; const size_t kArenaBlockSize; - const size_t kWriteBufferSize; Arena arena_; unique_ptr table_; @@ -215,6 +247,9 @@ class MemTable { // a flag indicating if a memtable has met the criteria to flush bool should_flush_; + + // a flag indicating if flush has been scheduled + bool flush_scheduled_; }; extern const char* EncodeKey(std::string* scratch, const Slice& target); diff --git a/db/memtable_list.cc b/db/memtable_list.cc index d3fc1356b27..728b1c0a095 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -62,10 +62,9 @@ int MemTableList::size() const { // Return the most recent value found, if any. // Operands stores the list of merge operations to apply, so far. bool MemTableListVersion::Get(const LookupKey& key, std::string* value, - Status* s, MergeContext& merge_context, - const Options& options) { + Status* s, MergeContext* merge_context) { for (auto& memtable : memlist_) { - if (memtable->Get(key, value, s, merge_context, options)) { + if (memtable->Get(key, value, s, merge_context)) { return true; } } @@ -73,9 +72,10 @@ bool MemTableListVersion::Get(const LookupKey& key, std::string* value, } void MemTableListVersion::AddIterators(const ReadOptions& options, - std::vector* iterator_list) { + std::vector* iterator_list, + Arena* arena) { for (auto& m : memlist_) { - iterator_list->push_back(m->NewIterator(options)); + iterator_list->push_back(m->NewIterator(options, arena)); } } diff --git a/db/memtable_list.h b/db/memtable_list.h index f4923e831a4..92688825a20 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -46,10 +46,10 @@ class MemTableListVersion { // Search all the memtables starting from the most recent one. 
// Return the most recent value found, if any. bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext& merge_context, const Options& options); + MergeContext* merge_context); void AddIterators(const ReadOptions& options, - std::vector* iterator_list); + std::vector* iterator_list, Arena* arena); void AddIterators(const ReadOptions& options, MergeIteratorBuilder* merge_iter_builder); diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index bb0f96f1584..1750d265c29 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -192,16 +192,17 @@ extern const uint64_t kPlainTableMagicNumber; class TestPlainTableReader : public PlainTableReader { public: - TestPlainTableReader(const EnvOptions& storage_options, + TestPlainTableReader(const EnvOptions& env_options, const InternalKeyComparator& icomparator, EncodingType encoding_type, uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, const TableProperties* table_properties, unique_ptr&& file, - const Options& options, bool* expect_bloom_not_match, + const ImmutableCFOptions& ioptions, + bool* expect_bloom_not_match, bool store_index_in_file) - : PlainTableReader(options, std::move(file), storage_options, icomparator, + : PlainTableReader(ioptions, std::move(file), env_options, icomparator, encoding_type, file_size, table_properties), expect_bloom_not_match_(expect_bloom_not_match) { Status s = MmapDataFile(); @@ -218,7 +219,7 @@ class TestPlainTableReader : public PlainTableReader { PlainTablePropertyNames::kBloomVersion); ASSERT_TRUE(bloom_version_ptr != props->user_collected_properties.end()); ASSERT_EQ(bloom_version_ptr->second, std::string("1")); - if (options.bloom_locality > 0) { + if (ioptions.bloom_locality > 0) { auto num_blocks_ptr = props->user_collected_properties.find( PlainTablePropertyNames::kNumBloomBlocks); ASSERT_TRUE(num_blocks_ptr != props->user_collected_properties.end()); @@ -253,25 +254,26 @@ class 
TestPlainTableFactory : public PlainTableFactory { store_index_in_file_(options.store_index_in_file), expect_bloom_not_match_(expect_bloom_not_match) {} - Status NewTableReader(const Options& options, const EnvOptions& soptions, + Status NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const override { TableProperties* props = nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - options.env, options.info_log.get(), &props); + ioptions.env, ioptions.info_log, &props); ASSERT_TRUE(s.ok()); if (store_index_in_file_) { BlockHandle bloom_block_handle; s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber, - options.env, BloomBlockBuilder::kBloomBlock, + ioptions.env, BloomBlockBuilder::kBloomBlock, &bloom_block_handle); ASSERT_TRUE(s.ok()); BlockHandle index_block_handle; s = FindMetaBlock( - file.get(), file_size, kPlainTableMagicNumber, options.env, + file.get(), file_size, kPlainTableMagicNumber, ioptions.env, PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle); ASSERT_TRUE(s.ok()); } @@ -284,9 +286,9 @@ class TestPlainTableFactory : public PlainTableFactory { DecodeFixed32(encoding_type_prop->second.c_str())); std::unique_ptr new_reader(new TestPlainTableReader( - soptions, internal_comparator, encoding_type, file_size, + env_options, internal_comparator, encoding_type, file_size, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props, - std::move(file), options, expect_bloom_not_match_, + std::move(file), ioptions, expect_bloom_not_match_, store_index_in_file_)); *table = std::move(new_reader); diff --git a/db/repair.cc b/db/repair.cc index 66ca946a62a..2773d4c71cb 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -31,7 +31,10 @@ #ifndef ROCKSDB_LITE +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include "db/builder.h" 
#include "db/db_impl.h" @@ -46,6 +49,9 @@ #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" +#include "util/scoped_arena_iterator.h" namespace rocksdb { @@ -57,8 +63,8 @@ class Repairer { : dbname_(dbname), env_(options.env), icmp_(options.comparator), - ipolicy_(options.filter_policy), - options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)), + options_(SanitizeOptions(dbname, &icmp_, options)), + ioptions_(options_), raw_table_cache_( // TableCache can be small since we expect each table to be opened // once. @@ -66,7 +72,7 @@ class Repairer { options_.table_cache_remove_scan_count_limit)), next_file_number_(1) { table_cache_ = - new TableCache(&options_, storage_options_, raw_table_cache_.get()); + new TableCache(ioptions_, env_options_, raw_table_cache_.get()); edit_ = new VersionEdit(); } @@ -108,9 +114,9 @@ class Repairer { std::string const dbname_; Env* const env_; - InternalKeyComparator const icmp_; - InternalFilterPolicy const ipolicy_; - Options const options_; + const InternalKeyComparator icmp_; + const Options options_; + const ImmutableCFOptions ioptions_; std::shared_ptr raw_table_cache_; TableCache* table_cache_; VersionEdit* edit_; @@ -120,7 +126,7 @@ class Repairer { std::vector logs_; std::vector tables_; uint64_t next_file_number_; - const EnvOptions storage_options_; + const EnvOptions env_options_; Status FindFiles() { std::vector filenames; @@ -192,7 +198,7 @@ class Repairer { // Open the log file std::string logname = LogFileName(dbname_, log); unique_ptr lfile; - Status status = env_->NewSequentialFile(logname, &lfile, storage_options_); + Status status = env_->NewSequentialFile(logname, &lfile, env_options_); if (!status.ok()) { return status; } @@ -213,7 +219,8 @@ class Repairer { std::string scratch; Slice record; WriteBatch batch; - MemTable* mem = new MemTable(icmp_, options_); + MemTable* mem = new MemTable(icmp_, ioptions_, + 
MemTableOptions(MutableCFOptions(options_), options_)); auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_); mem->Ref(); int counter = 0; @@ -238,11 +245,15 @@ class Repairer { // since ExtractMetaData() will also generate edits. FileMetaData meta; meta.fd = FileDescriptor(next_file_number_++, 0, 0); - ReadOptions ro; - Iterator* iter = mem->NewIterator(ro, true /* enforce_total_order */); - status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_, - iter, &meta, icmp_, 0, 0, kNoCompression); - delete iter; + { + ReadOptions ro; + ro.total_order_seek = true; + Arena arena; + ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); + status = BuildTable(dbname_, env_, ioptions_, env_options_, table_cache_, + iter.get(), &meta, icmp_, 0, 0, kNoCompression, + CompressionOptions()); + } delete mem->Unref(); delete cf_mems_default; mem = nullptr; @@ -287,7 +298,7 @@ class Repairer { file_size); if (status.ok()) { Iterator* iter = table_cache_->NewIterator( - ReadOptions(), storage_options_, icmp_, t->meta.fd); + ReadOptions(), env_options_, icmp_, t->meta.fd); bool empty = true; ParsedInternalKey parsed; t->min_sequence = 0; @@ -327,7 +338,7 @@ class Repairer { std::string tmp = TempFileName(dbname_, 1); unique_ptr file; Status status = env_->NewWritableFile( - tmp, &file, env_->OptimizeForManifestWrite(storage_options_)); + tmp, &file, env_->OptimizeForManifestWrite(env_options_)); if (!status.ok()) { return status; } diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc index 5c140cd40ed..0a0ecf06485 100644 --- a/db/simple_table_db_test.cc +++ b/db/simple_table_db_test.cc @@ -79,7 +79,8 @@ class SimpleTableReader: public TableReader { // for the duration of the returned table's lifetime. // // *file must remain live while this Table is in use. 
- static Status Open(const Options& options, const EnvOptions& soptions, + static Status Open(const ImmutableCFOptions& options, + const EnvOptions& env_options, unique_ptr && file, uint64_t file_size, unique_ptr* table_reader); @@ -160,14 +161,14 @@ class SimpleTableIterator: public Iterator { struct SimpleTableReader::Rep { ~Rep() { } - Rep(const EnvOptions& storage_options, uint64_t index_start_offset, - int num_entries) : - soptions(storage_options), index_start_offset(index_start_offset), - num_entries(num_entries) { + Rep(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + uint64_t index_start_offset, int num_entries) : + ioptions(ioptions), env_options(env_options), + index_start_offset(index_start_offset), num_entries(num_entries) { } - Options options; - const EnvOptions& soptions; + const ImmutableCFOptions& ioptions; + const EnvOptions& env_options; Status status; unique_ptr file; uint64_t index_start_offset; @@ -187,8 +188,8 @@ SimpleTableReader::~SimpleTableReader() { delete rep_; } -Status SimpleTableReader::Open(const Options& options, - const EnvOptions& soptions, +Status SimpleTableReader::Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, unique_ptr && file, uint64_t size, unique_ptr* table_reader) { @@ -201,12 +202,10 @@ Status SimpleTableReader::Open(const Options& options, int num_entries = (size - Rep::offset_length - index_start_offset) / (Rep::GetInternalKeyLength() + Rep::offset_length); - SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions, - index_start_offset, - num_entries); + SimpleTableReader::Rep* rep = new SimpleTableReader::Rep( + ioptions, env_options, index_start_offset, num_entries); rep->file = std::move(file); - rep->options = options; table_reader->reset(new SimpleTableReader(rep)); } return s; @@ -248,7 +247,7 @@ Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) { return s; } - InternalKeyComparator ikc(rep_->options.comparator); + 
InternalKeyComparator ikc(rep_->ioptions.comparator); int compare_result = ikc.Compare(tmp_slice, target); if (compare_result < 0) { @@ -382,7 +381,7 @@ void SimpleTableIterator::Prev() { } Slice SimpleTableIterator::key() const { - Log(table_->rep_->options.info_log, "key!!!!"); + Log(table_->rep_->ioptions.info_log, "key!!!!"); return key_; } @@ -401,7 +400,7 @@ class SimpleTableBuilder: public TableBuilder { // caller to close the file after calling Finish(). The output file // will be part of level specified by 'level'. A value of -1 means // that the caller does not know which level the output file will reside. - SimpleTableBuilder(const Options& options, WritableFile* file, + SimpleTableBuilder(const ImmutableCFOptions& ioptions, WritableFile* file, CompressionType compression_type); // REQUIRES: Either Finish() or Abandon() has been called. @@ -444,7 +443,7 @@ class SimpleTableBuilder: public TableBuilder { }; struct SimpleTableBuilder::Rep { - Options options; + const ImmutableCFOptions& ioptions; WritableFile* file; uint64_t offset = 0; Status status; @@ -463,17 +462,17 @@ struct SimpleTableBuilder::Rep { std::string index; - Rep(const Options& opt, WritableFile* f) : - options(opt), file(f) { + Rep(const ImmutableCFOptions& iopt, WritableFile* f) : + ioptions(iopt), file(f) { } ~Rep() { } }; -SimpleTableBuilder::SimpleTableBuilder(const Options& options, +SimpleTableBuilder::SimpleTableBuilder(const ImmutableCFOptions& ioptions, WritableFile* file, CompressionType compression_type) : - rep_(new SimpleTableBuilder::Rep(options, file)) { + rep_(new SimpleTableBuilder::Rep(ioptions, file)) { } SimpleTableBuilder::~SimpleTableBuilder() { @@ -546,31 +545,45 @@ class SimpleTableFactory: public TableFactory { const char* Name() const override { return "SimpleTable"; } - Status NewTableReader(const Options& options, const EnvOptions& soptions, + Status NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& 
internal_key, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const; - TableBuilder* NewTableBuilder(const Options& options, - const InternalKeyComparator& internal_key, - WritableFile* file, - CompressionType compression_type) const; + TableBuilder* NewTableBuilder( + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_key, + WritableFile* file, + const CompressionType compression_type, + const CompressionOptions& compression_opts) const; + + virtual Status SanitizeDBOptions(const DBOptions* db_opts) const override { + return Status::OK(); + } + + virtual std::string GetPrintableTableOptions() const override { + return std::string(); + } }; Status SimpleTableFactory::NewTableReader( - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_key, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { - return SimpleTableReader::Open(options, soptions, std::move(file), file_size, - table_reader); + return SimpleTableReader::Open(ioptions, env_options, std::move(file), + file_size, table_reader); } TableBuilder* SimpleTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_key, - WritableFile* file, CompressionType compression_type) const { - return new SimpleTableBuilder(options, file, compression_type); + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_key, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts) const { + return new SimpleTableBuilder(ioptions, file, compression_type); } class SimpleTableDBTest { diff --git a/db/snapshot.h b/db/snapshot.h index 2c2e3eac803..51fa556c825 100644 --- a/db/snapshot.h +++ b/db/snapshot.h @@ -71,7 +71,7 @@ class SnapshotList { } // get the sequence number of the most recent snapshot - const SequenceNumber GetNewest() { + SequenceNumber 
GetNewest() { if (empty()) { return 0; } diff --git a/db/table_cache.cc b/db/table_cache.cc index c362499a685..5cb96f8bf52 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -36,12 +36,10 @@ static Slice GetSliceForFileNumber(const uint64_t* file_number) { sizeof(*file_number)); } -TableCache::TableCache(const Options* options, - const EnvOptions& storage_options, Cache* const cache) - : env_(options->env), - db_paths_(options->db_paths), - options_(options), - storage_options_(storage_options), +TableCache::TableCache(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, Cache* const cache) + : ioptions_(ioptions), + env_options_(env_options), cache_(cache) {} TableCache::~TableCache() { @@ -55,7 +53,7 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) { cache_->Release(handle); } -Status TableCache::FindTable(const EnvOptions& toptions, +Status TableCache::FindTable(const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, Cache::Handle** handle, const bool no_io) { @@ -68,24 +66,24 @@ Status TableCache::FindTable(const EnvOptions& toptions, return Status::Incomplete("Table not found in table_cache, no_io is set"); } std::string fname = - TableFileName(db_paths_, fd.GetNumber(), fd.GetPathId()); + TableFileName(ioptions_.db_paths, fd.GetNumber(), fd.GetPathId()); unique_ptr file; unique_ptr table_reader; - s = env_->NewRandomAccessFile(fname, &file, toptions); - RecordTick(options_->statistics.get(), NO_FILE_OPENS); + s = ioptions_.env->NewRandomAccessFile(fname, &file, env_options); + RecordTick(ioptions_.statistics, NO_FILE_OPENS); if (s.ok()) { - if (options_->advise_random_on_open) { + if (ioptions_.advise_random_on_open) { file->Hint(RandomAccessFile::RANDOM); } - StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS); - s = options_->table_factory->NewTableReader( - *options_, toptions, internal_comparator, std::move(file), + StopWatch sw(ioptions_.env, 
ioptions_.statistics, TABLE_OPEN_IO_MICROS); + s = ioptions_.table_factory->NewTableReader( + ioptions_, env_options, internal_comparator, std::move(file), fd.GetFileSize(), &table_reader); } if (!s.ok()) { assert(table_reader == nullptr); - RecordTick(options_->statistics.get(), NO_FILE_ERRORS); + RecordTick(ioptions_.statistics, NO_FILE_ERRORS); // We do not cache error results so that if the error is transient, // or somebody repairs the file, we recover automatically. } else { @@ -97,7 +95,7 @@ Status TableCache::FindTable(const EnvOptions& toptions, } Iterator* TableCache::NewIterator(const ReadOptions& options, - const EnvOptions& toptions, + const EnvOptions& env_options, const InternalKeyComparator& icomparator, const FileDescriptor& fd, TableReader** table_reader_ptr, @@ -109,7 +107,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, Cache::Handle* handle = nullptr; Status s; if (table_reader == nullptr) { - s = FindTable(toptions, icomparator, fd, &handle, + s = FindTable(env_options, icomparator, fd, &handle, options.read_tier == kBlockCacheTier); if (!s.ok()) { return NewErrorIterator(s, arena); @@ -142,7 +140,7 @@ Status TableCache::Get(const ReadOptions& options, Status s; Cache::Handle* handle = nullptr; if (!t) { - s = FindTable(storage_options_, internal_comparator, fd, &handle, + s = FindTable(env_options_, internal_comparator, fd, &handle, options.read_tier == kBlockCacheTier); if (s.ok()) { t = GetTableReaderFromHandle(handle); @@ -160,8 +158,9 @@ Status TableCache::Get(const ReadOptions& options, } return s; } + Status TableCache::GetTableProperties( - const EnvOptions& toptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, std::shared_ptr* properties, bool no_io) { Status s; @@ -174,7 +173,7 @@ Status TableCache::GetTableProperties( } Cache::Handle* table_handle = nullptr; - s = FindTable(toptions, internal_comparator, fd, &table_handle, no_io); + s = 
FindTable(env_options, internal_comparator, fd, &table_handle, no_io); if (!s.ok()) { return s; } @@ -186,7 +185,7 @@ Status TableCache::GetTableProperties( } size_t TableCache::GetMemoryUsageByTableReader( - const EnvOptions& toptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd) { Status s; @@ -197,7 +196,7 @@ size_t TableCache::GetMemoryUsageByTableReader( } Cache::Handle* table_handle = nullptr; - s = FindTable(toptions, internal_comparator, fd, &table_handle, true); + s = FindTable(env_options, internal_comparator, fd, &table_handle, true); if (!s.ok()) { return 0; } diff --git a/db/table_cache.h b/db/table_cache.h index 79090e0649b..2f6740d9f11 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -19,6 +19,7 @@ #include "rocksdb/cache.h" #include "rocksdb/env.h" #include "rocksdb/table.h" +#include "rocksdb/options.h" #include "table/table_reader.h" namespace rocksdb { @@ -29,8 +30,8 @@ struct FileDescriptor; class TableCache { public: - TableCache(const Options* options, const EnvOptions& storage_options, - Cache* cache); + TableCache(const ImmutableCFOptions& ioptions, + const EnvOptions& storage_options, Cache* cache); ~TableCache(); // Return an iterator for the specified file number (the corresponding @@ -91,10 +92,8 @@ class TableCache { void ReleaseHandle(Cache::Handle* handle); private: - Env* const env_; - const std::vector db_paths_; - const Options* options_; - const EnvOptions& storage_options_; + const ImmutableCFOptions& ioptions_; + const EnvOptions& env_options_; Cache* const cache_; }; diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index f4055d4bab4..74abf867098 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -11,6 +11,7 @@ #include "db/dbformat.h" #include "db/table_properties_collector.h" #include "rocksdb/table.h" +#include "rocksdb/immutable_options.h" #include 
"table/block_based_table_factory.h" #include "table/meta_blocks.h" #include "table/plain_table_factory.h" @@ -85,12 +86,14 @@ class DumbLogger : public Logger { // Utilities test functions namespace { void MakeBuilder(const Options& options, + const ImmutableCFOptions& ioptions, const InternalKeyComparator& internal_comparator, std::unique_ptr* writable, std::unique_ptr* builder) { writable->reset(new FakeWritableFile); - builder->reset(options.table_factory->NewTableBuilder( - options, internal_comparator, writable->get(), options.compression)); + builder->reset(ioptions.table_factory->NewTableBuilder( + ioptions, internal_comparator, writable->get(), + options.compression, options.compression_opts)); } } // namespace @@ -153,7 +156,8 @@ void TestCustomizedTablePropertiesCollector( // -- Step 1: build table std::unique_ptr builder; std::unique_ptr writable; - MakeBuilder(options, internal_comparator, &writable, &builder); + const ImmutableCFOptions ioptions(options); + MakeBuilder(options, ioptions, internal_comparator, &writable, &builder); for (const auto& kv : kvs) { if (encode_as_internal) { @@ -257,16 +261,17 @@ void TestInternalKeyPropertiesCollector( // SanitizeOptions(). options.info_log = std::make_shared(); options = SanitizeOptions("db", // just a place holder - &pikc, nullptr, // don't care filter policy + &pikc, options); options.comparator = comparator; } else { options.table_properties_collector_factories = { std::make_shared()}; } + const ImmutableCFOptions ioptions(options); for (int iter = 0; iter < 2; ++iter) { - MakeBuilder(options, pikc, &writable, &builder); + MakeBuilder(options, ioptions, pikc, &writable, &builder); for (const auto& k : keys) { builder->Add(k.Encode(), "val"); } diff --git a/db/version_edit.h b/db/version_edit.h index 58edfed4515..db133402c95 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -163,13 +163,13 @@ class VersionEdit { // Add the specified file at the specified number. 
// REQUIRES: This version has not been saved (see VersionSet::SaveTo) // REQUIRES: "smallest" and "largest" are smallest and largest keys in file - void AddFile(int level, uint64_t file, uint64_t file_size, - uint64_t file_path_id, const InternalKey& smallest, + void AddFile(int level, uint64_t file, uint64_t file_path_id, + uint64_t file_size, const InternalKey& smallest, const InternalKey& largest, const SequenceNumber& smallest_seqno, const SequenceNumber& largest_seqno) { assert(smallest_seqno <= largest_seqno); FileMetaData f; - f.fd = FileDescriptor(file, file_size, file_path_id); + f.fd = FileDescriptor(file, file_path_id, file_size); f.smallest = smallest; f.largest = largest; f.smallest_seqno = smallest_seqno; diff --git a/db/version_set.cc b/db/version_set.cc index 3a15458532e..7edfaa788e0 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -9,7 +9,10 @@ #include "db/version_set.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include @@ -509,9 +512,9 @@ Status Version::GetTableProperties(std::shared_ptr* tp, const FileMetaData* file_meta, const std::string* fname) { auto table_cache = cfd_->table_cache(); - auto options = cfd_->options(); + auto ioptions = cfd_->ioptions(); Status s = table_cache->GetTableProperties( - vset_->storage_options_, cfd_->internal_comparator(), file_meta->fd, + vset_->env_options_, cfd_->internal_comparator(), file_meta->fd, tp, true /* no io */); if (s.ok()) { return s; @@ -527,13 +530,13 @@ Status Version::GetTableProperties(std::shared_ptr* tp, // directly from the properties block in the file. 
std::unique_ptr file; if (fname != nullptr) { - s = options->env->NewRandomAccessFile( - *fname, &file, vset_->storage_options_); + s = ioptions->env->NewRandomAccessFile( + *fname, &file, vset_->env_options_); } else { - s = options->env->NewRandomAccessFile( - TableFileName(vset_->options_->db_paths, file_meta->fd.GetNumber(), + s = ioptions->env->NewRandomAccessFile( + TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(), file_meta->fd.GetPathId()), - &file, vset_->storage_options_); + &file, vset_->env_options_); } if (!s.ok()) { return s; @@ -545,11 +548,11 @@ Status Version::GetTableProperties(std::shared_ptr* tp, s = ReadTableProperties( file.get(), file_meta->fd.GetFileSize(), Footer::kInvalidTableMagicNumber /* table's magic number */, - vset_->env_, options->info_log.get(), &raw_table_properties); + vset_->env_, ioptions->info_log, &raw_table_properties); if (!s.ok()) { return s; } - RecordTick(options->statistics.get(), NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); + RecordTick(ioptions->statistics, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); *tp = std::shared_ptr(raw_table_properties); return s; @@ -559,7 +562,7 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { for (int level = 0; level < num_levels_; level++) { for (const auto& file_meta : files_[level]) { auto fname = - TableFileName(vset_->options_->db_paths, file_meta->fd.GetNumber(), + TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(), file_meta->fd.GetPathId()); // 1. If the table is already present in table cache, load table // properties from there. 
@@ -581,7 +584,7 @@ size_t Version::GetMemoryUsageByTableReaders() { for (auto& file_level : file_levels_) { for (size_t i = 0; i < file_level.num_files; i++) { total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader( - vset_->storage_options_, cfd_->internal_comparator(), + vset_->env_options_, cfd_->internal_comparator(), file_level.files[i].fd); } } @@ -596,31 +599,6 @@ uint64_t Version::GetEstimatedActiveKeys() { return num_non_deletions_ - num_deletions_; } -void Version::AddIterators(const ReadOptions& read_options, - const EnvOptions& soptions, - std::vector* iters) { - // Merge all level zero files together since they may overlap - for (size_t i = 0; i < file_levels_[0].num_files; i++) { - const auto& file = file_levels_[0].files[i]; - iters->push_back(cfd_->table_cache()->NewIterator( - read_options, soptions, cfd_->internal_comparator(), file.fd)); - } - - // For levels > 0, we can use a concatenating iterator that sequentially - // walks through the non-overlapping files in the level, opening them - // lazily. 
- for (int level = 1; level < num_levels_; level++) { - if (file_levels_[level].num_files != 0) { - iters->push_back(NewTwoLevelIterator(new LevelFileIteratorState( - cfd_->table_cache(), read_options, soptions, - cfd_->internal_comparator(), false /* for_compaction */, - cfd_->options()->prefix_extractor != nullptr), - new LevelFileNumIterator(cfd_->internal_comparator(), - &file_levels_[level]))); - } - } -} - void Version::AddIterators(const ReadOptions& read_options, const EnvOptions& soptions, MergeIteratorBuilder* merge_iter_builder) { @@ -641,7 +619,7 @@ void Version::AddIterators(const ReadOptions& read_options, new LevelFileIteratorState( cfd_->table_cache(), read_options, soptions, cfd_->internal_comparator(), false /* for_compaction */, - cfd_->options()->prefix_extractor != nullptr), + cfd_->ioptions()->prefix_extractor != nullptr), new LevelFileNumIterator(cfd_->internal_comparator(), &file_levels_[level]), merge_iter_builder->GetArena())); } @@ -757,10 +735,10 @@ Version::Version(ColumnFamilyData* cfd, VersionSet* vset, (cfd == nullptr) ? nullptr : internal_comparator_->user_comparator()), table_cache_((cfd == nullptr) ? nullptr : cfd->table_cache()), merge_operator_((cfd == nullptr) ? nullptr - : cfd->options()->merge_operator.get()), - info_log_((cfd == nullptr) ? nullptr : cfd->options()->info_log.get()), + : cfd->ioptions()->merge_operator), + info_log_((cfd == nullptr) ? nullptr : cfd->ioptions()->info_log), db_statistics_((cfd == nullptr) ? nullptr - : cfd->options()->statistics.get()), + : cfd->ioptions()->statistics), // cfd is nullptr if Version is dummy num_levels_(cfd == nullptr ? 
0 : cfd->NumberLevels()), num_non_empty_levels_(num_levels_), @@ -886,7 +864,7 @@ bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { Status s = GetTableProperties(&tp, file_meta); file_meta->init_stats_from_file = true; if (!s.ok()) { - Log(vset_->options_->info_log, + Log(vset_->db_options_->info_log, "Unable to load table properties for file %" PRIu64 " --- %s\n", file_meta->fd.GetNumber(), s.ToString().c_str()); return false; @@ -969,7 +947,7 @@ void Version::ComputeCompactionScore( numfiles++; } } - if (cfd_->options()->compaction_style == kCompactionStyleFIFO) { + if (cfd_->ioptions()->compaction_style == kCompactionStyleFIFO) { score = static_cast(total_size) / cfd_->options()->compaction_options_fifo.max_table_files_size; } else if (numfiles >= cfd_->options()->level0_stop_writes_trigger) { @@ -1038,8 +1016,8 @@ void Version::UpdateNumNonEmptyLevels() { } void Version::UpdateFilesBySize() { - if (cfd_->options()->compaction_style == kCompactionStyleFIFO || - cfd_->options()->compaction_style == kCompactionStyleUniversal) { + if (cfd_->ioptions()->compaction_style == kCompactionStyleFIFO || + cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) { // don't need this return; } @@ -1699,7 +1677,7 @@ class VersionSet::Builder { for (auto& file_meta : *(levels_[level].added_files)) { assert (!file_meta->table_reader_handle); cfd_->table_cache()->FindTable( - base_->vset_->storage_options_, cfd_->internal_comparator(), + base_->vset_->env_options_, cfd_->internal_comparator(), file_meta->fd, &file_meta->table_reader_handle, false); if (file_meta->table_reader_handle != nullptr) { // Load table_reader @@ -1727,13 +1705,14 @@ class VersionSet::Builder { } }; -VersionSet::VersionSet(const std::string& dbname, const DBOptions* options, - const EnvOptions& storage_options, Cache* table_cache) - : column_family_set_(new ColumnFamilySet(dbname, options, storage_options, - table_cache)), - env_(options->env), +VersionSet::VersionSet(const 
std::string& dbname, const DBOptions* db_options, + const EnvOptions& env_options, Cache* table_cache, + WriteController* write_controller) + : column_family_set_(new ColumnFamilySet(dbname, db_options, env_options, + table_cache, write_controller)), + env_(db_options->env), dbname_(dbname), - options_(options), + db_options_(db_options), next_file_number_(2), manifest_file_number_(0), // Filled by Recover() pending_manifest_file_number_(0), @@ -1741,8 +1720,8 @@ VersionSet::VersionSet(const std::string& dbname, const DBOptions* options, prev_log_number_(0), current_version_number_(0), manifest_file_size_(0), - storage_options_(storage_options), - storage_options_compactions_(storage_options_) {} + env_options_(env_options), + env_options_compactions_(env_options_) {} VersionSet::~VersionSet() { // we need to delete column_family_set_ because its destructor depends on @@ -1844,7 +1823,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, assert(pending_manifest_file_number_ == 0); if (!descriptor_log_ || - manifest_file_size_ > options_->max_manifest_file_size) { + manifest_file_size_ > db_options_->max_manifest_file_size) { pending_manifest_file_number_ = NewFileNumber(); batch_edits.back()->SetNextFile(next_file_number_); new_descriptor_log = true; @@ -1872,7 +1851,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, mu->Unlock(); - if (!edit->IsColumnFamilyManipulation() && options_->max_open_files == -1) { + if (!edit->IsColumnFamilyManipulation() && + db_options_->max_open_files == -1) { // unlimited table cache. Pre-load table handle now. // Need to do it out of the mutex. 
builder->LoadTableHandlers(); @@ -1882,15 +1862,15 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, // only one thread can be here at the same time if (new_descriptor_log) { // create manifest file - Log(options_->info_log, + Log(db_options_->info_log, "Creating manifest %" PRIu64 "\n", pending_manifest_file_number_); unique_ptr descriptor_file; s = env_->NewWritableFile( DescriptorFileName(dbname_, pending_manifest_file_number_), - &descriptor_file, env_->OptimizeForManifestWrite(storage_options_)); + &descriptor_file, env_->OptimizeForManifestWrite(env_options_)); if (s.ok()) { descriptor_file->SetPreallocationBlockSize( - options_->manifest_preallocation_size); + db_options_->manifest_preallocation_size); descriptor_log_.reset(new log::Writer(std::move(descriptor_file))); s = WriteSnapshot(descriptor_log_.get()); } @@ -1911,19 +1891,20 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, break; } } - if (s.ok()) { - if (options_->use_fsync) { - StopWatch sw(env_, options_->statistics.get(), + if (s.ok() && db_options_->disableDataSync == false) { + if (db_options_->use_fsync) { + StopWatch sw(env_, db_options_->statistics.get(), MANIFEST_FILE_SYNC_MICROS); s = descriptor_log_->file()->Fsync(); } else { - StopWatch sw(env_, options_->statistics.get(), + StopWatch sw(env_, db_options_->statistics.get(), MANIFEST_FILE_SYNC_MICROS); s = descriptor_log_->file()->Sync(); } } if (!s.ok()) { - Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str()); + Log(db_options_->info_log, "MANIFEST write: %s\n", + s.ToString().c_str()); bool all_records_in = true; for (auto& e : batch_edits) { std::string record; @@ -1934,7 +1915,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, } } if (all_records_in) { - Log(options_->info_log, + Log(db_options_->info_log, "MANIFEST contains log record despite error; advancing to new " "version to prevent mismatch between in-memory and logged state" " If paranoid is 
set, then the db is now in readonly mode."); @@ -1947,10 +1928,10 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, // new CURRENT file that points to it. if (s.ok() && new_descriptor_log) { s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_, - db_directory); + db_options_->disableDataSync ? nullptr : db_directory); if (s.ok() && pending_manifest_file_number_ > manifest_file_number_) { // delete old manifest file - Log(options_->info_log, + Log(db_options_->info_log, "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", manifest_file_number_, pending_manifest_file_number_); // we don't care about an error here, PurgeObsoleteFiles will take care @@ -1964,7 +1945,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, new_manifest_file_size = descriptor_log_->file()->GetFileSize(); } - LogFlush(options_->info_log); + LogFlush(db_options_->info_log); mu->Lock(); } @@ -2000,12 +1981,12 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, manifest_file_size_ = new_manifest_file_size; prev_log_number_ = edit->prev_log_number_; } else { - Log(options_->info_log, "Error in committing version %lu to [%s]", + Log(db_options_->info_log, "Error in committing version %lu to [%s]", (unsigned long)v->GetVersionNumber(), column_family_data->GetName().c_str()); delete v; if (new_descriptor_log) { - Log(options_->info_log, + Log(db_options_->info_log, "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", manifest_file_number_, pending_manifest_file_number_); descriptor_log_.reset(); @@ -2097,13 +2078,13 @@ Status VersionSet::Recover( return Status::Corruption("CURRENT file corrupted"); } - Log(options_->info_log, "Recovering from manifest file: %s\n", + Log(db_options_->info_log, "Recovering from manifest file: %s\n", manifest_filename.c_str()); manifest_filename = dbname_ + "/" + manifest_filename; unique_ptr manifest_file; s = env_->NewSequentialFile(manifest_filename, &manifest_file, - 
storage_options_); + env_options_); if (!s.ok()) { return s; } @@ -2230,7 +2211,7 @@ Status VersionSet::Recover( if (cfd != nullptr) { if (edit.has_log_number_) { if (cfd->GetLogNumber() > edit.log_number_) { - Log(options_->info_log, + Log(db_options_->info_log, "MANIFEST corruption detected, but ignored - Log numbers in " "records NOT monotonically increasing"); } else { @@ -2306,7 +2287,7 @@ Status VersionSet::Recover( assert(builders_iter != builders.end()); auto builder = builders_iter->second; - if (options_->max_open_files == -1) { + if (db_options_->max_open_files == -1) { // unlimited table cache. Pre-load table handle now. // Need to do it out of the mutex. builder->LoadTableHandlers(); @@ -2327,7 +2308,7 @@ Status VersionSet::Recover( last_sequence_ = last_sequence; prev_log_number_ = prev_log_number; - Log(options_->info_log, + Log(db_options_->info_log, "Recovered from manifest file:%s succeeded," "manifest_file_number is %lu, next_file_number is %lu, " "last_sequence is %lu, log_number is %lu," @@ -2339,7 +2320,7 @@ Status VersionSet::Recover( column_family_set_->GetMaxColumnFamily()); for (auto cfd : *column_family_set_) { - Log(options_->info_log, + Log(db_options_->info_log, "Column family [%s] (ID %u), log number is %" PRIu64 "\n", cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); } @@ -2422,7 +2403,7 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, #ifndef ROCKSDB_LITE Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, const Options* options, - const EnvOptions& storage_options, + const EnvOptions& env_options, int new_levels) { if (new_levels <= 1) { return Status::InvalidArgument( @@ -2433,7 +2414,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, std::shared_ptr tc(NewLRUCache( options->max_open_files - 10, options->table_cache_numshardbits, options->table_cache_remove_scan_count_limit)); - VersionSet versions(dbname, options, storage_options, tc.get()); + WriteController wc; 
+ VersionSet versions(dbname, options, env_options, tc.get(), &wc); Status status; std::vector dummy; @@ -2504,7 +2486,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, bool verbose, bool hex) { // Open the specified manifest file. unique_ptr file; - Status s = options.env->NewSequentialFile(dscname, &file, storage_options_); + Status s = options.env->NewSequentialFile(dscname, &file, env_options_); if (!s.ok()) { return s; } @@ -2746,12 +2728,12 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_number, const std::string& record) const { std::string fname = DescriptorFileName(dbname_, manifest_file_number); - Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str()); + Log(db_options_->info_log, "ManifestContains: checking %s\n", fname.c_str()); unique_ptr file; - Status s = env_->NewSequentialFile(fname, &file, storage_options_); + Status s = env_->NewSequentialFile(fname, &file, env_options_); if (!s.ok()) { - Log(options_->info_log, "ManifestContains: %s\n", s.ToString().c_str()); - Log(options_->info_log, + Log(db_options_->info_log, "ManifestContains: %s\n", s.ToString().c_str()); + Log(db_options_->info_log, "ManifestContains: is unable to reopen the manifest file %s", fname.c_str()); return false; @@ -2766,7 +2748,7 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_number, break; } } - Log(options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0); + Log(db_options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0); return result; } @@ -2794,7 +2776,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { // approximate offset of "ikey" within the table. 
TableReader* table_reader_ptr; Iterator* iter = v->cfd_->table_cache()->NewIterator( - ReadOptions(), storage_options_, v->cfd_->internal_comparator(), + ReadOptions(), env_options_, v->cfd_->internal_comparator(), files[i]->fd, &table_reader_ptr); if (table_reader_ptr != nullptr) { result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode()); @@ -2856,14 +2838,14 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { const FileLevel* flevel = c->input_levels(which); for (size_t i = 0; i < flevel->num_files; i++) { list[num++] = cfd->table_cache()->NewIterator( - read_options, storage_options_compactions_, + read_options, env_options_compactions_, cfd->internal_comparator(), flevel->files[i].fd, nullptr, true /* for compaction */); } } else { // Create concatenating iterator for the files from this level list[num++] = NewTwoLevelIterator(new Version::LevelFileIteratorState( - cfd->table_cache(), read_options, storage_options_, + cfd->table_cache(), read_options, env_options_, cfd->internal_comparator(), true /* for_compaction */, false /* prefix enabled */), new Version::LevelFileNumIterator(cfd->internal_comparator(), @@ -2884,7 +2866,7 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { #ifndef NDEBUG Version* version = c->column_family_data()->current(); if (c->input_version() != version) { - Log(options_->info_log, + Log(db_options_->info_log, "[%s] VerifyCompactionFileConsistency version mismatch", c->column_family_data()->GetName().c_str()); } @@ -2955,11 +2937,11 @@ void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { LiveFileMetaData filemetadata; filemetadata.column_family_name = cfd->GetName(); uint32_t path_id = file->fd.GetPathId(); - if (path_id < options_->db_paths.size()) { - filemetadata.db_path = options_->db_paths[path_id].path; + if (path_id < db_options_->db_paths.size()) { + filemetadata.db_path = db_options_->db_paths[path_id].path; } else { - assert(!options_->db_paths.empty()); - filemetadata.db_path = 
options_->db_paths.back().path; + assert(!db_options_->db_paths.empty()); + filemetadata.db_path = db_options_->db_paths.back().path; } filemetadata.name = MakeTableFileName("", file->fd.GetNumber()); filemetadata.level = level; @@ -2980,17 +2962,21 @@ void VersionSet::GetObsoleteFiles(std::vector* files) { } ColumnFamilyData* VersionSet::CreateColumnFamily( - const ColumnFamilyOptions& options, VersionEdit* edit) { + const ColumnFamilyOptions& cf_options, VersionEdit* edit) { assert(edit->is_column_family_add_); Version* dummy_versions = new Version(nullptr, this); auto new_cfd = column_family_set_->CreateColumnFamily( - edit->column_family_name_, edit->column_family_, dummy_versions, options); + edit->column_family_name_, edit->column_family_, dummy_versions, + cf_options); Version* v = new Version(new_cfd, this, current_version_number_++); AppendVersion(new_cfd, v); - new_cfd->CreateNewMemtable(); + // GetLatestMutableCFOptions() is safe here without mutex since the + // cfd is not available to client + new_cfd->CreateNewMemtable(MemTableOptions( + *new_cfd->GetLatestMutableCFOptions(), *new_cfd->options())); new_cfd->SetLogNumber(edit->log_number_); return new_cfd; } diff --git a/db/version_set.h b/db/version_set.h index 2f6d477a1db..353adbfece4 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -34,6 +34,7 @@ #include "db/column_family.h" #include "db/log_reader.h" #include "db/file_indexer.h" +#include "db/write_controller.h" namespace rocksdb { @@ -86,8 +87,6 @@ class Version { // Append to *iters a sequence of iterators that will // yield the contents of this Version when merged together. 
// REQUIRES: This version has been saved (see VersionSet::SaveTo) - void AddIterators(const ReadOptions&, const EnvOptions& soptions, - std::vector* iters); void AddIterators(const ReadOptions&, const EnvOptions& soptions, MergeIteratorBuilder* merger_iter_builder); @@ -257,7 +256,7 @@ class Version { class LevelFileNumIterator; class LevelFileIteratorState; - bool PrefixMayMatch(const ReadOptions& options, Iterator* level_iter, + bool PrefixMayMatch(const ReadOptions& read_options, Iterator* level_iter, const Slice& internal_prefix) const; // Update num_non_empty_levels_. @@ -323,8 +322,8 @@ class Version { // These are used to pick the best compaction level std::vector compaction_score_; std::vector compaction_level_; - double max_compaction_score_; // max score in l1 to ln-1 - int max_compaction_score_level_; // level on which max score occurs + double max_compaction_score_ = 0.0; // max score in l1 to ln-1 + int max_compaction_score_level_ = 0; // level on which max score occurs // A version number that uniquely represents this version. This is // used for debugging and logging purposes only. @@ -358,8 +357,9 @@ class Version { class VersionSet { public: - VersionSet(const std::string& dbname, const DBOptions* options, - const EnvOptions& storage_options, Cache* table_cache); + VersionSet(const std::string& dbname, const DBOptions* db_options, + const EnvOptions& env_options, Cache* table_cache, + WriteController* write_controller); ~VersionSet(); // Apply *edit to the current version to form a new descriptor that @@ -397,7 +397,7 @@ class VersionSet { // among [4-6] contains files. 
static Status ReduceNumberOfLevels(const std::string& dbname, const Options* options, - const EnvOptions& storage_options, + const EnvOptions& env_options, int new_levels); // printf contents (for debugging) @@ -506,14 +506,14 @@ class VersionSet { bool ManifestContains(uint64_t manifest_file_number, const std::string& record) const; - ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& options, + ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, VersionEdit* edit); std::unique_ptr column_family_set_; Env* const env_; const std::string dbname_; - const DBOptions* const options_; + const DBOptions* const db_options_; uint64_t next_file_number_; uint64_t manifest_file_number_; uint64_t pending_manifest_file_number_; @@ -534,12 +534,12 @@ class VersionSet { std::vector obsolete_files_; - // storage options for all reads and writes except compactions - const EnvOptions& storage_options_; + // env options for all reads and writes except compactions + const EnvOptions& env_options_; - // storage options used for compactions. This is a copy of - // storage_options_ but with readaheads set to readahead_compactions_. - const EnvOptions storage_options_compactions_; + // env options used for compactions. This is a copy of + // env_options_ but with readaheads set to readahead_compactions_. 
+ const EnvOptions env_options_compactions_; // No copying allowed VersionSet(const VersionSet&); diff --git a/db/write_batch.cc b/db/write_batch.cc index dc72a113838..b8d0322d850 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -23,10 +23,10 @@ // data: uint8[len] #include "rocksdb/write_batch.h" -#include "rocksdb/options.h" #include "rocksdb/merge_operator.h" #include "db/dbformat.h" #include "db/db_impl.h" +#include "db/column_family.h" #include "db/memtable.h" #include "db/snapshot.h" #include "db/write_batch_internal.h" @@ -80,6 +80,58 @@ int WriteBatch::Count() const { return WriteBatchInternal::Count(this); } +Status ReadRecordFromWriteBatch(Slice* input, char* tag, + uint32_t* column_family, Slice* key, + Slice* value, Slice* blob) { + assert(key != nullptr && value != nullptr); + *tag = (*input)[0]; + input->remove_prefix(1); + *column_family = 0; // default + switch (*tag) { + case kTypeColumnFamilyValue: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch Put"); + } + // intentional fallthrough + case kTypeValue: + if (!GetLengthPrefixedSlice(input, key) || + !GetLengthPrefixedSlice(input, value)) { + return Status::Corruption("bad WriteBatch Put"); + } + break; + case kTypeColumnFamilyDeletion: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch Delete"); + } + // intentional fallthrough + case kTypeDeletion: + if (!GetLengthPrefixedSlice(input, key)) { + return Status::Corruption("bad WriteBatch Delete"); + } + break; + case kTypeColumnFamilyMerge: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch Merge"); + } + // intentional fallthrough + case kTypeMerge: + if (!GetLengthPrefixedSlice(input, key) || + !GetLengthPrefixedSlice(input, value)) { + return Status::Corruption("bad WriteBatch Merge"); + } + break; + case kTypeLogData: + assert(blob != nullptr); + if (!GetLengthPrefixedSlice(input, blob)) { + return Status::Corruption("bad 
WriteBatch Blob"); + } + break; + default: + return Status::Corruption("unknown WriteBatch tag"); + } + return Status::OK(); +} + Status WriteBatch::Iterate(Handler* handler) const { Slice input(rep_); if (input.size() < kHeader) { @@ -91,57 +143,33 @@ Status WriteBatch::Iterate(Handler* handler) const { int found = 0; Status s; while (s.ok() && !input.empty() && handler->Continue()) { - char tag = input[0]; - input.remove_prefix(1); + char tag = 0; uint32_t column_family = 0; // default + + s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value, + &blob); + if (!s.ok()) { + return s; + } + switch (tag) { case kTypeColumnFamilyValue: - if (!GetVarint32(&input, &column_family)) { - return Status::Corruption("bad WriteBatch Put"); - } - // intentional fallthrough case kTypeValue: - if (GetLengthPrefixedSlice(&input, &key) && - GetLengthPrefixedSlice(&input, &value)) { - s = handler->PutCF(column_family, key, value); - found++; - } else { - return Status::Corruption("bad WriteBatch Put"); - } + s = handler->PutCF(column_family, key, value); + found++; break; case kTypeColumnFamilyDeletion: - if (!GetVarint32(&input, &column_family)) { - return Status::Corruption("bad WriteBatch Delete"); - } - // intentional fallthrough case kTypeDeletion: - if (GetLengthPrefixedSlice(&input, &key)) { - s = handler->DeleteCF(column_family, key); - found++; - } else { - return Status::Corruption("bad WriteBatch Delete"); - } + s = handler->DeleteCF(column_family, key); + found++; break; case kTypeColumnFamilyMerge: - if (!GetVarint32(&input, &column_family)) { - return Status::Corruption("bad WriteBatch Merge"); - } - // intentional fallthrough case kTypeMerge: - if (GetLengthPrefixedSlice(&input, &key) && - GetLengthPrefixedSlice(&input, &value)) { - s = handler->MergeCF(column_family, key, value); - found++; - } else { - return Status::Corruption("bad WriteBatch Merge"); - } + s = handler->MergeCF(column_family, key, value); + found++; break; case kTypeLogData: - if 
(GetLengthPrefixedSlice(&input, &blob)) { - handler->LogData(blob); - } else { - return Status::Corruption("bad WriteBatch Blob"); - } + handler->LogData(blob); break; default: return Status::Corruption("unknown WriteBatch tag"); @@ -186,17 +214,6 @@ void WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, PutLengthPrefixedSlice(&b->rep_, value); } -namespace { -inline uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) { - uint32_t column_family_id = 0; - if (column_family != nullptr) { - auto cfh = reinterpret_cast(column_family); - column_family_id = cfh->GetID(); - } - return column_family_id; -} -} // namespace - void WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key, value); @@ -281,17 +298,17 @@ class MemTableInserter : public WriteBatch::Handler { public: SequenceNumber sequence_; ColumnFamilyMemTables* cf_mems_; - bool recovery_; + bool ignore_missing_column_families_; uint64_t log_number_; DBImpl* db_; const bool dont_filter_deletes_; MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems, - bool recovery, uint64_t log_number, DB* db, - const bool dont_filter_deletes) + bool ignore_missing_column_families, uint64_t log_number, + DB* db, const bool dont_filter_deletes) : sequence_(sequence), cf_mems_(cf_mems), - recovery_(recovery), + ignore_missing_column_families_(ignore_missing_column_families), log_number_(log_number), db_(reinterpret_cast(db)), dont_filter_deletes_(dont_filter_deletes) { @@ -303,12 +320,18 @@ class MemTableInserter : public WriteBatch::Handler { bool SeekToColumnFamily(uint32_t column_family_id, Status* s) { bool found = cf_mems_->Seek(column_family_id); - if (recovery_ && (!found || log_number_ < cf_mems_->GetLogNumber())) { - // if in recovery envoronment: - // * If column family was not found, it might mean that the WAL write - // batch references to the column family that was 
dropped after the - // insert. We don't want to fail the whole write batch in that case -- we - // just ignore the update. + if (!found) { + if (ignore_missing_column_families_) { + *s = Status::OK(); + } else { + *s = Status::InvalidArgument( + "Invalid column family specified in write batch"); + } + return false; + } + if (log_number_ != 0 && log_number_ < cf_mems_->GetLogNumber()) { + // This is true only in recovery environment (log_number_ is always 0 in + // non-recovery, regular write code-path) // * If log_number_ < cf_mems_->GetLogNumber(), this means that column // family already contains updates from this log. We can't apply updates // twice because of update-in-place or merge workloads -- ignore the @@ -316,18 +339,8 @@ class MemTableInserter : public WriteBatch::Handler { *s = Status::OK(); return false; } - if (!found) { - assert(!recovery_); - // If the column family was not found in non-recovery enviornment - // (client's write code-path), we have to fail the write and return - // the failure status to the client. 
- *s = Status::InvalidArgument( - "Invalid column family specified in write batch"); - return false; - } return true; } - virtual Status PutCF(uint32_t column_family_id, const Slice& key, const Slice& value) { Status seek_status; @@ -336,14 +349,15 @@ class MemTableInserter : public WriteBatch::Handler { return seek_status; } MemTable* mem = cf_mems_->GetMemTable(); - const Options* options = cf_mems_->GetOptions(); - if (!options->inplace_update_support) { + auto* ioptions = mem->GetImmutableOptions(); + auto* moptions = mem->GetMemTableOptions(); + if (!moptions->inplace_update_support) { mem->Add(sequence_, kTypeValue, key, value); - } else if (options->inplace_callback == nullptr) { + } else if (moptions->inplace_callback == nullptr) { mem->Update(sequence_, key, value); - RecordTick(options->statistics.get(), NUMBER_KEYS_UPDATED); + RecordTick(ioptions->statistics, NUMBER_KEYS_UPDATED); } else { - if (mem->UpdateCallback(sequence_, key, value, *options)) { + if (mem->UpdateCallback(sequence_, key, value)) { } else { // key not found in memtable. Do sst get, update, add SnapshotImpl read_from_snapshot; @@ -362,17 +376,17 @@ class MemTableInserter : public WriteBatch::Handler { char* prev_buffer = const_cast(prev_value.c_str()); uint32_t prev_size = prev_value.size(); - auto status = options->inplace_callback(s.ok() ? prev_buffer : nullptr, - s.ok() ? &prev_size : nullptr, - value, &merged_value); + auto status = moptions->inplace_callback(s.ok() ? prev_buffer : nullptr, + s.ok() ? &prev_size : nullptr, + value, &merged_value); if (status == UpdateStatus::UPDATED_INPLACE) { // prev_value is updated in-place with final value. mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size)); - RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN); + RecordTick(ioptions->statistics, NUMBER_KEYS_WRITTEN); } else if (status == UpdateStatus::UPDATED) { // merged_value contains the final value. 
mem->Add(sequence_, kTypeValue, key, Slice(merged_value)); - RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN); + RecordTick(ioptions->statistics, NUMBER_KEYS_WRITTEN); } } } @@ -380,6 +394,7 @@ class MemTableInserter : public WriteBatch::Handler { // sequence number. Even if the update eventually fails and does not result // in memtable add/update. sequence_++; + cf_mems_->CheckMemtableFull(); return Status::OK(); } @@ -391,17 +406,18 @@ class MemTableInserter : public WriteBatch::Handler { return seek_status; } MemTable* mem = cf_mems_->GetMemTable(); - const Options* options = cf_mems_->GetOptions(); + auto* ioptions = mem->GetImmutableOptions(); + auto* moptions = mem->GetMemTableOptions(); bool perform_merge = false; - if (options->max_successive_merges > 0 && db_ != nullptr) { + if (moptions->max_successive_merges > 0 && db_ != nullptr) { LookupKey lkey(key, sequence_); // Count the number of successive merges at the head // of the key in the memtable size_t num_merges = mem->CountSuccessiveMergeEntries(lkey); - if (num_merges >= options->max_successive_merges) { + if (num_merges >= moptions->max_successive_merges) { perform_merge = true; } } @@ -425,16 +441,16 @@ class MemTableInserter : public WriteBatch::Handler { Slice get_value_slice = Slice(get_value); // 2) Apply this merge - auto merge_operator = options->merge_operator.get(); + auto merge_operator = ioptions->merge_operator; assert(merge_operator); std::deque operands; operands.push_front(value.ToString()); std::string new_value; if (!merge_operator->FullMerge(key, &get_value_slice, operands, - &new_value, options->info_log.get())) { + &new_value, ioptions->info_log)) { // Failed to merge! 
- RecordTick(options->statistics.get(), NUMBER_MERGE_FAILURES); + RecordTick(ioptions->statistics, NUMBER_MERGE_FAILURES); // Store the delta in memtable perform_merge = false; @@ -450,6 +466,7 @@ class MemTableInserter : public WriteBatch::Handler { } sequence_++; + cf_mems_->CheckMemtableFull(); return Status::OK(); } @@ -460,8 +477,9 @@ class MemTableInserter : public WriteBatch::Handler { return seek_status; } MemTable* mem = cf_mems_->GetMemTable(); - const Options* options = cf_mems_->GetOptions(); - if (!dont_filter_deletes_ && options->filter_deletes) { + auto* ioptions = mem->GetImmutableOptions(); + auto* moptions = mem->GetMemTableOptions(); + if (!dont_filter_deletes_ && moptions->filter_deletes) { SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; ReadOptions ropts; @@ -472,12 +490,13 @@ class MemTableInserter : public WriteBatch::Handler { cf_handle = db_->DefaultColumnFamily(); } if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) { - RecordTick(options->statistics.get(), NUMBER_FILTERED_DELETES); + RecordTick(ioptions->statistics, NUMBER_FILTERED_DELETES); return Status::OK(); } } mem->Add(sequence_, kTypeDeletion, key, Slice()); sequence_++; + cf_mems_->CheckMemtableFull(); return Status::OK(); } }; @@ -485,10 +504,12 @@ class MemTableInserter : public WriteBatch::Handler { Status WriteBatchInternal::InsertInto(const WriteBatch* b, ColumnFamilyMemTables* memtables, - bool recovery, uint64_t log_number, - DB* db, const bool dont_filter_deletes) { + bool ignore_missing_column_families, + uint64_t log_number, DB* db, + const bool dont_filter_deletes) { MemTableInserter inserter(WriteBatchInternal::Sequence(b), memtables, - recovery, log_number, db, dont_filter_deletes); + ignore_missing_column_families, log_number, db, + dont_filter_deletes); return b->Iterate(&inserter); } diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index 9a191f4cb3a..568cd70d812 100644 --- a/db/write_batch_internal.h +++ 
b/db/write_batch_internal.h @@ -28,6 +28,7 @@ class ColumnFamilyMemTables { virtual MemTable* GetMemTable() const = 0; virtual const Options* GetOptions() const = 0; virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0; + virtual void CheckMemtableFull() = 0; }; class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { @@ -54,6 +55,8 @@ class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; } + void CheckMemtableFull() override {} + private: bool ok_; MemTable* mem_; @@ -106,18 +109,18 @@ class WriteBatchInternal { // Inserts batch entries into memtable // If dont_filter_deletes is false AND options.filter_deletes is true, // then --> Drops deletes in batch if db->KeyMayExist returns false - // If recovery == true, this means InsertInto is executed on a recovery - // code-path. WriteBatch referencing a dropped column family can be - // found on a recovery code-path and should be ignored (recovery should not - // fail). Additionally, the memtable will be updated only if + // If ignore_missing_column_families == true. WriteBatch referencing + // non-existing column family should be ignored. + // However, if ignore_missing_column_families == false, any WriteBatch + // referencing non-existing column family will return a InvalidArgument() + // failure. + // + // If log_number is non-zero, the memtable will be updated only if // memtables->GetLogNumber() >= log_number - // However, if recovery == false, any WriteBatch referencing - // non-existing column family will return a failure. 
Also, log_number is - // ignored in that case static Status InsertInto(const WriteBatch* batch, ColumnFamilyMemTables* memtables, - bool recovery = false, uint64_t log_number = 0, - DB* db = nullptr, + bool ignore_missing_column_families = false, + uint64_t log_number = 0, DB* db = nullptr, const bool dont_filter_deletes = true); static void Append(WriteBatch* dst, const WriteBatch* src); diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index febd35c05db..ba7451078be 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -15,8 +15,10 @@ #include "db/write_batch_internal.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" +#include "rocksdb/utilities/write_batch_with_index.h" #include "util/logging.h" #include "util/testharness.h" +#include "util/scoped_arena_iterator.h" namespace rocksdb { @@ -25,13 +27,15 @@ static std::string PrintContents(WriteBatch* b) { auto factory = std::make_shared(); Options options; options.memtable_factory = factory; - MemTable* mem = new MemTable(cmp, options); + MemTable* mem = new MemTable(cmp, ImmutableCFOptions(options), + MemTableOptions(MutableCFOptions(options), options)); mem->Ref(); std::string state; ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default); int count = 0; - Iterator* iter = mem->NewIterator(ReadOptions()); + Arena arena; + ScopedArenaIterator iter(mem->NewIterator(ReadOptions(), &arena)); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey ikey; memset((void *)&ikey, 0, sizeof(ikey)); @@ -66,7 +70,6 @@ static std::string PrintContents(WriteBatch* b) { state.append("@"); state.append(NumberToString(ikey.sequence)); } - delete iter; if (!s.ok()) { state.append(s.ToString()); } else if (count != WriteBatchInternal::Count(b)) { @@ -286,6 +289,9 @@ class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl { explicit ColumnFamilyHandleImplDummy(int id) : ColumnFamilyHandleImpl(nullptr, 
nullptr, nullptr), id_(id) {} uint32_t GetID() const override { return id_; } + const Comparator* user_comparator() const override { + return BytewiseComparator(); + } private: uint32_t id_; @@ -316,6 +322,88 @@ TEST(WriteBatchTest, ColumnFamiliesBatchTest) { handler.seen); } +TEST(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) { + WriteBatchWithIndex batch; + ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8); + batch.Put(&zero, Slice("foo"), Slice("bar")); + batch.Put(&two, Slice("twofoo"), Slice("bar2")); + batch.Put(&eight, Slice("eightfoo"), Slice("bar8")); + batch.Delete(&eight, Slice("eightfoo")); + batch.Merge(&three, Slice("threethree"), Slice("3three")); + batch.Put(&zero, Slice("foo"), Slice("bar")); + batch.Merge(Slice("omom"), Slice("nom")); + + std::unique_ptr iter; + + iter.reset(batch.NewIterator(&eight)); + iter->Seek("eightfoo"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type); + ASSERT_EQ("eightfoo", iter->Entry().key.ToString()); + ASSERT_EQ("bar8", iter->Entry().value.ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(WriteType::kDeleteRecord, iter->Entry().type); + ASSERT_EQ("eightfoo", iter->Entry().key.ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter.reset(batch.NewIterator()); + iter->Seek("gggg"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type); + ASSERT_EQ("omom", iter->Entry().key.ToString()); + ASSERT_EQ("nom", iter->Entry().value.ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter.reset(batch.NewIterator(&zero)); + iter->Seek("foo"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type); + ASSERT_EQ("foo", iter->Entry().key.ToString()); + ASSERT_EQ("bar", 
iter->Entry().value.ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type); + ASSERT_EQ("foo", iter->Entry().key.ToString()); + ASSERT_EQ("bar", iter->Entry().value.ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type); + ASSERT_EQ("omom", iter->Entry().key.ToString()); + ASSERT_EQ("nom", iter->Entry().value.ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + TestHandler handler; + batch.GetWriteBatch()->Iterate(&handler); + ASSERT_EQ( + "Put(foo, bar)" + "PutCF(2, twofoo, bar2)" + "PutCF(8, eightfoo, bar8)" + "DeleteCF(8, eightfoo)" + "MergeCF(3, threethree, 3three)" + "Put(foo, bar)" + "Merge(omom, nom)", + handler.seen); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/write_controller.cc b/db/write_controller.cc new file mode 100644 index 00000000000..bb6f8ecf751 --- /dev/null +++ b/db/write_controller.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "db/write_controller.h" + +#include + +namespace rocksdb { + +std::unique_ptr WriteController::GetStopToken() { + ++total_stopped_; + return std::unique_ptr(new StopWriteToken(this)); +} + +std::unique_ptr WriteController::GetDelayToken( + uint64_t delay_us) { + total_delay_us_ += delay_us; + return std::unique_ptr( + new DelayWriteToken(this, delay_us)); +} + +bool WriteController::IsStopped() const { return total_stopped_ > 0; } +uint64_t WriteController::GetDelay() const { return total_delay_us_; } + +StopWriteToken::~StopWriteToken() { + assert(controller_->total_stopped_ >= 1); + --controller_->total_stopped_; +} + +DelayWriteToken::~DelayWriteToken() { + assert(controller_->total_delay_us_ >= delay_us_); + controller_->total_delay_us_ -= delay_us_; +} + +} // namespace rocksdb diff --git a/db/write_controller.h b/db/write_controller.h new file mode 100644 index 00000000000..32e1d58f10c --- /dev/null +++ b/db/write_controller.h @@ -0,0 +1,78 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include + +#include + +namespace rocksdb { + +class WriteControllerToken; + +// WriteController is controlling write stalls in our write code-path. Write +// stalls happen when compaction can't keep up with write rate. 
+// All of the methods here (including WriteControllerToken's destructors) need +// to be called while holding DB mutex +class WriteController { + public: + WriteController() : total_stopped_(0), total_delay_us_(0) {} + ~WriteController() = default; + + // When an actor (column family) requests a stop token, all writes will be + // stopped until the stop token is released (deleted) + std::unique_ptr GetStopToken(); + // When an actor (column family) requests a delay token, total delay for all + // writes will be increased by delay_us. The delay will last until delay token + // is released + std::unique_ptr GetDelayToken(uint64_t delay_us); + + // these two metods are querying the state of the WriteController + bool IsStopped() const; + uint64_t GetDelay() const; + + private: + friend class WriteControllerToken; + friend class StopWriteToken; + friend class DelayWriteToken; + + int total_stopped_; + uint64_t total_delay_us_; +}; + +class WriteControllerToken { + public: + explicit WriteControllerToken(WriteController* controller) + : controller_(controller) {} + virtual ~WriteControllerToken() {} + + protected: + WriteController* controller_; + + private: + // no copying allowed + WriteControllerToken(const WriteControllerToken&) = delete; + void operator=(const WriteControllerToken&) = delete; +}; + +class StopWriteToken : public WriteControllerToken { + public: + explicit StopWriteToken(WriteController* controller) + : WriteControllerToken(controller) {} + virtual ~StopWriteToken(); +}; + +class DelayWriteToken : public WriteControllerToken { + public: + DelayWriteToken(WriteController* controller, uint64_t delay_us) + : WriteControllerToken(controller), delay_us_(delay_us) {} + virtual ~DelayWriteToken(); + + private: + uint64_t delay_us_; +}; + +} // namespace rocksdb diff --git a/db/write_controller_test.cc b/db/write_controller_test.cc new file mode 100644 index 00000000000..1cec9658d44 --- /dev/null +++ b/db/write_controller_test.cc @@ -0,0 +1,40 @@ +// 
Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "db/write_controller.h" + +#include "util/testharness.h" + +namespace rocksdb { + +class WriteControllerTest {}; + +TEST(WriteControllerTest, SanityTest) { + WriteController controller; + auto stop_token_1 = controller.GetStopToken(); + auto stop_token_2 = controller.GetStopToken(); + + ASSERT_EQ(true, controller.IsStopped()); + stop_token_1.reset(); + ASSERT_EQ(true, controller.IsStopped()); + stop_token_2.reset(); + ASSERT_EQ(false, controller.IsStopped()); + + auto delay_token_1 = controller.GetDelayToken(5); + ASSERT_EQ(static_cast(5), controller.GetDelay()); + auto delay_token_2 = controller.GetDelayToken(8); + ASSERT_EQ(static_cast(13), controller.GetDelay()); + + delay_token_2.reset(); + ASSERT_EQ(static_cast(5), controller.GetDelay()); + delay_token_1.reset(); + ASSERT_EQ(static_cast(0), controller.GetDelay()); + delay_token_1.reset(); + ASSERT_EQ(false, controller.IsStopped()); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/db/write_thread.cc b/db/write_thread.cc new file mode 100644 index 00000000000..052e1209efa --- /dev/null +++ b/db/write_thread.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "db/write_thread.h" + +namespace rocksdb { + +Status WriteThread::EnterWriteThread(WriteThread::Writer* w, + uint64_t expiration_time) { + // the following code block pushes the current writer "w" into the writer + // queue "writers_" and wait until one of the following conditions met: + // 1. the job of "w" has been done by some other writers. + // 2. "w" becomes the first writer in "writers_" + // 3. "w" timed-out. + writers_.push_back(w); + + bool timed_out = false; + while (!w->done && w != writers_.front()) { + if (expiration_time == 0) { + w->cv.Wait(); + } else if (w->cv.TimedWait(expiration_time)) { + if (w->in_batch_group) { + // then it means the front writer is currently doing the + // write on behalf of this "timed-out" writer. Then it + // should wait until the write completes. + expiration_time = 0; + } else { + timed_out = true; + break; + } + } + } + + if (timed_out) { +#ifndef NDEBUG + bool found = false; +#endif + for (auto iter = writers_.begin(); iter != writers_.end(); iter++) { + if (*iter == w) { + writers_.erase(iter); +#ifndef NDEBUG + found = true; +#endif + break; + } + } +#ifndef NDEBUG + assert(found); +#endif + // writers_.front() might still be in cond_wait without a time-out. + // As a result, we need to signal it to wake it up. Otherwise no + // one else will wake him up, and RocksDB will hang. + if (!writers_.empty()) { + writers_.front()->cv.Signal(); + } + return Status::TimedOut(); + } + return Status::OK(); +} + +void WriteThread::ExitWriteThread(WriteThread::Writer* w, + WriteThread::Writer* last_writer, + Status status) { + // Pop out the current writer and all writers being pushed before the + // current writer from the writer queue. 
+ while (!writers_.empty()) { + Writer* ready = writers_.front(); + writers_.pop_front(); + if (ready != w) { + ready->status = status; + ready->done = true; + ready->cv.Signal(); + } + if (ready == last_writer) break; + } + + // Notify new head of write queue + if (!writers_.empty()) { + writers_.front()->cv.Signal(); + } +} + +// This function will be called only when the first writer succeeds. +// All writers in the to-be-built batch group will be processed. +// +// REQUIRES: Writer list must be non-empty +// REQUIRES: First writer must have a non-nullptr batch +void WriteThread::BuildBatchGroup(WriteThread::Writer** last_writer, + autovector* write_batch_group) { + assert(!writers_.empty()); + Writer* first = writers_.front(); + assert(first->batch != nullptr); + + size_t size = WriteBatchInternal::ByteSize(first->batch); + write_batch_group->push_back(first->batch); + + // Allow the group to grow up to a maximum size, but if the + // original write is small, limit the growth so we do not slow + // down the small write too much. + size_t max_size = 1 << 20; + if (size <= (128<<10)) { + max_size = size + (128<<10); + } + + *last_writer = first; + std::deque::iterator iter = writers_.begin(); + ++iter; // Advance past "first" + for (; iter != writers_.end(); ++iter) { + Writer* w = *iter; + if (w->sync && !first->sync) { + // Do not include a sync write into a batch handled by a non-sync write. + break; + } + + if (!w->disableWAL && first->disableWAL) { + // Do not include a write that needs WAL into a batch that has + // WAL disabled. + break; + } + + if (w->timeout_hint_us < first->timeout_hint_us) { + // Do not include those writes with shorter timeout. Otherwise, we might + // execute a write that should instead be aborted because of timeout. + break; + } + + if (w->batch == nullptr) { + // Do not include those writes with nullptr batch. Those are not writes, + // those are something else. 
They want to be alone + break; + } + + size += WriteBatchInternal::ByteSize(w->batch); + if (size > max_size) { + // Do not make batch too big + break; + } + + write_batch_group->push_back(w->batch); + w->in_batch_group = true; + *last_writer = w; + } +} + +} // namespace rocksdb diff --git a/db/write_thread.h b/db/write_thread.h new file mode 100644 index 00000000000..8c5baa664b6 --- /dev/null +++ b/db/write_thread.h @@ -0,0 +1,80 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include +#include +#include "rocksdb/status.h" +#include "db/write_batch_internal.h" +#include "util/autovector.h" +#include "port/port.h" + +namespace rocksdb { + +class WriteThread { + public: + static const uint64_t kNoTimeOut = std::numeric_limits::max(); + // Information kept for every waiting writer + struct Writer { + Status status; + WriteBatch* batch; + bool sync; + bool disableWAL; + bool in_batch_group; + bool done; + uint64_t timeout_hint_us; + port::CondVar cv; + + explicit Writer(port::Mutex* mu) + : batch(nullptr), + sync(false), + disableWAL(false), + in_batch_group(false), + done(false), + timeout_hint_us(kNoTimeOut), + cv(mu) {} + }; + + WriteThread() = default; + ~WriteThread() = default; + + // Before applying write operation (such as DBImpl::Write, DBImpl::Flush) + // thread should grab the mutex_ and be the first on writers queue. + // EnterWriteThread is used for it. + // Be aware! Writer's job can be done by other thread (see DBImpl::Write + // for examples), so check it via w.done before applying changes. 
+ // + // Writer* w: writer to be placed in the queue + // uint64_t expiration_time: maximum time to be in the queue + // See also: ExitWriteThread + // REQUIRES: db mutex held + Status EnterWriteThread(Writer* w, uint64_t expiration_time); + + // After doing write job, we need to remove already used writers from + // writers_ queue and notify head of the queue about it. + // ExitWriteThread is used for this. + // + // Writer* w: Writer, that was added by EnterWriteThread function + // Writer* last_writer: Since we can join a few Writers (as DBImpl::Write + // does) + // we should pass last_writer as a parameter to + // ExitWriteThread + // (if you don't touch other writers, just pass w) + // Status status: Status of write operation + // See also: EnterWriteThread + // REQUIRES: db mutex held + void ExitWriteThread(Writer* w, Writer* last_writer, Status status); + + void BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group); + + private: + // Queue of writers. + std::deque writers_; +}; + +} // namespace rocksdb diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 28b8cdca856..e4b1bb75365 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -75,6 +75,8 @@ typedef struct rocksdb_iterator_t rocksdb_iterator_t; typedef struct rocksdb_logger_t rocksdb_logger_t; typedef struct rocksdb_mergeoperator_t rocksdb_mergeoperator_t; typedef struct rocksdb_options_t rocksdb_options_t; +typedef struct rocksdb_block_based_table_options_t + rocksdb_block_based_table_options_t; typedef struct rocksdb_randomfile_t rocksdb_randomfile_t; typedef struct rocksdb_readoptions_t rocksdb_readoptions_t; typedef struct rocksdb_seqfile_t rocksdb_seqfile_t; @@ -346,6 +348,34 @@ extern void rocksdb_writebatch_iterate( void (*deleted)(void*, const char* k, size_t klen)); extern const char* rocksdb_writebatch_data(rocksdb_writebatch_t*, size_t *size); +/* Block based table options */ + +extern rocksdb_block_based_table_options_t* + 
rocksdb_block_based_options_create(); +extern void rocksdb_block_based_options_destroy( + rocksdb_block_based_table_options_t* options); +extern void rocksdb_block_based_options_set_block_size( + rocksdb_block_based_table_options_t* options, size_t block_size); +extern void rocksdb_block_based_options_set_block_size_deviation( + rocksdb_block_based_table_options_t* options, int block_size_deviation); +extern void rocksdb_block_based_options_set_block_restart_interval( + rocksdb_block_based_table_options_t* options, int block_restart_interval); +extern void rocksdb_block_based_options_set_filter_policy( + rocksdb_block_based_table_options_t* options, + rocksdb_filterpolicy_t* filter_policy); +extern void rocksdb_block_based_options_set_no_block_cache( + rocksdb_block_based_table_options_t* options, + unsigned char no_block_cache); +extern void rocksdb_block_based_options_set_block_cache( + rocksdb_block_based_table_options_t* options, rocksdb_cache_t* block_cache); +extern void rocksdb_block_based_options_set_block_cache_compressed( + rocksdb_block_based_table_options_t* options, + rocksdb_cache_t* block_cache_compressed); +extern void rocksdb_block_based_options_set_whole_key_filtering( + rocksdb_block_based_table_options_t*, unsigned char); +extern void rocksdb_options_set_block_based_table_factory( + rocksdb_options_t *opt, rocksdb_block_based_table_options_t* table_options); + /* Options */ extern rocksdb_options_t* rocksdb_options_create(); @@ -353,7 +383,7 @@ extern void rocksdb_options_destroy(rocksdb_options_t*); extern void rocksdb_options_increase_parallelism( rocksdb_options_t* opt, int total_threads); extern void rocksdb_options_optimize_for_point_lookup( - rocksdb_options_t* opt); + rocksdb_options_t* opt, uint64_t block_cache_size_mb); extern void rocksdb_options_optimize_level_style_compaction( rocksdb_options_t* opt, uint64_t memtable_memory_budget); extern void rocksdb_options_optimize_universal_style_compaction( @@ -376,9 +406,6 @@ extern void 
rocksdb_options_set_compression_per_level( rocksdb_options_t* opt, int* level_values, size_t num_levels); -extern void rocksdb_options_set_filter_policy( - rocksdb_options_t*, - rocksdb_filterpolicy_t*); extern void rocksdb_options_set_create_if_missing( rocksdb_options_t*, unsigned char); extern void rocksdb_options_set_create_missing_column_families( @@ -392,13 +419,8 @@ extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*); extern void rocksdb_options_set_info_log_level(rocksdb_options_t*, int); extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t); extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int); -extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*); -extern void rocksdb_options_set_cache_compressed(rocksdb_options_t*, rocksdb_cache_t*); -extern void rocksdb_options_set_block_size(rocksdb_options_t*, size_t); -extern void rocksdb_options_set_block_restart_interval(rocksdb_options_t*, int); extern void rocksdb_options_set_compression_options( rocksdb_options_t*, int, int, int); -extern void rocksdb_options_set_whole_key_filtering(rocksdb_options_t*, unsigned char); extern void rocksdb_options_set_prefix_extractor( rocksdb_options_t*, rocksdb_slicetransform_t*); extern void rocksdb_options_set_num_levels(rocksdb_options_t*, int); @@ -449,8 +471,6 @@ extern void rocksdb_options_set_arena_block_size( rocksdb_options_t*, size_t); extern void rocksdb_options_set_use_fsync( rocksdb_options_t*, int); -extern void rocksdb_options_set_db_stats_log_interval( - rocksdb_options_t*, int); extern void rocksdb_options_set_db_log_dir( rocksdb_options_t*, const char*); extern void rocksdb_options_set_wal_dir( @@ -493,7 +513,6 @@ extern void rocksdb_options_set_max_sequential_skip_in_iterations( rocksdb_options_t*, uint64_t); extern void rocksdb_options_set_disable_data_sync(rocksdb_options_t*, int); extern void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t*, int); 
-extern void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t*, int); extern void rocksdb_options_set_delete_obsolete_files_period_micros( rocksdb_options_t*, uint64_t); extern void rocksdb_options_set_source_compaction_factor(rocksdb_options_t*, int); @@ -679,6 +698,10 @@ extern void rocksdb_readoptions_set_fill_cache( extern void rocksdb_readoptions_set_snapshot( rocksdb_readoptions_t*, const rocksdb_snapshot_t*); +extern void rocksdb_readoptions_set_iterate_upper_bound( + rocksdb_readoptions_t*, + const char* key, + size_t keylen); extern void rocksdb_readoptions_set_read_tier( rocksdb_readoptions_t*, int); extern void rocksdb_readoptions_set_tailing( diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 65d44b6cbfb..a8a6f9b73a6 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -127,9 +127,6 @@ class Cache { void LRU_Append(Handle* e); void Unref(Handle* e); - struct Rep; - Rep* rep_; - // No copying allowed Cache(const Cache&); void operator=(const Cache&); diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h index 9c24fc50184..dce69d2d70c 100644 --- a/include/rocksdb/compaction_filter.h +++ b/include/rocksdb/compaction_filter.h @@ -9,6 +9,7 @@ #ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_ #define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_ +#include #include #include diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index d9be6b4270a..0653a83868a 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -123,7 +123,7 @@ class DB { // Open DB with column families. // db_options specify database specific options - // column_families is the vector of all column families in the databse, + // column_families is the vector of all column families in the database, // containing column family name and options. You need to open ALL column // families in the database. To get the list of column families, you can use // ListColumnFamilies(). 
Also, you can open only a subset of column families @@ -359,6 +359,14 @@ class DB { return CompactRange(DefaultColumnFamily(), begin, end, reduce_level, target_level, target_path_id); } + virtual bool SetOptions(ColumnFamilyHandle* column_family, + const std::unordered_map& new_options) { + return true; + } + virtual bool SetOptions( + const std::unordered_map& new_options) { + return SetOptions(DefaultColumnFamily(), new_options); + } // Number of levels used for this DB. virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0; diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index fa44db45ff5..90aefb388ba 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -21,11 +21,52 @@ #define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ #include +#include namespace rocksdb { class Slice; +// A class that takes a bunch of keys, then generates filter +class FilterBitsBuilder { + public: + virtual ~FilterBitsBuilder() {} + + // Add Key to filter, you could use any way to store the key. + // Such as: storing hashes or original keys + // Keys are in sorted order and duplicated keys are possible. 
+ virtual void AddKey(const Slice& key) = 0; + + // Generate the filter using the keys that are added + // The return value of this function would be the filter bits, + // The ownership of actual data is set to buf + virtual Slice Finish(std::unique_ptr* buf) = 0; +}; + +// A class that checks if a key can be in filter +// It should be initialized by Slice generated by BitsBuilder +class FilterBitsReader { + public: + virtual ~FilterBitsReader() {} + + // Check if the entry match the bits in filter + virtual bool MayMatch(const Slice& entry) = 0; +}; + +// We add a new format of filter block called full filter block +// This new interface gives you more space of customization +// +// For the full filter block, you can plug in your version by implement +// the FilterBitsBuilder and FilterBitsReader +// +// There are two sets of interface in FilterPolicy +// Set 1: CreateFilter, KeyMayMatch: used for blockbased filter +// Set 2: GetFilterBitsBuilder, GetFilterBitsReader, they are used for +// full filter. +// Set 1 MUST be implemented correctly, Set 2 is optional +// RocksDB would first try using functions in Set 2. if they return nullptr, +// it would use Set 1 instead. +// You can choose filter type in NewBloomFilterPolicy class FilterPolicy { public: virtual ~FilterPolicy(); @@ -51,11 +92,28 @@ class FilterPolicy { // This method may return true or false if the key was not on the // list, but it should aim to return false with a high probability. 
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0; + + // Get the FilterBitsBuilder, which is ONLY used for full filter block + // It contains interface to take individual key, then generate filter + virtual FilterBitsBuilder* GetFilterBitsBuilder() const { + return nullptr; + } + + // Get the FilterBitsReader, which is ONLY used for full filter block + // It contains interface to tell if key can be in filter + // The input slice should NOT be deleted by FilterPolicy + virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents) const { + return nullptr; + } }; // Return a new filter policy that uses a bloom filter with approximately -// the specified number of bits per key. A good value for bits_per_key +// the specified number of bits per key. +// +// bits_per_key: bits per key in bloom filter. A good value for bits_per_key // is 10, which yields a filter with ~ 1% false positive rate. +// use_block_based_builder: use block based filter rather than full fiter. +// If you want to builder full filter, it needs to be set to false. // // Callers must delete the result after any database that is using the // result has been closed. @@ -67,8 +125,8 @@ class FilterPolicy { // ignores trailing spaces, it would be incorrect to use a // FilterPolicy (like NewBloomFilterPolicy) that does not ignore // trailing spaces in keys. 
-extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key); - +extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key, + bool use_block_based_builder = true); } #endif // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ diff --git a/include/rocksdb/flush_block_policy.h b/include/rocksdb/flush_block_policy.h index 8340ad616ef..939725cf409 100644 --- a/include/rocksdb/flush_block_policy.h +++ b/include/rocksdb/flush_block_policy.h @@ -6,6 +6,7 @@ #pragma once #include +#include "rocksdb/table.h" namespace rocksdb { @@ -37,7 +38,8 @@ class FlushBlockPolicyFactory { // Callers must delete the result after any database that is using the // result has been closed. virtual FlushBlockPolicy* NewFlushBlockPolicy( - const Options& options, const BlockBuilder& data_block_builder) const = 0; + const BlockBasedTableOptions& table_options, + const BlockBuilder& data_block_builder) const = 0; virtual ~FlushBlockPolicyFactory() { } }; @@ -51,7 +53,7 @@ class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory { } virtual FlushBlockPolicy* NewFlushBlockPolicy( - const Options& options, + const BlockBasedTableOptions& table_options, const BlockBuilder& data_block_builder) const override; }; diff --git a/include/rocksdb/immutable_options.h b/include/rocksdb/immutable_options.h new file mode 100644 index 00000000000..54b676626f4 --- /dev/null +++ b/include/rocksdb/immutable_options.h @@ -0,0 +1,84 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include "rocksdb/options.h" + +namespace rocksdb { + +// ImmutableCFOptions is a data struct used by RocksDB internal. It contains a +// subset of Options that should not be changed during the entire lifetime +// of DB. 
You shouldn't need to access this data structure unless you are +// implementing a new TableFactory. Raw pointers defined in this struct do +// not have ownership to the data they point to. Options contains shared_ptr +// to these data. +struct ImmutableCFOptions { + explicit ImmutableCFOptions(const Options& options); + + CompactionStyle compaction_style; + + CompactionOptionsUniversal compaction_options_universal; + + const SliceTransform* prefix_extractor; + + const Comparator* comparator; + + MergeOperator* merge_operator; + + const CompactionFilter* compaction_filter; + + CompactionFilterFactory* compaction_filter_factory; + + CompactionFilterFactoryV2* compaction_filter_factory_v2; + + Logger* info_log; + + Statistics* statistics; + + InfoLogLevel info_log_level; + + Env* env; + + // Allow the OS to mmap file for reading sst tables. Default: false + bool allow_mmap_reads; + + // Allow the OS to mmap file for writing. Default: false + bool allow_mmap_writes; + + std::vector db_paths; + + MemTableRepFactory* memtable_factory; + + TableFactory* table_factory; + + Options::TablePropertiesCollectorFactories + table_properties_collector_factories; + + bool advise_random_on_open; + + // This options is required by PlainTableReader. 
May need to move it + // to PlainTalbeOptions just like bloom_bits_per_key + uint32_t bloom_locality; + + bool purge_redundant_kvs_while_flush; + + uint32_t min_partial_merge_operands; + + bool disable_data_sync; + + bool use_fsync; + + CompressionType compression; + + std::vector compression_per_level; + + CompressionOptions compression_opts; + + Options::AccessHint access_hint_on_compaction_start; +}; + +} // namespace rocksdb diff --git a/include/rocksdb/iostats_context.h b/include/rocksdb/iostats_context.h index 0a220b53ab5..e06ee1773a8 100644 --- a/include/rocksdb/iostats_context.h +++ b/include/rocksdb/iostats_context.h @@ -27,7 +27,9 @@ struct IOStatsContext { uint64_t bytes_read; }; +#ifndef IOS_CROSS_COMPILE extern __thread IOStatsContext iostats_context; +#endif // IOS_CROSS_COMPILE } // namespace rocksdb diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index ef7972489ed..a60f94268ee 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -14,6 +14,7 @@ #include #include #include +#include #include "rocksdb/version.h" #include "rocksdb/universal_compaction.h" @@ -57,6 +58,7 @@ enum CompactionStyle : char { kCompactionStyleFIFO = 0x2, // FIFO compaction style }; + struct CompactionOptionsFIFO { // once the total sum of table files reaches this, we will delete the oldest // table file @@ -97,7 +99,8 @@ struct ColumnFamilyOptions { // Use this if you don't need to keep the data sorted, i.e. you'll never use // an iterator, only Put() and Get() API calls - ColumnFamilyOptions* OptimizeForPointLookup(); + ColumnFamilyOptions* OptimizeForPointLookup( + uint64_t block_cache_size_mb); // Default values for some parameters in ColumnFamilyOptions are not // optimized for heavy workloads and big datasets, which means you might @@ -206,34 +209,6 @@ struct ColumnFamilyOptions { // individual write buffers. 
Default: 1 int min_write_buffer_number_to_merge; - // Control over blocks (user data is stored in a set of blocks, and - // a block is the unit of reading from disk). - - // If non-NULL use the specified cache for blocks. - // If NULL, rocksdb will automatically create and use an 8MB internal cache. - // Default: nullptr - std::shared_ptr block_cache; - - // If non-NULL use the specified cache for compressed blocks. - // If NULL, rocksdb will not use a compressed block cache. - // Default: nullptr - std::shared_ptr block_cache_compressed; - - // Approximate size of user data packed per block. Note that the - // block size specified here corresponds to uncompressed data. The - // actual size of the unit read from disk may be smaller if - // compression is enabled. This parameter can be changed dynamically. - // - // Default: 4K - size_t block_size; - - // Number of keys between restart points for delta encoding of keys. - // This parameter can be changed dynamically. Most clients should - // leave this parameter alone. - // - // Default: 16 - int block_restart_interval; - // Compress blocks using the specified compression algorithm. This // parameter can be changed dynamically. // @@ -251,29 +226,17 @@ struct ColumnFamilyOptions { CompressionType compression; // Different levels can have different compression policies. There - // are cases where most lower levels would like to quick compression - // algorithm while the higher levels (which have more data) use + // are cases where most lower levels would like to use quick compression + // algorithms while the higher levels (which have more data) use // compression algorithms that have better compression but could - // be slower. This array, if non nullptr, should have an entry for - // each level of the database. This array, if non nullptr, overides the - // value specified in the previous field 'compression'. 
The caller is - // reponsible for allocating memory and initializing the values in it - // before invoking Open(). The caller is responsible for freeing this - // array and it could be freed anytime after the return from Open(). - // This could have been a std::vector but that makes the equivalent - // java/C api hard to construct. + // be slower. This array, if non-empty, should have an entry for + // each level of the database; these override the value specified in + // the previous field 'compression'. std::vector compression_per_level; // different options for compression algorithms CompressionOptions compression_opts; - // If non-nullptr, use the specified filter policy to reduce disk reads. - // Many applications will benefit from passing the result of - // NewBloomFilterPolicy() here. - // - // Default: nullptr - const FilterPolicy* filter_policy; - // If non-nullptr, use the specified function to determine the // prefixes for keys. These prefixes will be placed in the filter. // Depending on the workload, this can reduce the number of read-IOP @@ -290,12 +253,6 @@ struct ColumnFamilyOptions { // Default: nullptr std::shared_ptr prefix_extractor; - // If true, place whole keys in the filter (not just prefixes). - // This must generally be true for gets to be efficient. - // - // Default: true - bool whole_key_filtering; - // Number of levels for this database int num_levels; @@ -331,7 +288,7 @@ struct ColumnFamilyOptions { // and each file on level-3 will be 200MB. // by default target_file_size_base is 2MB. - int target_file_size_base; + uint64_t target_file_size_base; // by default target_file_size_multiplier is 1, which means // by default files in different levels will have similar size. int target_file_size_multiplier; @@ -375,18 +332,6 @@ struct ColumnFamilyOptions { // stop building a single file in a level->level+1 compaction. 
int max_grandparent_overlap_factor; - // We decided to remove seek compaction from RocksDB because: - // 1) It makes more sense for spinning disk workloads, while RocksDB is - // primarily designed for flash and memory, - // 2) It added some complexity to the important code-paths, - // 3) None of our internal customers were really using it. - // - // Since we removed seek compaction, this option is now obsolete. - // We left it here for backwards compatiblity (otherwise it would break the - // build), but we'll remove it at some point. - // Default: true - bool disable_seek_compaction; - // Puts are delayed 0-1 ms when any level has a compaction score that exceeds // soft_rate_limit. This is ignored when == 0.0. // CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not @@ -399,17 +344,9 @@ struct ColumnFamilyOptions { // Default: 0 (disabled) double hard_rate_limit; - // Max time a put will be stalled when hard_rate_limit is enforced. If 0, then - // there is no limit. - // Default: 1000 + // DEPRECATED -- this options is no longer used unsigned int rate_limit_delay_max_milliseconds; - // Disable block cache. If this is set to true, - // then no block cache should be used, and the block_cache should - // point to a nullptr object. - // Default: false - bool no_block_cache; - // size of one block in arena memory allocation. // If <= 0, a proper value is automatically calculated (usually 1/10 of // writer_buffer_size). @@ -433,14 +370,6 @@ struct ColumnFamilyOptions { // Default: true bool purge_redundant_kvs_while_flush; - // This is used to close a block before it reaches the configured - // 'block_size'. If the percentage of free space in the current block is less - // than this specified number and adding a new record to the block will - // exceed the configured block size, then this block will be closed and the - // new record will be written to the next block. - // Default is 10. - int block_size_deviation; - // The compaction style. 
Default: kCompactionStyleLevel CompactionStyle compaction_style; @@ -475,10 +404,24 @@ struct ColumnFamilyOptions { std::shared_ptr memtable_factory; // This is a factory that provides TableFactory objects. - // Default: a factory that provides a default implementation of - // Table and TableBuilder. + // Default: a block-based table factory that provides a default + // implementation of TableBuilder and TableReader with default + // BlockBasedTableOptions. std::shared_ptr table_factory; + // Block-based table related options are moved to BlockBasedTableOptions. + // Related options that were originally here but now moved include: + // no_block_cache + // block_cache + // block_cache_compressed + // block_size + // block_size_deviation + // block_restart_interval + // filter_policy + // whole_key_filtering + // If you'd like to customize some of these options, you will need to + // use NewBlockBasedTableFactory() to construct a new table factory. + // This option allows user to to collect their own interested statistics of // the tables. // Default: empty vector -- no user-defined statistics collection will be @@ -669,7 +612,7 @@ struct DBOptions { // it does not use any locks to prevent concurrent updates. std::shared_ptr statistics; - // If true, then the contents of data files are not synced + // If true, then the contents of manifest and data files are not synced // to stable storage. Their contents remain in the OS buffers till the // OS decides to flush them. This option is good for bulk-loading // of data. Once the bulk-loading is complete, please issue a @@ -684,9 +627,6 @@ struct DBOptions { // Default: false bool use_fsync; - // This options is not used!! - int db_stats_log_interval; - // A list of paths where SST files can be put into, with its target size. // Newer data is placed into paths specified earlier in the vector while // older data gradually moves to paths specified later in the vector. 
@@ -844,12 +784,13 @@ struct DBOptions { // Specify the file access pattern once a compaction is started. // It will be applied to all input files of a compaction. // Default: NORMAL - enum { - NONE, - NORMAL, - SEQUENTIAL, - WILLNEED - } access_hint_on_compaction_start; + enum AccessHint { + NONE, + NORMAL, + SEQUENTIAL, + WILLNEED + }; + AccessHint access_hint_on_compaction_start; // Use adaptive mutex, which spins in the user space before resorting // to kernel. This could reduce context switch when the mutex is not @@ -958,6 +899,18 @@ struct ReadOptions { // ! DEPRECATED // const Slice* prefix; + // "iterate_upper_bound" defines the extent upto which the forward iterator + // can returns entries. Once the bound is reached, Valid() will be false. + // "iterate_upper_bound" is exclusive ie the bound value is + // not a valid entry. If iterator_extractor is not null, the Seek target + // and iterator_upper_bound need to have the same prefix. + // This is because ordering is not guaranteed outside of prefix domain. + // There is no lower bound on the iterator. If needed, that can be easily + // implemented + // + // Default: nullptr + const Slice* iterate_upper_bound; + // Specify if this read request should process data that ALREADY // resides on a particular cache. If the required data is not // found at the specified cache, then Status::Incomplete is returned. @@ -972,18 +925,27 @@ struct ReadOptions { // Not supported in ROCKSDB_LITE mode! bool tailing; + // Enable a total order seek regardless of index format (e.g. hash index) + // used in the table. Some table format (e.g. plain table) may not support + // this option. 
+ bool total_order_seek; + ReadOptions() : verify_checksums(true), fill_cache(true), snapshot(nullptr), + iterate_upper_bound(nullptr), read_tier(kReadAllTier), - tailing(false) {} + tailing(false), + total_order_seek(false) {} ReadOptions(bool cksum, bool cache) : verify_checksums(cksum), fill_cache(cache), snapshot(nullptr), + iterate_upper_bound(nullptr), read_tier(kReadAllTier), - tailing(false) {} + tailing(false), + total_order_seek(false) {} }; // Options that control write operations @@ -1021,7 +983,17 @@ struct WriteOptions { // Default: 0 uint64_t timeout_hint_us; - WriteOptions() : sync(false), disableWAL(false), timeout_hint_us(0) {} + // If true and if user is trying to write to column families that don't exist + // (they were dropped), ignore the write (don't return an error). If there + // are multiple writes in a WriteBatch, other writes will succeed. + // Default: false + bool ignore_missing_column_families; + + WriteOptions() + : sync(false), + disableWAL(false), + timeout_hint_us(0), + ignore_missing_column_families(false) {} }; // Options that control flush operations @@ -1043,6 +1015,12 @@ extern Options GetOptions(size_t total_write_buffer_limit, int read_amplification_threshold = 8, int write_amplification_threshold = 32, uint64_t target_db_size = 68719476736 /* 64GB */); + +bool GetOptionsFromStrings( + const Options& base_options, + const std::unordered_map& options_map, + Options* new_options); + } // namespace rocksdb #endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 6785833b4d7..87ac321c904 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -115,7 +115,7 @@ enum Tickers : uint32_t { // head of the writers queue. WRITE_DONE_BY_SELF, WRITE_DONE_BY_OTHER, - WRITE_TIMEDOUT, // Number of writes ending up with timed-out. + WRITE_TIMEDOUT, // Number of writes ending up with timed-out. 
WRITE_WITH_WAL, // Number of Write calls that request WAL COMPACT_READ_BYTES, // Bytes read during compaction COMPACT_WRITE_BYTES, // Bytes written during compaction @@ -212,7 +212,6 @@ enum Histograms : uint32_t { READ_BLOCK_COMPACTION_MICROS, READ_BLOCK_GET_MICROS, WRITE_RAW_BLOCK_MICROS, - STALL_L0_SLOWDOWN_COUNT, STALL_MEMTABLE_COMPACTION_COUNT, STALL_L0_NUM_FILES_COUNT, @@ -220,6 +219,7 @@ enum Histograms : uint32_t { SOFT_RATE_LIMIT_DELAY_COUNT, NUM_FILES_IN_SINGLE_COMPACTION, DB_SEEK, + WRITE_STALL, HISTOGRAM_ENUM_MAX, }; diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index b20689a77c2..d13ff9d81f0 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -96,7 +96,7 @@ class Status { // Returns true iff the status indicates Incomplete bool IsIncomplete() const { return code() == kIncomplete; } - // Returns true iff the status indicates Incomplete + // Returns true iff the status indicates Shutdown In progress bool IsShutdownInProgress() const { return code() == kShutdownInProgress; } bool IsTimedOut() const { return code() == kTimedOut; } diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 6a73239e8c1..2b0255a97d0 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -23,6 +23,7 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" #include "rocksdb/status.h" namespace rocksdb { @@ -84,6 +85,46 @@ struct BlockBasedTableOptions { // protected with this checksum type. Old table files will still be readable, // even though they have different checksum type. ChecksumType checksum = kCRC32c; + + // Disable block cache. If this is set to true, + // then no block cache should be used, and the block_cache should + // point to a nullptr object. + bool no_block_cache = false; + + // If non-NULL use the specified cache for blocks. + // If NULL, rocksdb will automatically create and use an 8MB internal cache. 
+ std::shared_ptr block_cache = nullptr; + + // If non-NULL use the specified cache for compressed blocks. + // If NULL, rocksdb will not use a compressed block cache. + std::shared_ptr block_cache_compressed = nullptr; + + // Approximate size of user data packed per block. Note that the + // block size specified here corresponds to uncompressed data. The + // actual size of the unit read from disk may be smaller if + // compression is enabled. This parameter can be changed dynamically. + size_t block_size = 4 * 1024; + + // This is used to close a block before it reaches the configured + // 'block_size'. If the percentage of free space in the current block is less + // than this specified number and adding a new record to the block will + // exceed the configured block size, then this block will be closed and the + // new record will be written to the next block. + int block_size_deviation = 10; + + // Number of keys between restart points for delta encoding of keys. + // This parameter can be changed dynamically. Most clients should + // leave this parameter alone. + int block_restart_interval = 16; + + // If non-nullptr, use the specified filter policy to reduce disk reads. + // Many applications will benefit from passing the result of + // NewBloomFilterPolicy() here. + std::shared_ptr filter_policy = nullptr; + + // If true, place whole keys in the filter (not just prefixes). + // This must generally be true for gets to be efficient. + bool whole_key_filtering = true; }; // Table Properties that are specific to block-based table properties. @@ -126,47 +167,49 @@ struct PlainTablePropertyNames { const uint32_t kPlainTableVariableLength = 0; struct PlainTableOptions { -// @user_key_len: plain table has optimization for fix-sized keys, which can be -// specified via user_key_len. Alternatively, you can pass -// `kPlainTableVariableLength` if your keys have variable -// lengths. 
-uint32_t user_key_len = kPlainTableVariableLength; - -// @bloom_bits_per_key: the number of bits used for bloom filer per prefix. You -// may disable it by passing a zero. -int bloom_bits_per_key = 10; - -// @hash_table_ratio: the desired utilization of the hash table used for prefix -// hashing. hash_table_ratio = number of prefixes / #buckets -// in the hash table -double hash_table_ratio = 0.75; - -// @index_sparseness: inside each prefix, need to build one index record for how -// many keys for binary search inside each hash bucket. -// For encoding type kPrefix, the value will be used when -// writing to determine an interval to rewrite the full key. -// It will also be used as a suggestion and satisfied when -// possible. -size_t index_sparseness = 16; - -// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc. -// Otherwise from huge page TLB. The user needs to reserve -// huge pages for it to be allocated, like: -// sysctl -w vm.nr_hugepages=20 -// See linux doc Documentation/vm/hugetlbpage.txt -size_t huge_page_tlb_size = 0; - -// @encoding_type: how to encode the keys. See enum EncodingType above for -// the choices. The value will determine how to encode keys -// when writing to a new SST file. This value will be stored -// inside the SST file which will be used when reading from the -// file, which makes it possible for users to choose different -// encoding type when reopening a DB. Files with different -// encoding types can co-exist in the same DB and can be read. -EncodingType encoding_type = kPlain; - -// @full_scan_mode: mode for reading the whole file one record by one without -// using the index. + // @user_key_len: plain table has optimization for fix-sized keys, which can + // be specified via user_key_len. Alternatively, you can pass + // `kPlainTableVariableLength` if your keys have variable + // lengths. 
+ uint32_t user_key_len = kPlainTableVariableLength; + + // @bloom_bits_per_key: the number of bits used for bloom filer per prefix. + // You may disable it by passing a zero. + int bloom_bits_per_key = 10; + + // @hash_table_ratio: the desired utilization of the hash table used for + // prefix hashing. + // hash_table_ratio = number of prefixes / #buckets in the + // hash table + double hash_table_ratio = 0.75; + + // @index_sparseness: inside each prefix, need to build one index record for + // how many keys for binary search inside each hash bucket. + // For encoding type kPrefix, the value will be used when + // writing to determine an interval to rewrite the full + // key. It will also be used as a suggestion and satisfied + // when possible. + size_t index_sparseness = 16; + + // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc. + // Otherwise from huge page TLB. The user needs to + // reserve huge pages for it to be allocated, like: + // sysctl -w vm.nr_hugepages=20 + // See linux doc Documentation/vm/hugetlbpage.txt + size_t huge_page_tlb_size = 0; + + // @encoding_type: how to encode the keys. See enum EncodingType above for + // the choices. The value will determine how to encode keys + // when writing to a new SST file. This value will be stored + // inside the SST file which will be used when reading from + // the file, which makes it possible for users to choose + // different encoding type when reopening a DB. Files with + // different encoding types can co-exist in the same DB and + // can be read. + EncodingType encoding_type = kPlain; + + // @full_scan_mode: mode for reading the whole file one record by one without + // using the index. 
bool full_scan_mode = false; // @store_index_in_file: compute plain table index and bloom filter during @@ -185,15 +228,59 @@ extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options = PlainTableOptions()); struct CuckooTablePropertyNames { + // The key that is used to fill empty buckets. static const std::string kEmptyKey; + // Fixed length of value. static const std::string kValueLength; - static const std::string kNumHashTable; - static const std::string kMaxNumBuckets; + // Number of hash functions used in Cuckoo Hash. + static const std::string kNumHashFunc; + // It denotes the number of buckets in a Cuckoo Block. Given a key and a + // particular hash function, a Cuckoo Block is a set of consecutive buckets, + // where starting bucket id is given by the hash function on the key. In case + // of a collision during inserting the key, the builder tries to insert the + // key in other locations of the cuckoo block before using the next hash + // function. This reduces cache miss during read operation in case of + // collision. + static const std::string kCuckooBlockSize; + // Size of the hash table. Use this number to compute the modulo of hash + // function. The actual number of buckets will be kMaxHashTableSize + + // kCuckooBlockSize - 1. The last kCuckooBlockSize-1 buckets are used to + // accommodate the Cuckoo Block from end of hash table, due to cache friendly + // implementation. + static const std::string kHashTableSize; + // Denotes if the key sorted in the file is Internal Key (if false) + // or User Key only (if true). static const std::string kIsLastLevel; + // Indicate if using identity function for the first hash function. + static const std::string kIdentityAsFirstHash; +}; + +struct CuckooTableOptions { + // Determines the utilization of hash tables. Smaller values + // result in larger hash tables with fewer collisions. 
+ double hash_table_ratio = 0.9; + // A property used by builder to determine the depth to go to + // to search for a path to displace elements in case of + // collision. See Builder.MakeSpaceForKey method. Higher + // values result in more efficient hash tables with fewer + // lookups but take more time to build. + uint32_t max_search_depth = 100; + // In case of collision while inserting, the builder + // attempts to insert in the next cuckoo_block_size + // locations before skipping over to the next Cuckoo hash + // function. This makes lookups more cache friendly in case + // of collisions. + uint32_t cuckoo_block_size = 5; + // If this options is enabled, user key is treated as uint64_t and its value + // is used as hash value directly. This option changes builder's behavior. + // Reader ignore this option and behave according to what specified in table + // property. + bool identity_as_first_hash = false; }; -extern TableFactory* NewCuckooTableFactory(double hash_table_ratio = 0.9, - uint32_t max_search_depth = 100); +// Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo Hashing +extern TableFactory* NewCuckooTableFactory( + const CuckooTableOptions& table_options = CuckooTableOptions()); #endif // ROCKSDB_LITE @@ -220,14 +307,15 @@ class TableFactory { // and cache the table object returned. // (1) SstFileReader (for SST Dump) opens the table and dump the table // contents using the interator of the table. - // options and soptions are options. options is the general options. + // ImmutableCFOptions is a subset of Options that can not be altered. + // EnvOptions is a subset of Options that will be used by Env. // Multiple configured can be accessed from there, including and not // limited to block cache and key comparators. 
// file is a file handler to handle the file for the table // file_size is the physical file size of the file // table_reader is the output table reader virtual Status NewTableReader( - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const = 0; @@ -245,14 +333,27 @@ class TableFactory { // (4) When running Repairer, it creates a table builder to convert logs to // SST files (In Repairer::ConvertLogToTable() by calling BuildTable()) // - // options is the general options. Multiple configured can be acceseed from - // there, including and not limited to compression options. - // file is a handle of a writable file. It is the caller's responsibility to - // keep the file open and close the file after closing the table builder. - // compression_type is the compression type to use in this table. + // ImmutableCFOptions is a subset of Options that can not be altered. + // Multiple configured can be acceseed from there, including and not limited + // to compression options. file is a handle of a writable file. + // It is the caller's responsibility to keep the file open and close the file + // after closing the table builder. compression_type is the compression type + // to use in this table. virtual TableBuilder* NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const = 0; + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts) const = 0; + + // Sanitizes the specified DB Options. + // + // If the function cannot find a way to sanitize the input DB Options, + // a non-ok Status will be returned. 
+ virtual Status SanitizeDBOptions(const DBOptions* db_opts) const = 0; + + // Return a string that contains printable format of table configurations. + // RocksDB prints configurations at DB Open(). + virtual std::string GetPrintableTableOptions() const = 0; }; #ifndef ROCKSDB_LITE diff --git a/include/rocksdb/utilities/backupable_db.h b/include/rocksdb/utilities/backupable_db.h index 78365769d2e..57a8accdf8e 100644 --- a/include/rocksdb/utilities/backupable_db.h +++ b/include/rocksdb/utilities/backupable_db.h @@ -10,7 +10,10 @@ #pragma once #ifndef ROCKSDB_LITE +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include @@ -127,9 +130,41 @@ struct BackupInfo { int64_t timestamp; uint64_t size; + uint32_t number_files; + BackupInfo() {} - BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size) - : backup_id(_backup_id), timestamp(_timestamp), size(_size) {} + + BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size, + uint32_t _number_files) + : backup_id(_backup_id), timestamp(_timestamp), size(_size), + number_files(_number_files) {} +}; + +class BackupStatistics { + public: + BackupStatistics() { + number_success_backup = 0; + number_fail_backup = 0; + } + + BackupStatistics(uint32_t _number_success_backup, + uint32_t _number_fail_backup) + : number_success_backup(_number_success_backup), + number_fail_backup(_number_fail_backup) {} + + ~BackupStatistics() {} + + void IncrementNumberSuccessBackup(); + void IncrementNumberFailBackup(); + + uint32_t GetNumberSuccessBackup() const; + uint32_t GetNumberFailBackup() const; + + std::string ToString() const; + + private: + uint32_t number_success_backup; + uint32_t number_fail_backup; }; class BackupEngineReadOnly { diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h new file mode 100644 index 00000000000..85c80850fc3 --- /dev/null +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -0,0 
+1,105 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A WriteBatchWithIndex with a binary searchable index built for all the keys +// inserted. + +#pragma once + +#include "rocksdb/comparator.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/write_batch.h" + +namespace rocksdb { + +class ColumnFamilyHandle; +struct SliceParts; +class Comparator; + +enum WriteType { kPutRecord, kMergeRecord, kDeleteRecord, kLogDataRecord }; + +// an entry for Put, Merge or Delete entry for write batches. Used in +// WBWIIterator. +struct WriteEntry { + WriteType type; + Slice key; + Slice value; +}; + +// Iterator of one column family out of a WriteBatchWithIndex. +class WBWIIterator { + public: + virtual ~WBWIIterator() {} + + virtual bool Valid() const = 0; + + virtual void Seek(const Slice& key) = 0; + + virtual void Next() = 0; + + virtual const WriteEntry& Entry() const = 0; + + virtual Status status() const = 0; +}; + +// A WriteBatchWithIndex with a binary searchable index built for all the keys +// inserted. +// In Put(), Merge() or Delete(), the same function of the wrapped will be +// called. At the same time, indexes will be built. +// By calling GetWriteBatch(), a user will get the WriteBatch for the data +// they inserted, which can be used for DB::Write(). +// A user can call NewIterator() to create an iterator. 
+class WriteBatchWithIndex { + public: + // backup_index_comparator: the backup comparator used to compare keys + // within the same column family, if column family is not given in the + // interface, or we can't find a column family from the column family handle + // passed in, backup_index_comparator will be used for the column family. + // reserved_bytes: reserved bytes in underlying WriteBatch + explicit WriteBatchWithIndex( + const Comparator* backup_index_comparator = BytewiseComparator(), + size_t reserved_bytes = 0); + virtual ~WriteBatchWithIndex(); + + WriteBatch* GetWriteBatch(); + + virtual void Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value); + + virtual void Put(const Slice& key, const Slice& value); + + virtual void Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value); + + virtual void Merge(const Slice& key, const Slice& value); + + virtual void PutLogData(const Slice& blob); + + virtual void Delete(ColumnFamilyHandle* column_family, const Slice& key); + virtual void Delete(const Slice& key); + + virtual void Delete(ColumnFamilyHandle* column_family, const SliceParts& key); + + virtual void Delete(const SliceParts& key); + + // Create an iterator of a column family. User can call iterator.Seek() to + // search to the next entry of or after a key. Keys will be iterated in the + // order given by index_comparator. For multiple updates on the same key, + // each update will be returned as a separate entry, in the order of update + // time. + virtual WBWIIterator* NewIterator(ColumnFamilyHandle* column_family); + // Create an iterator of the default column family. 
+ virtual WBWIIterator* NewIterator(); + + private: + struct Rep; + Rep* rep; +}; + +} // namespace rocksdb diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index 3272fd2f94e..db440be023f 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -152,6 +152,7 @@ class WriteBatch { private: friend class WriteBatchInternal; + protected: std::string rep_; // See comment in write_batch.cc for the format of rep_ // Intentionally copyable diff --git a/java/Makefile b/java/Makefile index 238ddd93ef6..b2f3674f06f 100644 --- a/java/Makefile +++ b/java/Makefile @@ -1,4 +1,4 @@ -NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv +NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig NATIVE_INCLUDE = ./include ROCKSDB_JAR = rocksdbjni.jar diff --git a/java/RocksDBSample.java 
b/java/RocksDBSample.java index dfecde3428b..d78a070dfae 100644 --- a/java/RocksDBSample.java +++ b/java/RocksDBSample.java @@ -35,16 +35,11 @@ public static void main(String[] args) { assert(db == null); } - Filter filter = new BloomFilter(10); options.setCreateIfMissing(true) .createStatistics() .setWriteBufferSize(8 * SizeUnit.KB) .setMaxWriteBufferNumber(3) - .setDisableSeekCompaction(true) - .setBlockSize(64 * SizeUnit.KB) .setMaxBackgroundCompactions(10) - .setFilter(filter) - .setCacheNumShardBits(6) .setCompressionType(CompressionType.SNAPPY_COMPRESSION) .setCompactionStyle(CompactionStyle.UNIVERSAL); Statistics stats = options.statisticsPtr(); @@ -52,10 +47,7 @@ public static void main(String[] args) { assert(options.createIfMissing() == true); assert(options.writeBufferSize() == 8 * SizeUnit.KB); assert(options.maxWriteBufferNumber() == 3); - assert(options.disableSeekCompaction() == true); - assert(options.blockSize() == 64 * SizeUnit.KB); assert(options.maxBackgroundCompactions() == 10); - assert(options.cacheNumShardBits() == 6); assert(options.compressionType() == CompressionType.SNAPPY_COMPRESSION); assert(options.compactionStyle() == CompactionStyle.UNIVERSAL); @@ -80,7 +72,22 @@ public static void main(String[] args) { assert(options.memTableFactoryName().equals("SkipListFactory")); options.setTableFormatConfig(new PlainTableConfig()); + // Plain-Table requires mmap read + options.setAllowMmapReads(true); assert(options.tableFactoryName().equals("PlainTable")); + + options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000, + 10000, 10)); + options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000)); + + BlockBasedTableConfig table_options = new BlockBasedTableConfig(); + table_options.setBlockCacheSize(64 * SizeUnit.KB) + .setFilterBitsPerKey(10) + .setCacheNumShardBits(6); + assert(table_options.blockCacheSize() == 64 * SizeUnit.KB); + assert(table_options.cacheNumShardBits() == 6); + options.setTableFormatConfig(table_options); 
+ assert(options.tableFactoryName().equals("BlockBasedTable")); try { db = RocksDB.open(options, db_path_not_found); @@ -120,6 +127,29 @@ public static void main(String[] args) { System.out.println(""); } + // write batch test + WriteOptions writeOpt = new WriteOptions(); + for (int i = 10; i <= 19; ++i) { + WriteBatch batch = new WriteBatch(); + for (int j = 10; j <= 19; ++j) { + batch.put(String.format("%dx%d", i, j).getBytes(), + String.format("%d", i * j).getBytes()); + } + db.write(writeOpt, batch); + batch.dispose(); + } + for (int i = 10; i <= 19; ++i) { + for (int j = 10; j <= 19; ++j) { + assert(new String( + db.get(String.format("%dx%d", i, j).getBytes())).equals( + String.format("%d", i * j))); + System.out.format("%s ", new String(db.get( + String.format("%dx%d", i, j).getBytes()))); + } + System.out.println(""); + } + writeOpt.dispose(); + value = db.get("1x1".getBytes()); assert(value != null); value = db.get("world".getBytes()); @@ -254,6 +284,5 @@ public static void main(String[] args) { // be sure to dispose c++ pointers options.dispose(); readOptions.dispose(); - filter.dispose(); } } diff --git a/java/org/rocksdb/BlockBasedTableConfig.java b/java/org/rocksdb/BlockBasedTableConfig.java new file mode 100644 index 00000000000..523a5769118 --- /dev/null +++ b/java/org/rocksdb/BlockBasedTableConfig.java @@ -0,0 +1,210 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +package org.rocksdb; + +/** + * The config for plain table sst format. + * + * BlockBasedTable is a RocksDB's default SST file format. 
+ */ +public class BlockBasedTableConfig extends TableFormatConfig { + + public BlockBasedTableConfig() { + noBlockCache_ = false; + blockCacheSize_ = 8 * 1024 * 1024; + blockSize_ = 4 * 1024; + blockSizeDeviation_ =10; + blockRestartInterval_ =16; + wholeKeyFiltering_ = true; + bitsPerKey_ = 0; + } + + /** + * Disable block cache. If this is set to true, + * then no block cache should be used, and the block_cache should + * point to a nullptr object. + * Default: false + * + * @param noBlockCache if use block cache + * @return the reference to the current config. + */ + public BlockBasedTableConfig setNoBlockCache(boolean noBlockCache) { + noBlockCache_ = noBlockCache; + return this; + } + + /** + * @return if block cache is disabled + */ + public boolean noBlockCache() { + return noBlockCache_; + } + + /** + * Set the amount of cache in bytes that will be used by RocksDB. + * If cacheSize is non-positive, then cache will not be used. + * DEFAULT: 8M + * + * @param blockCacheSize block cache size in bytes + * @return the reference to the current config. + */ + public BlockBasedTableConfig setBlockCacheSize(long blockCacheSize) { + blockCacheSize_ = blockCacheSize; + return this; + } + + /** + * @return block cache size in bytes + */ + public long blockCacheSize() { + return blockCacheSize_; + } + + /** + * Controls the number of shards for the block cache. + * This is applied only if cacheSize is set to non-negative. + * + * @param numShardBits the number of shard bits. The resulting + * number of shards would be 2 ^ numShardBits. Any negative + * number means use default settings." + * @return the reference to the current option. + */ + public BlockBasedTableConfig setCacheNumShardBits(int numShardBits) { + numShardBits_ = numShardBits; + return this; + } + + /** + * Returns the number of shard bits used in the block cache. + * The resulting number of shards would be 2 ^ (returned value). + * Any negative number means use default settings. 
+ * + * @return the number of shard bits used in the block cache. + */ + public int cacheNumShardBits() { + return numShardBits_; + } + + /** + * Approximate size of user data packed per block. Note that the + * block size specified here corresponds to uncompressed data. The + * actual size of the unit read from disk may be smaller if + * compression is enabled. This parameter can be changed dynamically. + * Default: 4K + * + * @param blockSize block size in bytes + * @return the reference to the current config. + */ + public BlockBasedTableConfig setBlockSize(long blockSize) { + blockSize_ = blockSize; + return this; + } + + /** + * @return block size in bytes + */ + public long blockSize() { + return blockSize_; + } + + /** + * This is used to close a block before it reaches the configured + * 'block_size'. If the percentage of free space in the current block is less + * than this specified number and adding a new record to the block will + * exceed the configured block size, then this block will be closed and the + * new record will be written to the next block. + * Default is 10. + * + * @param blockSizeDeviation the deviation to block size allowed + * @return the reference to the current config. + */ + public BlockBasedTableConfig setBlockSizeDeviation(int blockSizeDeviation) { + blockSizeDeviation_ = blockSizeDeviation; + return this; + } + + /** + * @return the hash table ratio. + */ + public int blockSizeDeviation() { + return blockSizeDeviation_; + } + + /** + * Set block restart interval + * + * @param restartInterval block restart interval. + * @return the reference to the current config. + */ + public BlockBasedTableConfig setBlockRestartInterval(int restartInterval) { + blockRestartInterval_ = restartInterval; + return this; + } + + /** + * @return block restart interval + */ + public int blockRestartInterval() { + return blockRestartInterval_; + } + + /** + * If true, place whole keys in the filter (not just prefixes). 
+ * This must generally be true for gets to be efficient. + * Default: true + * + * @param wholeKeyFiltering if enable whole key filtering + * @return the reference to the current config. + */ + public BlockBasedTableConfig setWholeKeyFiltering(boolean wholeKeyFiltering) { + wholeKeyFiltering_ = wholeKeyFiltering; + return this; + } + + /** + * @return if whole key filtering is enabled + */ + public boolean wholeKeyFiltering() { + return wholeKeyFiltering_; + } + + /** + * Use the specified filter policy to reduce disk reads. + * + * Filter should not be disposed before options instances using this filter is + * disposed. If dispose() function is not called, then filter object will be + * GC'd automatically. + * + * Filter instance can be re-used in multiple options instances. + * + * @param Filter policy java instance. + * @return the reference to the current config. + */ + public BlockBasedTableConfig setFilterBitsPerKey(int bitsPerKey) { + bitsPerKey_ = bitsPerKey; + return this; + } + + @Override protected long newTableFactoryHandle() { + return newTableFactoryHandle(noBlockCache_, blockCacheSize_, numShardBits_, + blockSize_, blockSizeDeviation_, blockRestartInterval_, + wholeKeyFiltering_, bitsPerKey_); + } + + private native long newTableFactoryHandle( + boolean noBlockCache, long blockCacheSize, int numShardbits, + long blockSize, int blockSizeDeviation, int blockRestartInterval, + boolean wholeKeyFiltering, int bitsPerKey); + + private boolean noBlockCache_; + private long blockCacheSize_; + private int numShardBits_; + private long shard; + private long blockSize_; + private int blockSizeDeviation_; + private int blockRestartInterval_; + private boolean wholeKeyFiltering_; + private int bitsPerKey_; +} diff --git a/java/org/rocksdb/GenericRateLimiterConfig.java b/java/org/rocksdb/GenericRateLimiterConfig.java new file mode 100644 index 00000000000..78b8b37ec65 --- /dev/null +++ b/java/org/rocksdb/GenericRateLimiterConfig.java @@ -0,0 +1,36 @@ +// 
Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +package org.rocksdb; + +/** + * Config for rate limiter, which is used to control write rate of flush and + * compaction. + */ +public class GenericRateLimiterConfig extends RateLimiterConfig { + private static final long DEFAULT_REFILL_PERIOD_MICROS = (100 * 1000); + private static final int DEFAULT_FAIRNESS = 10; + + public GenericRateLimiterConfig(long rateBytesPerSecond, + long refillPeriodMicros, int fairness) { + rateBytesPerSecond_ = rateBytesPerSecond; + refillPeriodMicros_ = refillPeriodMicros; + fairness_ = fairness; + } + + public GenericRateLimiterConfig(long rateBytesPerSecond) { + this(rateBytesPerSecond, DEFAULT_REFILL_PERIOD_MICROS, DEFAULT_FAIRNESS); + } + + @Override protected long newRateLimiterHandle() { + return newRateLimiterHandle(rateBytesPerSecond_, refillPeriodMicros_, + fairness_); + } + + private native long newRateLimiterHandle(long rateBytesPerSecond, + long refillPeriodMicros, int fairness); + private final long rateBytesPerSecond_; + private final long refillPeriodMicros_; + private final int fairness_; +} diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 4ed0b8025d2..33ca19d9d03 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -139,135 +139,6 @@ public int maxWriteBufferNumber() { return maxWriteBufferNumber(nativeHandle_); } - /* - * Approximate size of user data packed per block. Note that the - * block size specified here corresponds to uncompressed data. The - * actual size of the unit read from disk may be smaller if - * compression is enabled. This parameter can be changed dynamically. - * - * Default: 4K - * - * @param blockSize the size of each block in bytes. 
- * @return the instance of the current Options. - * @see RocksDB.open() - */ - public Options setBlockSize(long blockSize) { - assert(isInitialized()); - setBlockSize(nativeHandle_, blockSize); - return this; - } - - /* - * Returns the size of a block in bytes. - * - * @return block size. - * @see setBlockSize() - */ - public long blockSize() { - assert(isInitialized()); - return blockSize(nativeHandle_); - } - - /** - * Use the specified filter policy to reduce disk reads. - * - * Filter should not be disposed before options instances using this filter is - * disposed. If dispose() function is not called, then filter object will be - * GC'd automatically. - * - * Filter instance can be re-used in multiple options instances. - * - * @param Filter policy java instance. - * @return the instance of the current Options. - * @see RocksDB.open() - */ - public Options setFilter(Filter filter) { - assert(isInitialized()); - setFilterHandle(nativeHandle_, filter.nativeHandle_); - filter_ = filter; - return this; - } - private native void setFilterHandle(long optHandle, long filterHandle); - - /* - * Disable compaction triggered by seek. - * With bloomfilter and fast storage, a miss on one level - * is very cheap if the file handle is cached in table cache - * (which is true if max_open_files is large). - * Default: true - * - * @param disableSeekCompaction a boolean value to specify whether - * to disable seek compaction. - * @return the instance of the current Options. - * @see RocksDB.open() - */ - public Options setDisableSeekCompaction(boolean disableSeekCompaction) { - assert(isInitialized()); - setDisableSeekCompaction(nativeHandle_, disableSeekCompaction); - return this; - } - - /* - * Returns true if disable seek compaction is set to true. - * - * @return true if disable seek compaction is set to true. 
- * @see setDisableSeekCompaction() - */ - public boolean disableSeekCompaction() { - assert(isInitialized()); - return disableSeekCompaction(nativeHandle_); - } - - /** - * Set the amount of cache in bytes that will be used by RocksDB. - * If cacheSize is non-positive, then cache will not be used. - * - * DEFAULT: 8M - * @see setCacheNumShardBits() - */ - public Options setCacheSize(long cacheSize) { - cacheSize_ = cacheSize; - return this; - } - - /** - * @return the amount of cache in bytes that will be used by RocksDB. - * - * @see cacheNumShardBits() - */ - public long cacheSize() { - return cacheSize_; - } - - /** - * Controls the number of shards for the block cache. - * This is applied only if cacheSize is set to non-negative. - * - * @param numShardBits the number of shard bits. The resulting - * number of shards would be 2 ^ numShardBits. Any negative - * number means use default settings." - * @return the reference to the current option. - * - * @see setCacheSize() - */ - public Options setCacheNumShardBits(int numShardBits) { - numShardBits_ = numShardBits; - return this; - } - - /** - * Returns the number of shard bits used in the block cache. - * The resulting number of shards would be 2 ^ (returned value). - * Any negative number means use default settings. - * - * @return the number of shard bits used in the block cache. - * - * @see cacheSize() - */ - public int cacheNumShardBits() { - return numShardBits_; - } - /** * If true, an error will be thrown during RocksDB.open() if the * database already exists. @@ -437,40 +308,6 @@ public Options setUseFsync(boolean useFsync) { } private native void setUseFsync(long handle, boolean useFsync); - /** - * The time interval in seconds between each two consecutive stats logs. - * This number controls how often a new scribe log about - * db deploy stats is written out. - * -1 indicates no logging at all. - * - * @return the time interval in seconds between each two consecutive - * stats logs. 
- */ - public int dbStatsLogInterval() { - assert(isInitialized()); - return dbStatsLogInterval(nativeHandle_); - } - private native int dbStatsLogInterval(long handle); - - /** - * The time interval in seconds between each two consecutive stats logs. - * This number controls how often a new scribe log about - * db deploy stats is written out. - * -1 indicates no logging at all. - * Default value is 1800 (half an hour). - * - * @param dbStatsLogInterval the time interval in seconds between each - * two consecutive stats logs. - * @return the reference to the current option. - */ - public Options setDbStatsLogInterval(int dbStatsLogInterval) { - assert(isInitialized()); - setDbStatsLogInterval(nativeHandle_, dbStatsLogInterval); - return this; - } - private native void setDbStatsLogInterval( - long handle, int dbStatsLogInterval); - /** * Returns the directory of info log. * @@ -1270,6 +1107,19 @@ public Options setMemTableConfig(MemTableConfig config) { setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle()); return this; } + + /** + * Use to control write rate of flush and compaction. Flush has higher + * priority than compaction. Rate limiting is disabled if nullptr. + * Default: nullptr + * + * @param config rate limiter config. + * @return the instance of the current Options. + */ + public Options setRateLimiterConfig(RateLimiterConfig config) { + setRateLimiter(nativeHandle_, config.newRateLimiterHandle()); + return this; + } /** * Returns the name of the current mem table representation. @@ -1347,26 +1197,26 @@ public Options setBlockRestartInterval(int blockRestartInterval) { } private native void setBlockRestartInterval( long handle, int blockRestartInterval); - + /** * Compress blocks using the specified compression algorithm. This parameter can be changed dynamically. - * + * * Default: SNAPPY_COMPRESSION, which gives lightweight but fast compression. - * + * * @return Compression type. 
- */ + */ public CompressionType compressionType() { return CompressionType.values()[compressionType(nativeHandle_)]; } private native byte compressionType(long handle); - + /** * Compress blocks using the specified compression algorithm. This parameter can be changed dynamically. - * + * * Default: SNAPPY_COMPRESSION, which gives lightweight but fast compression. - * + * * @param compressionType Compression Type. * @return the reference to the current option. */ @@ -1375,22 +1225,22 @@ public Options setCompressionType(CompressionType compressionType) { return this; } private native void setCompressionType(long handle, byte compressionType); - + /** * Compaction style for DB. - * + * * @return Compaction style. - */ + */ public CompactionStyle compactionStyle() { return CompactionStyle.values()[compactionStyle(nativeHandle_)]; } private native byte compactionStyle(long handle); - + /** * Set compaction style for DB. - * + * * Default: LEVEL. - * + * * @param compactionStyle Compaction style. * @return the reference to the current option. */ @@ -1400,33 +1250,6 @@ public Options setCompactionStyle(CompactionStyle compactionStyle) { } private native void setCompactionStyle(long handle, byte compactionStyle); - /** - * If true, place whole keys in the filter (not just prefixes). - * This must generally be true for gets to be efficient. - * Default: true - * - * @return if true, then whole-key-filtering is on. - */ - public boolean wholeKeyFiltering() { - return wholeKeyFiltering(nativeHandle_); - } - private native boolean wholeKeyFiltering(long handle); - - /** - * If true, place whole keys in the filter (not just prefixes). - * This must generally be true for gets to be efficient. - * Default: true - * - * @param wholeKeyFiltering if true, then whole-key-filtering is on. - * @return the reference to the current option. 
- */ - public Options setWholeKeyFiltering(boolean wholeKeyFiltering) { - setWholeKeyFiltering(nativeHandle_, wholeKeyFiltering); - return this; - } - private native void setWholeKeyFiltering( - long handle, boolean wholeKeyFiltering); - /** * If level-styled compaction is used, then this number determines * the total number of levels. @@ -1900,35 +1723,6 @@ public Options setRateLimitDelayMaxMilliseconds( private native void setRateLimitDelayMaxMilliseconds( long handle, int rateLimitDelayMaxMilliseconds); - /** - * Disable block cache. If this is set to true, - * then no block cache should be used, and the block_cache should - * point to a nullptr object. - * Default: false - * - * @return true if block cache is disabled. - */ - public boolean noBlockCache() { - return noBlockCache(nativeHandle_); - } - private native boolean noBlockCache(long handle); - - /** - * Disable block cache. If this is set to true, - * then no block cache should be used, and the block_cache should - * point to a nullptr object. - * Default: false - * - * @param noBlockCache true if block-cache is disabled. - * @return the reference to the current option. - */ - public Options setNoBlockCache(boolean noBlockCache) { - setNoBlockCache(nativeHandle_, noBlockCache); - return this; - } - private native void setNoBlockCache( - long handle, boolean noBlockCache); - /** * The size of one block in arena memory allocation. * If <= 0, a proper value is automatically calculated (usually 1/10 of @@ -2026,39 +1820,6 @@ public Options setPurgeRedundantKvsWhileFlush( private native void setPurgeRedundantKvsWhileFlush( long handle, boolean purgeRedundantKvsWhileFlush); - /** - * This is used to close a block before it reaches the configured - * 'block_size'. 
If the percentage of free space in the current block is less - * than this specified number and adding a new record to the block will - * exceed the configured block size, then this block will be closed and the - * new record will be written to the next block. - * Default is 10. - * - * @return the target block size - */ - public int blockSizeDeviation() { - return blockSizeDeviation(nativeHandle_); - } - private native int blockSizeDeviation(long handle); - - /** - * This is used to close a block before it reaches the configured - * 'block_size'. If the percentage of free space in the current block is less - * than this specified number and adding a new record to the block will - * exceed the configured block size, then this block will be closed and the - * new record will be written to the next block. - * Default is 10. - * - * @param blockSizeDeviation the target block size - * @return the reference to the current option. - */ - public Options setBlockSizeDeviation(int blockSizeDeviation) { - setBlockSizeDeviation(nativeHandle_, blockSizeDeviation); - return this; - } - private native void setBlockSizeDeviation( - long handle, int blockSizeDeviation); - /** * If true, compaction will verify checksum on every read that happens * as part of compaction @@ -2440,11 +2201,6 @@ private native void setMinPartialMergeOperands( private native void setMaxWriteBufferNumber( long handle, int maxWriteBufferNumber); private native int maxWriteBufferNumber(long handle); - private native void setBlockSize(long handle, long blockSize); - private native long blockSize(long handle); - private native void setDisableSeekCompaction( - long handle, boolean disableSeekCompaction); - private native boolean disableSeekCompaction(long handle); private native void setMaxBackgroundCompactions( long handle, int maxBackgroundCompactions); private native int maxBackgroundCompactions(long handle); @@ -2452,6 +2208,8 @@ private native void setMaxBackgroundCompactions( private native long 
statisticsPtr(long optHandle); private native void setMemTableFactory(long handle, long factoryHandle); + private native void setRateLimiter(long handle, + long rateLimiterHandle); private native String memTableFactoryName(long handle); private native void setTableFactory(long handle, long factoryHandle); @@ -2462,6 +2220,5 @@ private native void useFixedLengthPrefixExtractor( long cacheSize_; int numShardBits_; - Filter filter_; RocksEnv env_; } diff --git a/java/org/rocksdb/RateLimiterConfig.java b/java/org/rocksdb/RateLimiterConfig.java new file mode 100644 index 00000000000..22de6592198 --- /dev/null +++ b/java/org/rocksdb/RateLimiterConfig.java @@ -0,0 +1,20 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +package org.rocksdb; + +/** + * Config for rate limiter, which is used to control write rate of flush and + * compaction. + */ +public abstract class RateLimiterConfig { + /** + * This function should only be called by Options.setRateLimiter(), + * which will create a c++ shared-pointer to the c++ RateLimiter + * that is associated with the Java RateLimtierConifg. + * + * @see Options.setRateLimiter() + */ + abstract protected long newRateLimiterHandle(); +} diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index ec1cb8a28a1..91726253b27 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -114,11 +114,11 @@ public static RocksDB open(String path) throws RocksDBException { /** * The factory constructor of RocksDB that opens a RocksDB instance given * the path to the database using the specified options and db path. - * + * * Options instance *should* not be disposed before all DBs using this options * instance have been closed. 
If user doesn't call options dispose explicitly, * then this options instance will be GC'd automatically. - * + * * Options instance can be re-used to open multiple DBs if DB statistics is * not used. If DB statistics are required, then its recommended to open DB * with new Options instance as underlying native statistics instance does not @@ -130,13 +130,12 @@ public static RocksDB open(Options options, String path) // in RocksDB can prevent Java to GC during the life-time of // the currently-created RocksDB. RocksDB db = new RocksDB(); - db.open(options.nativeHandle_, options.cacheSize_, - options.numShardBits_, path); - + db.open(options.nativeHandle_, path); + db.storeOptionsInstance(options); return db; } - + private void storeOptionsInstance(Options options) { options_ = options; } @@ -349,8 +348,7 @@ protected RocksDB() { // native methods protected native void open( - long optionsHandle, long cacheSize, int numShardBits, - String path) throws RocksDBException; + long optionsHandle, String path) throws RocksDBException; protected native void put( long handle, byte[] key, int keyLen, byte[] value, int valueLen) throws RocksDBException; diff --git a/java/org/rocksdb/benchmark/DbBenchmark.java b/java/org/rocksdb/benchmark/DbBenchmark.java index 36eea0c1745..686d39445f3 100644 --- a/java/org/rocksdb/benchmark/DbBenchmark.java +++ b/java/org/rocksdb/benchmark/DbBenchmark.java @@ -255,7 +255,7 @@ public WriteTask( for (long j = 0; j < entriesPerBatch_; j++) { getKey(key, i + j, keyRange_); DbBenchmark.this.gen_.generate(value); - db_.put(writeOpt_, key, value); + batch.put(key, value); stats_.finishedSingleOp(keySize_ + valueSize_); } db_.write(writeOpt_, batch); @@ -446,7 +446,6 @@ public DbBenchmark(Map flags) throws Exception { randSeed_ = (Long) flags.get(Flag.seed); databaseDir_ = (String) flags.get(Flag.db); writesPerSeconds_ = (Integer) flags.get(Flag.writes_per_second); - cacheSize_ = (Long) flags.get(Flag.cache_size); memtable_ = (String) 
flags.get(Flag.memtablerep); maxWriteBufferNumber_ = (Integer) flags.get(Flag.max_write_buffer_number); prefixSize_ = (Integer) flags.get(Flag.prefix_size); @@ -491,7 +490,6 @@ private void prepareWriteOptions(WriteOptions options) { } private void prepareOptions(Options options) { - options.setCacheSize(cacheSize_); if (!useExisting_) { options.setCreateIfMissing(true); } else { @@ -521,6 +519,13 @@ private void prepareOptions(Options options) { if (usePlainTable_) { options.setTableFormatConfig( new PlainTableConfig().setKeySize(keySize_)); + } else { + BlockBasedTableConfig table_options = new BlockBasedTableConfig(); + table_options.setBlockSize((Long)flags_.get(Flag.block_size)) + .setBlockCacheSize((Long)flags_.get(Flag.cache_size)) + .setFilterBitsPerKey((Integer)flags_.get(Flag.bloom_bits)) + .setCacheNumShardBits((Integer)flags_.get(Flag.cache_numshardbits)); + options.setTableFormatConfig(table_options); } options.setWriteBufferSize( (Long)flags_.get(Flag.write_buffer_size)); @@ -532,12 +537,6 @@ private void prepareOptions(Options options) { (Integer)flags_.get(Flag.max_background_compactions)); options.setMaxBackgroundFlushes( (Integer)flags_.get(Flag.max_background_flushes)); - options.setCacheSize( - (Long)flags_.get(Flag.cache_size)); - options.setCacheNumShardBits( - (Integer)flags_.get(Flag.cache_numshardbits)); - options.setBlockSize( - (Long)flags_.get(Flag.block_size)); options.setMaxOpenFiles( (Integer)flags_.get(Flag.open_files)); options.setTableCacheRemoveScanCountLimit( @@ -548,8 +547,6 @@ private void prepareOptions(Options options) { (Boolean)flags_.get(Flag.use_fsync)); options.setWalDir( (String)flags_.get(Flag.wal_dir)); - options.setDisableSeekCompaction( - (Boolean)flags_.get(Flag.disable_seek_compaction)); options.setDeleteObsoleteFilesPeriodMicros( (Integer)flags_.get(Flag.delete_obsolete_files_period_micros)); options.setTableCacheNumshardbits( @@ -604,15 +601,6 @@ private void prepareOptions(Options options) { 
(Integer)flags_.get(Flag.max_successive_merges)); options.setWalTtlSeconds((Long)flags_.get(Flag.wal_ttl_seconds)); options.setWalSizeLimitMB((Long)flags_.get(Flag.wal_size_limit_MB)); - int bloomBits = (Integer)flags_.get(Flag.bloom_bits); - if (bloomBits > 0) { - // Internally, options will keep a reference to this BloomFilter. - // This will disallow Java to GC this BloomFilter. In addition, - // options.dispose() will release the c++ object of this BloomFilter. - // As a result, the caller should not directly call - // BloomFilter.dispose(). - options.setFilter(new BloomFilter(bloomBits)); - } /* TODO(yhchiang): enable the following parameters options.setCompressionType((String)flags_.get(Flag.compression_type)); options.setCompressionLevel((Integer)flags_.get(Flag.compression_level)); @@ -1160,7 +1148,7 @@ private enum Flag { return Integer.parseInt(value); } }, - block_size(defaultOptions_.blockSize(), + block_size(defaultBlockBasedTableOptions_.blockSize(), "Number of bytes in a block.") { @Override public Object parseValue(String value) { return Long.parseLong(value); @@ -1312,12 +1300,6 @@ private enum Flag { return Integer.parseInt(value); } }, - disable_seek_compaction(false,"Option to disable compaction\n" + - "\ttriggered by read.") { - @Override public Object parseValue(String value) { - return parseBoolean(value); - } - }, delete_obsolete_files_period_micros(0,"Option to delete\n" + "\tobsolete files periodically. 0 means that obsolete files are\n" + "\tdeleted after every compaction run.") { @@ -1597,7 +1579,6 @@ void setFinished(boolean flag) { final int threadNum_; final int writesPerSeconds_; final long randSeed_; - final long cacheSize_; final boolean useExisting_; final String databaseDir_; double compressionRatio_; @@ -1620,6 +1601,8 @@ void setFinished(boolean flag) { // as the scope of a static member equals to the scope of the problem, // we let its c++ pointer to be disposed in its finalizer. 
static Options defaultOptions_ = new Options(); + static BlockBasedTableConfig defaultBlockBasedTableOptions_ = + new BlockBasedTableConfig(); String compressionType_; CompressionType compression_; } diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/org/rocksdb/test/OptionsTest.java index e1e0e059e1d..d81ca10761e 100644 --- a/java/org/rocksdb/test/OptionsTest.java +++ b/java/org/rocksdb/test/OptionsTest.java @@ -52,12 +52,6 @@ public static void main(String[] args) { assert(opt.useFsync() == boolValue); } - { // DbStatsLogInterval test - int intValue = rand.nextInt(); - opt.setDbStatsLogInterval(intValue); - assert(opt.dbStatsLogInterval() == intValue); - } - { // DbLogDir test String str = "path/to/DbLogDir"; opt.setDbLogDir(str); @@ -214,24 +208,6 @@ public static void main(String[] args) { assert(opt.minWriteBufferNumberToMerge() == intValue); } - { // BlockSize test - long longValue = rand.nextLong(); - opt.setBlockSize(longValue); - assert(opt.blockSize() == longValue); - } - - { // BlockRestartInterval test - int intValue = rand.nextInt(); - opt.setBlockRestartInterval(intValue); - assert(opt.blockRestartInterval() == intValue); - } - - { // WholeKeyFiltering test - boolean boolValue = rand.nextBoolean(); - opt.setWholeKeyFiltering(boolValue); - assert(opt.wholeKeyFiltering() == boolValue); - } - { // NumLevels test int intValue = rand.nextInt(); opt.setNumLevels(intValue); @@ -304,12 +280,6 @@ public static void main(String[] args) { assert(opt.maxGrandparentOverlapFactor() == intValue); } - { // DisableSeekCompaction test - boolean boolValue = rand.nextBoolean(); - opt.setDisableSeekCompaction(boolValue); - assert(opt.disableSeekCompaction() == boolValue); - } - { // SoftRateLimit test double doubleValue = rand.nextDouble(); opt.setSoftRateLimit(doubleValue); @@ -328,12 +298,6 @@ public static void main(String[] args) { assert(opt.rateLimitDelayMaxMilliseconds() == intValue); } - { // NoBlockCache test - boolean boolValue = rand.nextBoolean(); - 
opt.setNoBlockCache(boolValue); - assert(opt.noBlockCache() == boolValue); - } - { // ArenaBlockSize test long longValue = rand.nextLong(); opt.setArenaBlockSize(longValue); @@ -352,12 +316,6 @@ public static void main(String[] args) { assert(opt.purgeRedundantKvsWhileFlush() == boolValue); } - { // BlockSizeDeviation test - int intValue = rand.nextInt(); - opt.setBlockSizeDeviation(intValue); - assert(opt.blockSizeDeviation() == intValue); - } - { // VerifyChecksumsInCompaction test boolean boolValue = rand.nextBoolean(); opt.setVerifyChecksumsInCompaction(boolValue); diff --git a/java/rocksjni/memtablejni.cc b/java/rocksjni/memtablejni.cc index a0d50f5f5e0..9b0dc252c05 100644 --- a/java/rocksjni/memtablejni.cc +++ b/java/rocksjni/memtablejni.cc @@ -5,6 +5,7 @@ // // This file implements the "bridge" between Java and C++ for MemTables. +#include "rocksjni/portal.h" #include "include/org_rocksdb_HashSkipListMemTableConfig.h" #include "include/org_rocksdb_HashLinkedListMemTableConfig.h" #include "include/org_rocksdb_VectorMemTableConfig.h" @@ -20,7 +21,7 @@ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle( JNIEnv* env, jobject jobj, jlong jbucket_count, jint jheight, jint jbranching_factor) { return reinterpret_cast(rocksdb::NewHashSkipListRepFactory( - static_cast(jbucket_count), + rocksdb::jlong_to_size_t(jbucket_count), static_cast(jheight), static_cast(jbranching_factor))); } @@ -33,7 +34,7 @@ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle( jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle( JNIEnv* env, jobject jobj, jlong jbucket_count) { return reinterpret_cast(rocksdb::NewHashLinkListRepFactory( - static_cast(jbucket_count))); + rocksdb::jlong_to_size_t(jbucket_count))); } /* @@ -44,7 +45,7 @@ jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle( jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle( JNIEnv* env, jobject jobj, jlong 
jreserved_size) { return reinterpret_cast(new rocksdb::VectorRepFactory( - static_cast(jreserved_size))); + rocksdb::jlong_to_size_t(jreserved_size))); } /* diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index abbf598a7d5..705e9ff8c17 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -21,7 +21,7 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/table.h" #include "rocksdb/slice_transform.h" -#include "rocksdb/filter_policy.h" +#include "rocksdb/rate_limiter.h" /* * Class: org_rocksdb_Options @@ -71,7 +71,7 @@ jboolean Java_org_rocksdb_Options_createIfMissing( void Java_org_rocksdb_Options_setWriteBufferSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_buffer_size) { reinterpret_cast(jhandle)->write_buffer_size = - static_cast(jwrite_buffer_size); + rocksdb::jlong_to_size_t(jwrite_buffer_size); } @@ -118,17 +118,6 @@ jlong Java_org_rocksdb_Options_statisticsPtr( return reinterpret_cast(st); } -/* - * Class: org_rocksdb_Options - * Method: setFilterHandle - * Signature: (JJ)V - */ -void Java_org_rocksdb_Options_setFilterHandle( - JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jfilter_handle) { - reinterpret_cast(jopt_handle)->filter_policy = - reinterpret_cast(jfilter_handle); -} - /* * Class: org_rocksdb_Options * Method: maxWriteBufferNumber @@ -139,49 +128,6 @@ jint Java_org_rocksdb_Options_maxWriteBufferNumber( return reinterpret_cast(jhandle)->max_write_buffer_number; } -/* - * Class: org_rocksdb_Options - * Method: setBlockSize - * Signature: (JJ)V - */ -void Java_org_rocksdb_Options_setBlockSize( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jblock_size) { - reinterpret_cast(jhandle)->block_size = - static_cast(jblock_size); -} - -/* - * Class: org_rocksdb_Options - * Method: blockSize - * Signature: (J)J - */ -jlong Java_org_rocksdb_Options_blockSize( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)->block_size; -} - -/* - * Class: org_rocksdb_Options - * Method: 
setDisableSeekCompaction - * Signature: (JZ)V - */ -void Java_org_rocksdb_Options_setDisableSeekCompaction( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean jdisable_seek_compaction) { - reinterpret_cast(jhandle)->disable_seek_compaction = - jdisable_seek_compaction; -} - -/* - * Class: org_rocksdb_Options - * Method: disableSeekCompaction - * Signature: (J)Z - */ -jboolean Java_org_rocksdb_Options_disableSeekCompaction( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)->disable_seek_compaction; -} - /* * Class: org_rocksdb_Options * Method: errorIfExists @@ -287,27 +233,6 @@ void Java_org_rocksdb_Options_setUseFsync( static_cast(use_fsync); } -/* - * Class: org_rocksdb_Options - * Method: dbStatsLogInterval - * Signature: (J)I - */ -jint Java_org_rocksdb_Options_dbStatsLogInterval( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)->db_stats_log_interval; -} - -/* - * Class: org_rocksdb_Options - * Method: setDbStatsLogInterval - * Signature: (JI)V - */ -void Java_org_rocksdb_Options_setDbStatsLogInterval( - JNIEnv* env, jobject jobj, jlong jhandle, jint db_stats_log_interval) { - reinterpret_cast(jhandle)->db_stats_log_interval = - static_cast(db_stats_log_interval); -} - /* * Class: org_rocksdb_Options * Method: dbLogDir @@ -438,7 +363,7 @@ jlong Java_org_rocksdb_Options_maxLogFileSize( void Java_org_rocksdb_Options_setMaxLogFileSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong max_log_file_size) { reinterpret_cast(jhandle)->max_log_file_size = - static_cast(max_log_file_size); + rocksdb::jlong_to_size_t(max_log_file_size); } /* @@ -459,7 +384,7 @@ jlong Java_org_rocksdb_Options_logFileTimeToRoll( void Java_org_rocksdb_Options_setLogFileTimeToRoll( JNIEnv* env, jobject jobj, jlong jhandle, jlong log_file_time_to_roll) { reinterpret_cast(jhandle)->log_file_time_to_roll = - static_cast(log_file_time_to_roll); + rocksdb::jlong_to_size_t(log_file_time_to_roll); } /* @@ -480,7 +405,7 @@ jlong 
Java_org_rocksdb_Options_keepLogFileNum( void Java_org_rocksdb_Options_setKeepLogFileNum( JNIEnv* env, jobject jobj, jlong jhandle, jlong keep_log_file_num) { reinterpret_cast(jhandle)->keep_log_file_num = - static_cast(keep_log_file_num); + rocksdb::jlong_to_size_t(keep_log_file_num); } /* @@ -535,6 +460,17 @@ void Java_org_rocksdb_Options_setMemTableFactory( reinterpret_cast(jfactory_handle)); } +/* + * Class: org_rocksdb_Options + * Method: setRateLimiter + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setRateLimiter( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jrate_limiter_handle) { + reinterpret_cast(jhandle)->rate_limiter.reset( + reinterpret_cast(jrate_limiter_handle)); +} + /* * Class: org_rocksdb_Options * Method: tableCacheNumshardbits @@ -585,7 +521,8 @@ void Java_org_rocksdb_Options_setTableCacheRemoveScanCountLimit( void Java_org_rocksdb_Options_useFixedLengthPrefixExtractor( JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) { reinterpret_cast(jhandle)->prefix_extractor.reset( - rocksdb::NewFixedPrefixTransform(static_cast(jprefix_length))); + rocksdb::NewFixedPrefixTransform( + rocksdb::jlong_to_size_t(jprefix_length))); } /* @@ -649,7 +586,7 @@ jlong Java_org_rocksdb_Options_manifestPreallocationSize( void Java_org_rocksdb_Options_setManifestPreallocationSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong preallocation_size) { reinterpret_cast(jhandle)->manifest_preallocation_size = - static_cast(preallocation_size); + rocksdb::jlong_to_size_t(preallocation_size); } /* @@ -914,27 +851,6 @@ void Java_org_rocksdb_Options_setMinWriteBufferNumberToMerge( static_cast(jmin_write_buffer_number_to_merge); } -/* - * Class: org_rocksdb_Options - * Method: blockRestartInterval - * Signature: (J)I - */ -jint Java_org_rocksdb_Options_blockRestartInterval( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)->block_restart_interval; -} - -/* - * Class: org_rocksdb_Options - * Method: 
setBlockRestartInterval - * Signature: (JI)V - */ -void Java_org_rocksdb_Options_setBlockRestartInterval( - JNIEnv* env, jobject jobj, jlong jhandle, jint jblock_restart_interval) { - reinterpret_cast(jhandle)->block_restart_interval = - static_cast(jblock_restart_interval); -} - /* * Class: org_rocksdb_Options * Method: setCompressionType @@ -977,27 +893,6 @@ jbyte Java_org_rocksdb_Options_compactionStyle( return reinterpret_cast(jhandle)->compaction_style; } -/* - * Class: org_rocksdb_Options - * Method: wholeKeyFiltering - * Signature: (J)Z - */ -jboolean Java_org_rocksdb_Options_wholeKeyFiltering( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)->whole_key_filtering; -} - -/* - * Class: org_rocksdb_Options - * Method: setWholeKeyFiltering - * Signature: (JZ)V - */ -void Java_org_rocksdb_Options_setWholeKeyFiltering( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jwhole_key_filtering) { - reinterpret_cast(jhandle)->whole_key_filtering = - static_cast(jwhole_key_filtering); -} - /* * Class: org_rocksdb_Options * Method: numLevels @@ -1345,27 +1240,6 @@ void Java_org_rocksdb_Options_setRateLimitDelayMaxMilliseconds( static_cast(jrate_limit_delay_max_milliseconds); } -/* - * Class: org_rocksdb_Options - * Method: noBlockCache - * Signature: (J)Z - */ -jboolean Java_org_rocksdb_Options_noBlockCache( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)->no_block_cache; -} - -/* - * Class: org_rocksdb_Options - * Method: setNoBlockCache - * Signature: (JZ)V - */ -void Java_org_rocksdb_Options_setNoBlockCache( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jno_block_cache) { - reinterpret_cast(jhandle)->no_block_cache = - static_cast(jno_block_cache); -} - /* * Class: org_rocksdb_Options * Method: arenaBlockSize @@ -1384,7 +1258,7 @@ jlong Java_org_rocksdb_Options_arenaBlockSize( void Java_org_rocksdb_Options_setArenaBlockSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong jarena_block_size) { 
reinterpret_cast(jhandle)->arena_block_size = - static_cast(jarena_block_size); + rocksdb::jlong_to_size_t(jarena_block_size); } /* @@ -1435,28 +1309,6 @@ void Java_org_rocksdb_Options_setPurgeRedundantKvsWhileFlush( static_cast(jpurge_redundant_kvs_while_flush); } -/* - * Class: org_rocksdb_Options - * Method: blockSizeDeviation - * Signature: (J)I - */ -jint Java_org_rocksdb_Options_blockSizeDeviation( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)->block_size_deviation; -} - -/* - * Class: org_rocksdb_Options - * Method: setBlockSizeDeviation - * Signature: (JI)V - */ -void Java_org_rocksdb_Options_setBlockSizeDeviation( - JNIEnv* env, jobject jobj, jlong jhandle, - jint jblock_size_deviation) { - reinterpret_cast(jhandle)->block_size_deviation = - static_cast(jblock_size_deviation); -} - /* * Class: org_rocksdb_Options * Method: verifyChecksumsInCompaction @@ -1571,7 +1423,7 @@ void Java_org_rocksdb_Options_setInplaceUpdateNumLocks( jlong jinplace_update_num_locks) { reinterpret_cast( jhandle)->inplace_update_num_locks = - static_cast(jinplace_update_num_locks); + rocksdb::jlong_to_size_t(jinplace_update_num_locks); } /* @@ -1662,7 +1514,7 @@ void Java_org_rocksdb_Options_setMaxSuccessiveMerges( JNIEnv* env, jobject jobj, jlong jhandle, jlong jmax_successive_merges) { reinterpret_cast(jhandle)->max_successive_merges = - static_cast(jmax_successive_merges); + rocksdb::jlong_to_size_t(jmax_successive_merges); } /* diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 28fe754f0f6..4c7a8b9b9bb 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -11,12 +11,19 @@ #define JAVA_ROCKSJNI_PORTAL_H_ #include +#include #include "rocksdb/db.h" #include "rocksdb/filter_policy.h" #include "rocksdb/utilities/backupable_db.h" namespace rocksdb { +inline size_t jlong_to_size_t(const jlong& jvalue) { + return static_cast(jvalue) <= + static_cast(std::numeric_limits::max()) ? 
+ static_cast(jvalue) : std::numeric_limits::max(); +} + // The portal class for org.rocksdb.RocksDB class RocksDBJni { public: diff --git a/java/rocksjni/ratelimiterjni.cc b/java/rocksjni/ratelimiterjni.cc new file mode 100644 index 00000000000..5413978a006 --- /dev/null +++ b/java/rocksjni/ratelimiterjni.cc @@ -0,0 +1,24 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ for RateLimiter. + +#include "rocksjni/portal.h" +#include "include/org_rocksdb_GenericRateLimiterConfig.h" +#include "rocksdb/rate_limiter.h" + +/* + * Class: org_rocksdb_GenericRateLimiterConfig + * Method: newRateLimiterHandle + * Signature: (JJI)J + */ +jlong Java_org_rocksdb_GenericRateLimiterConfig_newRateLimiterHandle( + JNIEnv* env, jobject jobj, jlong jrate_bytes_per_second, + jlong jrefill_period_micros, jint jfairness) { + return reinterpret_cast(rocksdb::NewGenericRateLimiter( + rocksdb::jlong_to_size_t(jrate_bytes_per_second), + rocksdb::jlong_to_size_t(jrefill_period_micros), + static_cast(jfairness))); +} diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 2e0da85e5f5..f55290f6493 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -26,21 +26,8 @@ * Signature: (JLjava/lang/String;)V */ void Java_org_rocksdb_RocksDB_open( - JNIEnv* env, jobject jdb, jlong jopt_handle, - jlong jcache_size, jint jnum_shardbits, jstring jdb_path) { + JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path) { auto opt = reinterpret_cast(jopt_handle); - if (jcache_size > 0) { - opt->no_block_cache = false; - if (jnum_shardbits >= 1) { - opt->block_cache = rocksdb::NewLRUCache(jcache_size, jnum_shardbits); - } else { - opt->block_cache = 
rocksdb::NewLRUCache(jcache_size); - } - } else { - opt->no_block_cache = true; - opt->block_cache = nullptr; - } - rocksdb::DB* db = nullptr; const char* db_path = env->GetStringUTFChars(jdb_path, 0); rocksdb::Status s = rocksdb::DB::Open(*opt, db_path, &db); diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index 4d6114f18c1..ffda1a2ba43 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -7,7 +7,10 @@ #include #include "include/org_rocksdb_PlainTableConfig.h" +#include "include/org_rocksdb_BlockBasedTableConfig.h" #include "rocksdb/table.h" +#include "rocksdb/cache.h" +#include "rocksdb/filter_policy.h" /* * Class: org_rocksdb_PlainTableConfig @@ -24,3 +27,34 @@ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle( options.index_sparseness = jindex_sparseness; return reinterpret_cast(rocksdb::NewPlainTableFactory(options)); } + +/* + * Class: org_rocksdb_BlockBasedTableConfig + * Method: newTableFactoryHandle + * Signature: (ZJIJIIZI)J + */ +jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( + JNIEnv* env, jobject jobj, jboolean no_block_cache, jlong block_cache_size, + jint num_shardbits, jlong block_size, jint block_size_deviation, + jint block_restart_interval, jboolean whole_key_filtering, + jint bits_per_key) { + rocksdb::BlockBasedTableOptions options; + options.no_block_cache = no_block_cache; + + if (!no_block_cache && block_cache_size > 0) { + if (num_shardbits > 0) { + options.block_cache = + rocksdb::NewLRUCache(block_cache_size, num_shardbits); + } else { + options.block_cache = rocksdb::NewLRUCache(block_cache_size); + } + } + options.block_size = block_size; + options.block_size_deviation = block_size_deviation; + options.block_restart_interval = block_restart_interval; + options.whole_key_filtering = whole_key_filtering; + if (bits_per_key > 0) { + options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(bits_per_key)); + } + return reinterpret_cast(rocksdb::NewBlockBasedTableFactory(options)); 
+} diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index e8b2456eeed..0492ea1be2e 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -12,12 +12,14 @@ #include "include/org_rocksdb_WriteBatchTest.h" #include "rocksjni/portal.h" #include "rocksdb/db.h" +#include "rocksdb/immutable_options.h" #include "db/memtable.h" #include "rocksdb/write_batch.h" #include "db/write_batch_internal.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "util/logging.h" +#include "util/scoped_arena_iterator.h" #include "util/testharness.h" /* @@ -28,7 +30,7 @@ void Java_org_rocksdb_WriteBatch_newWriteBatch( JNIEnv* env, jobject jobj, jint jreserved_bytes) { rocksdb::WriteBatch* wb = new rocksdb::WriteBatch( - static_cast(jreserved_bytes)); + rocksdb::jlong_to_size_t(jreserved_bytes)); rocksdb::WriteBatchJni::setHandle(env, jobj, wb); } @@ -202,14 +204,18 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( auto factory = std::make_shared(); rocksdb::Options options; options.memtable_factory = factory; - rocksdb::MemTable* mem = new rocksdb::MemTable(cmp, options); + rocksdb::MemTable* mem = new rocksdb::MemTable( + cmp, rocksdb::ImmutableCFOptions(options), + rocksdb::MemTableOptions(rocksdb::MutableCFOptions(options), options)); mem->Ref(); std::string state; rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); rocksdb::Status s = rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default); int count = 0; - rocksdb::Iterator* iter = mem->NewIterator(rocksdb::ReadOptions()); + rocksdb::Arena arena; + rocksdb::ScopedArenaIterator iter(mem->NewIterator( + rocksdb::ReadOptions(), &arena)); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { rocksdb::ParsedInternalKey ikey; memset(reinterpret_cast(&ikey), 0, sizeof(ikey)); @@ -244,7 +250,6 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( state.append("@"); state.append(rocksdb::NumberToString(ikey.sequence)); } - delete iter; if 
(!s.ok()) { state.append(s.ToString()); } else if (count != rocksdb::WriteBatchInternal::Count(b)) { diff --git a/port/stack_trace.cc b/port/stack_trace.cc index 76866e63cc8..296b1f6209c 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -33,7 +33,7 @@ const char* GetExecutableName() { char link[1024]; snprintf(link, sizeof(link), "/proc/%d/exe", getpid()); - auto read = readlink(link, name, sizeof(name)); + auto read = readlink(link, name, sizeof(name) - 1); if (-1 == read) { return nullptr; } else { diff --git a/table/adaptive_table_factory.cc b/table/adaptive_table_factory.cc index 3d03a7c4f2f..c693064af9e 100644 --- a/table/adaptive_table_factory.cc +++ b/table/adaptive_table_factory.cc @@ -39,7 +39,7 @@ extern const uint64_t kLegacyBlockBasedTableMagicNumber; extern const uint64_t kCuckooTableMagicNumber; Status AdaptiveTableFactory::NewTableReader( - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& icomp, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const { Footer footer; @@ -50,24 +50,59 @@ Status AdaptiveTableFactory::NewTableReader( if (footer.table_magic_number() == kPlainTableMagicNumber || footer.table_magic_number() == kLegacyPlainTableMagicNumber) { return plain_table_factory_->NewTableReader( - options, soptions, icomp, std::move(file), file_size, table); + ioptions, env_options, icomp, std::move(file), file_size, table); } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber || footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) { return block_based_table_factory_->NewTableReader( - options, soptions, icomp, std::move(file), file_size, table); + ioptions, env_options, icomp, std::move(file), file_size, table); } else if (footer.table_magic_number() == kCuckooTableMagicNumber) { return cuckoo_table_factory_->NewTableReader( - options, soptions, icomp, std::move(file), file_size, table); + ioptions, 
env_options, icomp, std::move(file), file_size, table); } else { return Status::NotSupported("Unidentified table format"); } } TableBuilder* AdaptiveTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const { - return table_factory_to_write_->NewTableBuilder(options, internal_comparator, - file, compression_type); + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts) const { + return table_factory_to_write_->NewTableBuilder( + ioptions, internal_comparator, file, compression_type, compression_opts); +} + +std::string AdaptiveTableFactory::GetPrintableTableOptions() const { + std::string ret; + ret.reserve(20000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + if (table_factory_to_write_) { + snprintf(buffer, kBufferSize, " write factory (%s) options:\n%s\n", + table_factory_to_write_->Name(), + table_factory_to_write_->GetPrintableTableOptions().c_str()); + ret.append(buffer); + } + if (plain_table_factory_) { + snprintf(buffer, kBufferSize, " %s options:\n%s\n", + plain_table_factory_->Name(), + plain_table_factory_->GetPrintableTableOptions().c_str()); + ret.append(buffer); + } + if (block_based_table_factory_) { + snprintf(buffer, kBufferSize, " %s options:\n%s\n", + block_based_table_factory_->Name(), + block_based_table_factory_->GetPrintableTableOptions().c_str()); + ret.append(buffer); + } + if (cuckoo_table_factory_) { + snprintf(buffer, kBufferSize, " %s options:\n%s\n", + cuckoo_table_factory_->Name(), + cuckoo_table_factory_->GetPrintableTableOptions().c_str()); + ret.append(buffer); + } + return ret; } extern TableFactory* NewAdaptiveTableFactory( diff --git a/table/adaptive_table_factory.h b/table/adaptive_table_factory.h index d898266409c..f0920db97f0 ---
a/table/adaptive_table_factory.h +++ b/table/adaptive_table_factory.h @@ -6,12 +6,12 @@ #ifndef ROCKSDB_LITE +#include #include "rocksdb/options.h" #include "rocksdb/table.h" namespace rocksdb { -struct Options; struct EnvOptions; using std::unique_ptr; @@ -30,16 +30,32 @@ class AdaptiveTableFactory : public TableFactory { std::shared_ptr block_based_table_factory, std::shared_ptr plain_table_factory, std::shared_ptr cuckoo_table_factory); + const char* Name() const override { return "AdaptiveTableFactory"; } - Status NewTableReader(const Options& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table) const override; - TableBuilder* NewTableBuilder(const Options& options, - const InternalKeyComparator& icomparator, - WritableFile* file, - CompressionType compression_type) const - override; + + Status NewTableReader( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const override; + + TableBuilder* NewTableBuilder( + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& icomparator, + WritableFile* file, + const CompressionType compression_type, + const CompressionOptions& compression_opts) const override; + + // Sanitizes the specified DB Options. 
+ Status SanitizeDBOptions(const DBOptions* db_opts) const override { + if (db_opts->allow_mmap_reads == false) { + return Status::NotSupported( + "AdaptiveTable with allow_mmap_reads == false is not supported."); + } + return Status::OK(); + } + + std::string GetPrintableTableOptions() const override; private: std::shared_ptr table_factory_to_write_; diff --git a/table/block.cc b/table/block.cc index 24e7b72fa11..592d175b15a 100644 --- a/table/block.cc +++ b/table/block.cc @@ -297,12 +297,10 @@ uint32_t Block::NumRestarts() const { return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); } -Block::Block(const BlockContents& contents) - : data_(contents.data.data()), - size_(contents.data.size()), - owned_(contents.heap_allocated), - cachable_(contents.cachable), - compression_type_(contents.compression_type) { +Block::Block(BlockContents&& contents) + : contents_(std::move(contents)), + data_(contents_.data.data()), + size_(contents_.data.size()) { if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker } else { @@ -315,13 +313,8 @@ Block::Block(const BlockContents& contents) } } -Block::~Block() { - if (owned_) { - delete[] data_; - } -} - -Iterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter) { +Iterator* Block::NewIterator( + const Comparator* cmp, BlockIter* iter, bool total_order_seek) { if (size_ < 2*sizeof(uint32_t)) { if (iter != nullptr) { iter->SetStatus(Status::Corruption("bad block contents")); @@ -339,12 +332,17 @@ Iterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter) { return NewEmptyIterator(); } } else { + BlockHashIndex* hash_index_ptr = + total_order_seek ? nullptr : hash_index_.get(); + BlockPrefixIndex* prefix_index_ptr = + total_order_seek ? 
nullptr : prefix_index_.get(); + if (iter != nullptr) { iter->Initialize(cmp, data_, restart_offset_, num_restarts, - hash_index_.get(), prefix_index_.get()); + hash_index_ptr, prefix_index_ptr); } else { iter = new BlockIter(cmp, data_, restart_offset_, num_restarts, - hash_index_.get(), prefix_index_.get()); + hash_index_ptr, prefix_index_ptr); } } diff --git a/table/block.h b/table/block.h index 494ed24bd73..68b16ea1f3e 100644 --- a/table/block.h +++ b/table/block.h @@ -14,6 +14,10 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "db/dbformat.h" +#include "table/block_prefix_index.h" +#include "table/block_hash_index.h" + +#include "format.h" namespace rocksdb { @@ -26,15 +30,17 @@ class BlockPrefixIndex; class Block { public: // Initialize the block with the specified contents. - explicit Block(const BlockContents& contents); + explicit Block(BlockContents&& contents); - ~Block(); + ~Block() = default; size_t size() const { return size_; } const char* data() const { return data_; } - bool cachable() const { return cachable_; } + bool cachable() const { return contents_.cachable; } uint32_t NumRestarts() const; - CompressionType compression_type() const { return compression_type_; } + CompressionType compression_type() const { + return contents_.compression_type; + } // If hash index lookup is enabled and `use_hash_index` is true. This block // will do hash lookup for the key prefix. @@ -45,8 +51,12 @@ class Block { // // If iter is null, return new Iterator // If iter is not null, update this one and return it as Iterator* + // + // If total_order_seek is true, hash_index_ and prefix_index_ are ignored. + // This option only applies for index block. For data block, hash_index_ + // and prefix_index_ are null, so this option does not matter. 
Iterator* NewIterator(const Comparator* comparator, - BlockIter* iter = nullptr); + BlockIter* iter = nullptr, bool total_order_seek = true); void SetBlockHashIndex(BlockHashIndex* hash_index); void SetBlockPrefixIndex(BlockPrefixIndex* prefix_index); @@ -54,12 +64,10 @@ class Block { size_t ApproximateMemoryUsage() const; private: - const char* data_; - size_t size_; + BlockContents contents_; + const char* data_; // contents_.data.data() + size_t size_; // contents_.data.size() uint32_t restart_offset_; // Offset in data_ of restart array - bool owned_; // Block owns data_[] - bool cachable_; - CompressionType compression_type_; std::unique_ptr hash_index_; std::unique_ptr prefix_index_; diff --git a/table/filter_block.cc b/table/block_based_filter_block.cc similarity index 50% rename from table/filter_block.cc rename to table/block_based_filter_block.cc index 3dac4e21068..fea37b67f8a 100644 --- a/table/filter_block.cc +++ b/table/block_based_filter_block.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "table/filter_block.h" +#include "table/block_based_filter_block.h" #include "db/dbformat.h" #include "rocksdb/filter_policy.h" @@ -15,20 +15,39 @@ namespace rocksdb { +namespace { +bool SamePrefix(const SliceTransform* prefix_extractor, + const Slice& key1, const Slice& key2) { + if (!prefix_extractor->InDomain(key1) && + !prefix_extractor->InDomain(key2)) { + return true; + } else if (!prefix_extractor->InDomain(key1) || + !prefix_extractor->InDomain(key2)) { + return false; + } else { + return (prefix_extractor->Transform(key1) == + prefix_extractor->Transform(key2)); + } +} +} // namespace + + // See doc/table_format.txt for an explanation of the filter block format. 
// Generate new filter every 2KB of data static const size_t kFilterBaseLg = 11; static const size_t kFilterBase = 1 << kFilterBaseLg; -FilterBlockBuilder::FilterBlockBuilder(const Options& opt, - const Comparator* internal_comparator) - : policy_(opt.filter_policy), - prefix_extractor_(opt.prefix_extractor.get()), - whole_key_filtering_(opt.whole_key_filtering), - comparator_(internal_comparator) {} +BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder( + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt) + : policy_(table_opt.filter_policy.get()), + prefix_extractor_(prefix_extractor), + whole_key_filtering_(table_opt.whole_key_filtering) { + assert(policy_); +} -void FilterBlockBuilder::StartBlock(uint64_t block_offset) { +void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) { uint64_t filter_index = (block_offset / kFilterBase); assert(filter_index >= filter_offsets_.size()); while (filter_index > filter_offsets_.size()) { @@ -36,59 +55,45 @@ void FilterBlockBuilder::StartBlock(uint64_t block_offset) { } } -bool FilterBlockBuilder::SamePrefix(const Slice &key1, - const Slice &key2) const { - if (!prefix_extractor_->InDomain(key1) && - !prefix_extractor_->InDomain(key2)) { - return true; - } else if (!prefix_extractor_->InDomain(key1) || - !prefix_extractor_->InDomain(key2)) { - return false; - } else { - return (prefix_extractor_->Transform(key1) == - prefix_extractor_->Transform(key2)); +void BlockBasedFilterBlockBuilder::Add(const Slice& key) { + added_to_start_ = 0; + if (whole_key_filtering_) { + AddKey(key); + added_to_start_ = 1; } + if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { + AddPrefix(key); + } +} + +// Add key to filter if needed +inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) { + start_.push_back(entries_.size()); + entries_.append(key.data(), key.size()); } -void FilterBlockBuilder::AddKey(const Slice& key) { +// Add prefix to filter if needed +inline void 
BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) { // get slice for most recently added entry Slice prev; - size_t added_to_start = 0; - - // add key to filter if needed - if (whole_key_filtering_) { - start_.push_back(entries_.size()); - ++added_to_start; - entries_.append(key.data(), key.size()); - } - - if (start_.size() > added_to_start) { - size_t prev_start = start_[start_.size() - 1 - added_to_start]; + if (start_.size() > added_to_start_) { + size_t prev_start = start_[start_.size() - 1 - added_to_start_]; const char* base = entries_.data() + prev_start; size_t length = entries_.size() - prev_start; prev = Slice(base, length); } - // add prefix to filter if needed - if (prefix_extractor_ && prefix_extractor_->InDomain(ExtractUserKey(key))) { - // If prefix_extractor_, this filter_block layer assumes we only - // operate on internal keys. - Slice user_key = ExtractUserKey(key); - // this assumes prefix(prefix(key)) == prefix(key), as the last - // entry in entries_ may be either a key or prefix, and we use - // prefix(last entry) to get the prefix of the last key. - if (prev.size() == 0 || - !SamePrefix(user_key, ExtractUserKey(prev))) { - Slice prefix = prefix_extractor_->Transform(user_key); - InternalKey internal_prefix_tmp(prefix, 0, kTypeValue); - Slice internal_prefix = internal_prefix_tmp.Encode(); - start_.push_back(entries_.size()); - entries_.append(internal_prefix.data(), internal_prefix.size()); - } + // this assumes prefix(prefix(key)) == prefix(key), as the last + // entry in entries_ may be either a key or prefix, and we use + // prefix(last entry) to get the prefix of the last key. 
+ if (prev.size() == 0 || !SamePrefix(prefix_extractor_, key, prev)) { + Slice prefix = prefix_extractor_->Transform(key); + start_.push_back(entries_.size()); + entries_.append(prefix.data(), prefix.size()); } } -Slice FilterBlockBuilder::Finish() { +Slice BlockBasedFilterBlockBuilder::Finish() { if (!start_.empty()) { GenerateFilter(); } @@ -104,7 +109,7 @@ Slice FilterBlockBuilder::Finish() { return Slice(result_); } -void FilterBlockBuilder::GenerateFilter() { +void BlockBasedFilterBlockBuilder::GenerateFilter() { const size_t num_entries = start_.size(); if (num_entries == 0) { // Fast path if there are no keys for this filter @@ -117,7 +122,7 @@ void FilterBlockBuilder::GenerateFilter() { tmp_entries_.resize(num_entries); for (size_t i = 0; i < num_entries; i++) { const char* base = entries_.data() + start_[i]; - size_t length = start_[i+1] - start_[i]; + size_t length = start_[i + 1] - start_[i]; tmp_entries_[i] = Slice(base, length); } @@ -130,49 +135,52 @@ void FilterBlockBuilder::GenerateFilter() { start_.clear(); } -FilterBlockReader::FilterBlockReader( - const Options& opt, const Slice& contents, bool delete_contents_after_use) - : policy_(opt.filter_policy), - prefix_extractor_(opt.prefix_extractor.get()), - whole_key_filtering_(opt.whole_key_filtering), +BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, BlockContents&& contents) + : policy_(table_opt.filter_policy.get()), + prefix_extractor_(prefix_extractor), + whole_key_filtering_(table_opt.whole_key_filtering), data_(nullptr), offset_(nullptr), num_(0), - base_lg_(0) { - size_t n = contents.size(); + base_lg_(0), + contents_(std::move(contents)) { + assert(policy_); + size_t n = contents_.data.size(); if (n < 5) return; // 1 byte for base_lg_ and 4 for start of offset array - base_lg_ = contents[n-1]; - uint32_t last_word = DecodeFixed32(contents.data() + n - 5); + base_lg_ = contents_.data[n - 1]; + 
uint32_t last_word = DecodeFixed32(contents_.data.data() + n - 5); if (last_word > n - 5) return; - data_ = contents.data(); + data_ = contents_.data.data(); offset_ = data_ + last_word; num_ = (n - 5 - last_word) / 4; - if (delete_contents_after_use) { - filter_data.reset(contents.data()); - } } -bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, - const Slice& key) { +bool BlockBasedFilterBlockReader::KeyMayMatch(const Slice& key, + uint64_t block_offset) { + assert(block_offset != kNotValid); if (!whole_key_filtering_) { return true; } - return MayMatch(block_offset, key); + return MayMatch(key, block_offset); } -bool FilterBlockReader::PrefixMayMatch(uint64_t block_offset, - const Slice& prefix) { +bool BlockBasedFilterBlockReader::PrefixMayMatch(const Slice& prefix, + uint64_t block_offset) { + assert(block_offset != kNotValid); if (!prefix_extractor_) { return true; } - return MayMatch(block_offset, prefix); + return MayMatch(prefix, block_offset); } -bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) { +bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry, + uint64_t block_offset) { uint64_t index = block_offset >> base_lg_; if (index < num_) { - uint32_t start = DecodeFixed32(offset_ + index*4); - uint32_t limit = DecodeFixed32(offset_ + index*4 + 4); + uint32_t start = DecodeFixed32(offset_ + index * 4); + uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4); if (start <= limit && limit <= (uint32_t)(offset_ - data_)) { Slice filter = Slice(data_ + start, limit - start); return policy_->KeyMayMatch(entry, filter); @@ -184,7 +192,7 @@ bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) { return true; // Errors are treated as potential matches } -size_t FilterBlockReader::ApproximateMemoryUsage() const { +size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const { return num_ * 4 + 5 + (offset_ - data_); } } diff --git a/table/block_based_filter_block.h 
b/table/block_based_filter_block.h new file mode 100644 index 00000000000..9621425e392 --- /dev/null +++ b/table/block_based_filter_block.h @@ -0,0 +1,101 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A filter block is stored near the end of a Table file. It contains +// filters (e.g., bloom filters) for all data blocks in the table combined +// into a single filter block. + +#pragma once + +#include +#include +#include +#include +#include +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "table/filter_block.h" +#include "util/hash.h" + +namespace rocksdb { + + +// A BlockBasedFilterBlockBuilder is used to construct all of the filters for a +// particular Table. It generates a single string which is stored as +// a special block in the Table. 
+// +// The sequence of calls to BlockBasedFilterBlockBuilder must match the regexp: +// (StartBlock Add*)* Finish +class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { + public: + BlockBasedFilterBlockBuilder(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt); + + virtual bool IsBlockBased() override { return true; } + virtual void StartBlock(uint64_t block_offset) override; + virtual void Add(const Slice& key) override; + virtual Slice Finish() override; + + private: + void AddKey(const Slice& key); + void AddPrefix(const Slice& key); + void GenerateFilter(); + + // important: all of these might point to invalid addresses + // at the time of destruction of this filter block. destructor + // should NOT dereference them. + const FilterPolicy* policy_; + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + + std::string entries_; // Flattened entry contents + std::vector start_; // Starting index in entries_ of each entry + uint32_t added_to_start_; // To indicate if key is added + std::string result_; // Filter data computed so far + std::vector tmp_entries_; // policy_->CreateFilter() argument + std::vector filter_offsets_; + + // No copying allowed + BlockBasedFilterBlockBuilder(const BlockBasedFilterBlockBuilder&); + void operator=(const BlockBasedFilterBlockBuilder&); +}; + +// A FilterBlockReader is used to parse filter from SST table. +// KeyMayMatch and PrefixMayMatch would trigger filter checking +class BlockBasedFilterBlockReader : public FilterBlockReader { + public: + // REQUIRES: "contents" and *policy must stay live while *this is live. 
+ BlockBasedFilterBlockReader(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + BlockContents&& contents); + virtual bool IsBlockBased() override { return true; } + virtual bool KeyMayMatch(const Slice& key, + uint64_t block_offset = kNotValid) override; + virtual bool PrefixMayMatch(const Slice& prefix, + uint64_t block_offset = kNotValid) override; + virtual size_t ApproximateMemoryUsage() const override; + + private: + const FilterPolicy* policy_; + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + const char* data_; // Pointer to filter data (at block-start) + const char* offset_; // Pointer to beginning of offset array (at block-end) + size_t num_; // Number of entries in offset array + size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file) + BlockContents contents_; + + bool MayMatch(const Slice& entry, uint64_t block_offset); + + // No copying allowed + BlockBasedFilterBlockReader(const BlockBasedFilterBlockReader&); + void operator=(const BlockBasedFilterBlockReader&); +}; +} // namespace rocksdb diff --git a/table/block_based_filter_block_test.cc b/table/block_based_filter_block_test.cc new file mode 100644 index 00000000000..28eea16ce80 --- /dev/null +++ b/table/block_based_filter_block_test.cc @@ -0,0 +1,242 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "table/block_based_filter_block.h" + +#include "rocksdb/filter_policy.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +// For testing: emit an array with one hash value per key +class TestHashFilter : public FilterPolicy { + public: + virtual const char* Name() const { + return "TestHashFilter"; + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { + uint32_t h = Hash(key.data(), key.size(), 1); + for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return false; + } +}; + +class FilterBlockTest { + public: + TestHashFilter policy_; + BlockBasedTableOptions table_options_; + + FilterBlockTest() { + table_options_.filter_policy.reset(new TestHashFilter()); + } +}; + +TEST(FilterBlockTest, EmptyBuilder) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + BlockContents block(builder.Finish(), false, kNoCompression); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); + BlockBasedFilterBlockReader reader(nullptr, table_options_, std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo", 0)); + ASSERT_TRUE(reader.KeyMayMatch("foo", 100000)); +} + +TEST(FilterBlockTest, SingleChunk) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + builder.StartBlock(100); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.StartBlock(200); + builder.Add("box"); + builder.StartBlock(300); + builder.Add("hello"); + BlockContents block(builder.Finish(), false, kNoCompression); + BlockBasedFilterBlockReader reader(nullptr, table_options_, std::move(block)); + 
ASSERT_TRUE(reader.KeyMayMatch("foo", 100)); + ASSERT_TRUE(reader.KeyMayMatch("bar", 100)); + ASSERT_TRUE(reader.KeyMayMatch("box", 100)); + ASSERT_TRUE(reader.KeyMayMatch("hello", 100)); + ASSERT_TRUE(reader.KeyMayMatch("foo", 100)); + ASSERT_TRUE(!reader.KeyMayMatch("missing", 100)); + ASSERT_TRUE(!reader.KeyMayMatch("other", 100)); +} + +TEST(FilterBlockTest, MultiChunk) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + + // First filter + builder.StartBlock(0); + builder.Add("foo"); + builder.StartBlock(2000); + builder.Add("bar"); + + // Second filter + builder.StartBlock(3100); + builder.Add("box"); + + // Third filter is empty + + // Last filter + builder.StartBlock(9000); + builder.Add("box"); + builder.Add("hello"); + + BlockContents block(builder.Finish(), false, kNoCompression); + BlockBasedFilterBlockReader reader(nullptr, table_options_, std::move(block)); + + // Check first filter + ASSERT_TRUE(reader.KeyMayMatch("foo", 0)); + ASSERT_TRUE(reader.KeyMayMatch("bar", 2000)); + ASSERT_TRUE(!reader.KeyMayMatch("box", 0)); + ASSERT_TRUE(!reader.KeyMayMatch("hello", 0)); + + // Check second filter + ASSERT_TRUE(reader.KeyMayMatch("box", 3100)); + ASSERT_TRUE(!reader.KeyMayMatch("foo", 3100)); + ASSERT_TRUE(!reader.KeyMayMatch("bar", 3100)); + ASSERT_TRUE(!reader.KeyMayMatch("hello", 3100)); + + // Check third filter (empty) + ASSERT_TRUE(!reader.KeyMayMatch("foo", 4100)); + ASSERT_TRUE(!reader.KeyMayMatch("bar", 4100)); + ASSERT_TRUE(!reader.KeyMayMatch("box", 4100)); + ASSERT_TRUE(!reader.KeyMayMatch("hello", 4100)); + + // Check last filter + ASSERT_TRUE(reader.KeyMayMatch("box", 9000)); + ASSERT_TRUE(reader.KeyMayMatch("hello", 9000)); + ASSERT_TRUE(!reader.KeyMayMatch("foo", 9000)); + ASSERT_TRUE(!reader.KeyMayMatch("bar", 9000)); +} + +// Test for block based filter block +// use new interface in FilterPolicy to create filter builder/reader +class BlockBasedFilterBlockTest { + public: + BlockBasedTableOptions table_options_; + + 
BlockBasedFilterBlockTest() { + table_options_.filter_policy.reset(NewBloomFilterPolicy(10)); + } + + ~BlockBasedFilterBlockTest() {} +}; + +TEST(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { + FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder( + nullptr, table_options_); + BlockContents block(builder->Finish(), false, kNoCompression); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); + FilterBlockReader* reader = new BlockBasedFilterBlockReader( + nullptr, table_options_, std::move(block)); + ASSERT_TRUE(reader->KeyMayMatch("foo", 0)); + ASSERT_TRUE(reader->KeyMayMatch("foo", 100000)); + + delete builder; + delete reader; +} + +TEST(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { + FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder( + nullptr, table_options_); + builder->StartBlock(100); + builder->Add("foo"); + builder->Add("bar"); + builder->Add("box"); + builder->StartBlock(200); + builder->Add("box"); + builder->StartBlock(300); + builder->Add("hello"); + BlockContents block(builder->Finish(), false, kNoCompression); + FilterBlockReader* reader = new BlockBasedFilterBlockReader( + nullptr, table_options_, std::move(block)); + ASSERT_TRUE(reader->KeyMayMatch("foo", 100)); + ASSERT_TRUE(reader->KeyMayMatch("bar", 100)); + ASSERT_TRUE(reader->KeyMayMatch("box", 100)); + ASSERT_TRUE(reader->KeyMayMatch("hello", 100)); + ASSERT_TRUE(reader->KeyMayMatch("foo", 100)); + ASSERT_TRUE(!reader->KeyMayMatch("missing", 100)); + ASSERT_TRUE(!reader->KeyMayMatch("other", 100)); + + delete builder; + delete reader; +} + +TEST(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { + FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder( + nullptr, table_options_); + + // First filter + builder->StartBlock(0); + builder->Add("foo"); + builder->StartBlock(2000); + builder->Add("bar"); + + // Second filter + builder->StartBlock(3100); + builder->Add("box"); + + // Third filter is empty + + // Last filter + 
builder->StartBlock(9000); + builder->Add("box"); + builder->Add("hello"); + + BlockContents block(builder->Finish(), false, kNoCompression); + FilterBlockReader* reader = new BlockBasedFilterBlockReader( + nullptr, table_options_, std::move(block)); + + // Check first filter + ASSERT_TRUE(reader->KeyMayMatch("foo", 0)); + ASSERT_TRUE(reader->KeyMayMatch("bar", 2000)); + ASSERT_TRUE(!reader->KeyMayMatch("box", 0)); + ASSERT_TRUE(!reader->KeyMayMatch("hello", 0)); + + // Check second filter + ASSERT_TRUE(reader->KeyMayMatch("box", 3100)); + ASSERT_TRUE(!reader->KeyMayMatch("foo", 3100)); + ASSERT_TRUE(!reader->KeyMayMatch("bar", 3100)); + ASSERT_TRUE(!reader->KeyMayMatch("hello", 3100)); + + // Check third filter (empty) + ASSERT_TRUE(!reader->KeyMayMatch("foo", 4100)); + ASSERT_TRUE(!reader->KeyMayMatch("bar", 4100)); + ASSERT_TRUE(!reader->KeyMayMatch("box", 4100)); + ASSERT_TRUE(!reader->KeyMayMatch("hello", 4100)); + + // Check last filter + ASSERT_TRUE(reader->KeyMayMatch("box", 9000)); + ASSERT_TRUE(reader->KeyMayMatch("hello", 9000)); + ASSERT_TRUE(!reader->KeyMayMatch("foo", 9000)); + ASSERT_TRUE(!reader->KeyMayMatch("bar", 9000)); + + delete builder; + delete reader; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index c239bf458bb..2f373fff199 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include "db/dbformat.h" @@ -25,13 +26,14 @@ #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" #include "rocksdb/flush_block_policy.h" -#include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/block.h" #include "table/block_based_table_reader.h" #include "table/block_builder.h" #include "table/filter_block.h" +#include "table/block_based_filter_block.h" +#include "table/full_filter_block.h" #include 
"table/format.h" #include "table/meta_blocks.h" #include "table/table_builder.h" @@ -116,7 +118,7 @@ class ShortenedIndexBuilder : public IndexBuilder { public: explicit ShortenedIndexBuilder(const Comparator* comparator) : IndexBuilder(comparator), - index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {} + index_block_builder_(1 /* block_restart_interval == 1 */) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, const Slice* first_key_in_next_block, @@ -133,12 +135,12 @@ class ShortenedIndexBuilder : public IndexBuilder { index_block_builder_.Add(*last_key_in_current_block, handle_encoding); } - virtual Status Finish(IndexBlocks* index_blocks) { + virtual Status Finish(IndexBlocks* index_blocks) override { index_blocks->index_block_contents = index_block_builder_.Finish(); return Status::OK(); } - virtual size_t EstimatedSize() const { + virtual size_t EstimatedSize() const override { return index_block_builder_.CurrentSizeEstimate(); } @@ -175,14 +177,14 @@ class HashIndexBuilder : public IndexBuilder { explicit HashIndexBuilder(const Comparator* comparator, const SliceTransform* hash_key_extractor) : IndexBuilder(comparator), - primary_index_builder(comparator), + primary_index_builder_(comparator), hash_key_extractor_(hash_key_extractor) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, const Slice* first_key_in_next_block, const BlockHandle& block_handle) override { ++current_restart_index_; - primary_index_builder.AddIndexEntry(last_key_in_current_block, + primary_index_builder_.AddIndexEntry(last_key_in_current_block, first_key_in_next_block, block_handle); } @@ -213,9 +215,9 @@ class HashIndexBuilder : public IndexBuilder { } } - virtual Status Finish(IndexBlocks* index_blocks) { + virtual Status Finish(IndexBlocks* index_blocks) override { FlushPendingPrefix(); - primary_index_builder.Finish(index_blocks); + primary_index_builder_.Finish(index_blocks); index_blocks->meta_blocks.insert( 
{kHashIndexPrefixesBlock.c_str(), prefix_block_}); index_blocks->meta_blocks.insert( @@ -223,8 +225,8 @@ class HashIndexBuilder : public IndexBuilder { return Status::OK(); } - virtual size_t EstimatedSize() const { - return primary_index_builder.EstimatedSize() + prefix_block_.size() + + virtual size_t EstimatedSize() const override { + return primary_index_builder_.EstimatedSize() + prefix_block_.size() + prefix_meta_block_.size(); } @@ -237,7 +239,7 @@ class HashIndexBuilder : public IndexBuilder { PutVarint32(&prefix_meta_block_, pending_block_num_); } - ShortenedIndexBuilder primary_index_builder; + ShortenedIndexBuilder primary_index_builder_; const SliceTransform* hash_key_extractor_; // stores a sequence of prefixes @@ -275,6 +277,21 @@ IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator, return nullptr; } +// Create a filter block builder based on its type. +FilterBlockBuilder* CreateFilterBlockBuilder(const ImmutableCFOptions& opt, + const BlockBasedTableOptions& table_opt) { + if (table_opt.filter_policy == nullptr) return nullptr; + + FilterBitsBuilder* filter_bits_builder = + table_opt.filter_policy->GetFilterBitsBuilder(); + if (filter_bits_builder == nullptr) { + return new BlockBasedFilterBlockBuilder(opt.prefix_extractor, table_opt); + } else { + return new FullFilterBlockBuilder(opt.prefix_extractor, table_opt, + filter_bits_builder); + } +} + bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) { // Check to see if compressed less than 12.5% return compressed_size < raw_size - (raw_size / 8u); @@ -366,7 +383,6 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector std::string val; PutFixed32(&val, static_cast(index_type_)); properties->insert({BlockBasedTablePropertyNames::kIndexType, val}); - return Status::OK(); } @@ -385,7 +401,8 @@ }; struct BlockBasedTableBuilder::Rep { - Options options; + const ImmutableCFOptions ioptions; + const
BlockBasedTableOptions table_options; const InternalKeyComparator& internal_comparator; WritableFile* file; uint64_t offset = 0; @@ -396,12 +413,12 @@ struct BlockBasedTableBuilder::Rep { std::unique_ptr index_builder; std::string last_key; - CompressionType compression_type; - ChecksumType checksum_type; + const CompressionType compression_type; + const CompressionOptions compression_opts; TableProperties props; bool closed = false; // Either Finish() or Abandon() has been called. - FilterBlockBuilder* filter_block; + std::unique_ptr filter_block; char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; size_t compressed_cache_key_prefix_size; @@ -413,48 +430,49 @@ struct BlockBasedTableBuilder::Rep { std::vector> table_properties_collectors; - Rep(const Options& opt, const InternalKeyComparator& icomparator, - WritableFile* f, FlushBlockPolicyFactory* flush_block_policy_factory, - CompressionType compression_type, IndexType index_block_type, - ChecksumType checksum_type) - : options(opt), + Rep(const ImmutableCFOptions& ioptions, + const BlockBasedTableOptions& table_opt, + const InternalKeyComparator& icomparator, + WritableFile* f, const CompressionType compression_type, + const CompressionOptions& compression_opts) + : ioptions(ioptions), + table_options(table_opt), internal_comparator(icomparator), file(f), - data_block(options, &internal_comparator), - internal_prefix_transform(options.prefix_extractor.get()), - index_builder(CreateIndexBuilder(index_block_type, &internal_comparator, - &this->internal_prefix_transform)), + data_block(table_options.block_restart_interval), + internal_prefix_transform(ioptions.prefix_extractor), + index_builder(CreateIndexBuilder( + table_options.index_type, &internal_comparator, + &this->internal_prefix_transform)), compression_type(compression_type), - checksum_type(checksum_type), - filter_block(opt.filter_policy == nullptr - ? 
nullptr - : new FilterBlockBuilder(opt, &internal_comparator)), - flush_block_policy(flush_block_policy_factory->NewFlushBlockPolicy( - options, data_block)) { + filter_block(CreateFilterBlockBuilder(ioptions, table_options)), + flush_block_policy( + table_options.flush_block_policy_factory->NewFlushBlockPolicy( + table_options, data_block)) { for (auto& collector_factories : - options.table_properties_collector_factories) { + ioptions.table_properties_collector_factories) { table_properties_collectors.emplace_back( collector_factories->CreateTablePropertiesCollector()); } table_properties_collectors.emplace_back( - new BlockBasedTablePropertiesCollector(index_block_type)); + new BlockBasedTablePropertiesCollector(table_options.index_type)); } }; BlockBasedTableBuilder::BlockBasedTableBuilder( - const Options& options, const BlockBasedTableOptions& table_options, + const ImmutableCFOptions& ioptions, + const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, WritableFile* file, - CompressionType compression_type) - : rep_(new Rep(options, internal_comparator, file, - table_options.flush_block_policy_factory.get(), - compression_type, table_options.index_type, - table_options.checksum)) { + const CompressionType compression_type, + const CompressionOptions& compression_opts) + : rep_(new Rep(ioptions, table_options, internal_comparator, + file, compression_type, compression_opts)) { if (rep_->filter_block != nullptr) { rep_->filter_block->StartBlock(0); } - if (options.block_cache_compressed.get() != nullptr) { + if (table_options.block_cache_compressed.get() != nullptr) { BlockBasedTable::GenerateCachePrefix( - options.block_cache_compressed.get(), file, + table_options.block_cache_compressed.get(), file, &rep_->compressed_cache_key_prefix[0], &rep_->compressed_cache_key_prefix_size); } @@ -462,7 +480,6 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( BlockBasedTableBuilder::~BlockBasedTableBuilder() { assert(rep_->closed); 
// Catch errors where caller forgot to call Finish() - delete rep_->filter_block; delete rep_; } @@ -493,7 +510,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { } if (r->filter_block != nullptr) { - r->filter_block->AddKey(key); + r->filter_block->Add(ExtractUserKey(key)); } r->last_key.assign(key.data(), key.size()); @@ -504,7 +521,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->index_builder->OnKeyAdded(key); NotifyCollectTableCollectorsOnAdd(key, value, r->table_properties_collectors, - r->options.info_log.get()); + r->ioptions.info_log); } void BlockBasedTableBuilder::Flush() { @@ -542,10 +559,10 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, Slice block_contents; if (raw_block_contents.size() < kCompressionSizeLimit) { block_contents = - CompressBlock(raw_block_contents, r->options.compression_opts, &type, + CompressBlock(raw_block_contents, r->compression_opts, &type, &r->compressed_output); } else { - RecordTick(r->options.statistics.get(), NUMBER_BLOCK_NOT_COMPRESSED); + RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); type = kNoCompression; block_contents = raw_block_contents; } @@ -557,8 +574,7 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, CompressionType type, BlockHandle* handle) { Rep* r = rep_; - StopWatch sw(r->options.env, r->options.statistics.get(), - WRITE_RAW_BLOCK_MICROS); + StopWatch sw(r->ioptions.env, r->ioptions.statistics, WRITE_RAW_BLOCK_MICROS); handle->set_offset(r->offset); handle->set_size(block_contents.size()); r->status = r->file->Append(block_contents); @@ -566,7 +582,7 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, char trailer[kBlockTrailerSize]; trailer[0] = type; char* trailer_without_type = trailer + 1; - switch (r->checksum_type) { + switch (r->table_options.checksum) { case kNoChecksum: // we don't support no checksum yet assert(false); @@ -612,25 +628,20 @@ 
Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, const CompressionType type, const BlockHandle* handle) { Rep* r = rep_; - Cache* block_cache_compressed = r->options.block_cache_compressed.get(); + Cache* block_cache_compressed = r->table_options.block_cache_compressed.get(); if (type != kNoCompression && block_cache_compressed != nullptr) { Cache::Handle* cache_handle = nullptr; size_t size = block_contents.size(); - char* ubuf = new char[size + 1]; // make a new copy - memcpy(ubuf, block_contents.data(), size); + std::unique_ptr ubuf(new char[size + 1]); + memcpy(ubuf.get(), block_contents.data(), size); ubuf[size] = type; - BlockContents results; - Slice sl(ubuf, size); - results.data = sl; - results.cachable = true; // XXX - results.heap_allocated = true; - results.compression_type = type; + BlockContents results(std::move(ubuf), size, true, type); - Block* block = new Block(results); + Block* block = new Block(std::move(results)); // make cache key by appending the file offset to the cache prefix id char* end = EncodeVarint64( @@ -658,10 +669,7 @@ Status BlockBasedTableBuilder::Finish() { assert(!r->closed); r->closed = true; - BlockHandle filter_block_handle, - metaindex_block_handle, - index_block_handle; - + BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle; // Write filter block if (ok() && r->filter_block != nullptr) { auto filter_contents = r->filter_block->Finish(); @@ -700,8 +708,13 @@ Status BlockBasedTableBuilder::Finish() { if (r->filter_block != nullptr) { // Add mapping from ".Name" to location // of filter data. 
- std::string key = BlockBasedTable::kFilterBlockPrefix; - key.append(r->options.filter_policy->Name()); + std::string key; + if (r->filter_block->IsBlockBased()) { + key = BlockBasedTable::kFilterBlockPrefix; + } else { + key = BlockBasedTable::kFullFilterBlockPrefix; + } + key.append(r->table_options.filter_policy->Name()); meta_index_builder.Add(key, filter_block_handle); } @@ -709,8 +722,8 @@ Status BlockBasedTableBuilder::Finish() { { PropertyBlockBuilder property_block_builder; std::vector failed_user_prop_collectors; - r->props.filter_policy_name = r->options.filter_policy != nullptr ? - r->options.filter_policy->Name() : ""; + r->props.filter_policy_name = r->table_options.filter_policy != nullptr ? + r->table_options.filter_policy->Name() : ""; r->props.index_size = r->index_builder->EstimatedSize() + kBlockTrailerSize; @@ -719,7 +732,7 @@ Status BlockBasedTableBuilder::Finish() { // Add use collected properties NotifyCollectTableCollectorsOnFinish(r->table_properties_collectors, - r->options.info_log.get(), + r->ioptions.info_log, &property_block_builder); BlockHandle properties_block_handle; @@ -750,12 +763,12 @@ Status BlockBasedTableBuilder::Finish() { // TODO(icanadi) at some point in the future, when we're absolutely sure // nobody will roll back to RocksDB 2.x versions, retire the legacy magic // number and always write new table files with new magic number - bool legacy = (r->checksum_type == kCRC32c); + bool legacy = (r->table_options.checksum == kCRC32c); Footer footer(legacy ? 
kLegacyBlockBasedTableMagicNumber : kBlockBasedTableMagicNumber); footer.set_metaindex_handle(metaindex_block_handle); footer.set_index_handle(index_block_handle); - footer.set_checksum(r->checksum_type); + footer.set_checksum(r->table_options.checksum); std::string footer_encoding; footer.EncodeTo(&footer_encoding); r->status = r->file->Append(footer_encoding); @@ -778,14 +791,12 @@ Status BlockBasedTableBuilder::Finish() { } } - Log( - r->options.info_log, + Log(r->ioptions.info_log, "Table was constructed:\n" " [basic properties]: %s\n" " [user collected properties]: %s", r->props.ToString().c_str(), - user_collected.c_str() - ); + user_collected.c_str()); } return r->status; @@ -806,5 +817,6 @@ uint64_t BlockBasedTableBuilder::FileSize() const { } const std::string BlockBasedTable::kFilterBlockPrefix = "filter."; +const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter."; } // namespace rocksdb diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h index 72a2f207a66..6fde329199f 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based_table_builder.h @@ -28,10 +28,12 @@ class BlockBasedTableBuilder : public TableBuilder { // Create a builder that will store the contents of the table it is // building in *file. Does not close the file. It is up to the // caller to close the file after calling Finish(). - BlockBasedTableBuilder(const Options& options, + BlockBasedTableBuilder(const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type); + WritableFile* file, + const CompressionType compression_type, + const CompressionOptions& compression_opts); // REQUIRES: Either Finish() or Abandon() has been called. 
~BlockBasedTableBuilder(); diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 22fd0dd939c..b4e2e7d1fec 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -15,6 +15,7 @@ #include #include "rocksdb/flush_block_policy.h" +#include "rocksdb/cache.h" #include "table/block_based_table_builder.h" #include "table/block_based_table_reader.h" #include "port/port.h" @@ -28,27 +29,100 @@ BlockBasedTableFactory::BlockBasedTableFactory( table_options_.flush_block_policy_factory.reset( new FlushBlockBySizePolicyFactory()); } + if (table_options_.no_block_cache) { + table_options_.block_cache.reset(); + } else if (table_options_.block_cache == nullptr) { + table_options_.block_cache = NewLRUCache(8 << 20); + } + if (table_options_.block_size_deviation < 0 || + table_options_.block_size_deviation > 100) { + table_options_.block_size_deviation = 0; + } } Status BlockBasedTableFactory::NewTableReader( - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& ioptions, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { - return BlockBasedTable::Open(options, soptions, table_options_, + return BlockBasedTable::Open(ioptions, soptions, table_options_, internal_comparator, std::move(file), file_size, table_reader); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const { + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts) const { + auto table_builder = new BlockBasedTableBuilder( - options, table_options_, internal_comparator, file, compression_type); + ioptions, table_options_, internal_comparator, file, 
+ compression_type, compression_opts); return table_builder; } +std::string BlockBasedTableFactory::GetPrintableTableOptions() const { + std::string ret; + ret.reserve(20000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + snprintf(buffer, kBufferSize, " flush_block_policy_factory: %s (%p)\n", + table_options_.flush_block_policy_factory->Name(), + table_options_.flush_block_policy_factory.get()); + ret.append(buffer); + snprintf(buffer, kBufferSize, " cache_index_and_filter_blocks: %d\n", + table_options_.cache_index_and_filter_blocks); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_type: %d\n", + table_options_.index_type); + ret.append(buffer); + snprintf(buffer, kBufferSize, " hash_index_allow_collision: %d\n", + table_options_.hash_index_allow_collision); + ret.append(buffer); + snprintf(buffer, kBufferSize, " checksum: %d\n", + table_options_.checksum); + ret.append(buffer); + snprintf(buffer, kBufferSize, " no_block_cache: %d\n", + table_options_.no_block_cache); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_cache: %p\n", + table_options_.block_cache.get()); + ret.append(buffer); + if (table_options_.block_cache) { + snprintf(buffer, kBufferSize, " block_cache_size: %zd\n", + table_options_.block_cache->GetCapacity()); + ret.append(buffer); + } + snprintf(buffer, kBufferSize, " block_cache_compressed: %p\n", + table_options_.block_cache_compressed.get()); + ret.append(buffer); + if (table_options_.block_cache_compressed) { + snprintf(buffer, kBufferSize, " block_cache_compressed_size: %zd\n", + table_options_.block_cache_compressed->GetCapacity()); + ret.append(buffer); + } + snprintf(buffer, kBufferSize, " block_size: %zd\n", + table_options_.block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_size_deviation: %d\n", + table_options_.block_size_deviation); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_restart_interval: %d\n", + table_options_.block_restart_interval); + 
ret.append(buffer); + snprintf(buffer, kBufferSize, " filter_policy: %s\n", + table_options_.filter_policy == nullptr ? + "nullptr" : table_options_.filter_policy->Name()); + ret.append(buffer); + snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n", + table_options_.whole_key_filtering); + ret.append(buffer); + return ret; +} + TableFactory* NewBlockBasedTableFactory( const BlockBasedTableOptions& table_options) { return new BlockBasedTableFactory(table_options); diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index 656b531aece..2dcfda6d484 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -14,12 +14,11 @@ #include #include "rocksdb/flush_block_policy.h" -#include "rocksdb/options.h" #include "rocksdb/table.h" +#include "db/dbformat.h" namespace rocksdb { -struct Options; struct EnvOptions; using std::unique_ptr; @@ -34,14 +33,24 @@ class BlockBasedTableFactory : public TableFactory { const char* Name() const override { return "BlockBasedTable"; } - Status NewTableReader(const Options& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader) const override; + Status NewTableReader( + const ImmutableCFOptions& ioptions, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const override; TableBuilder* NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const override; + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts) const override; + + // Sanitizes the specified DB Options. 
+ Status SanitizeDBOptions(const DBOptions* db_opts) const override { + return Status::OK(); + } + + std::string GetPrintableTableOptions() const override; private: BlockBasedTableOptions table_options_; diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index a0ce32a75d2..09328dc3b21 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -26,6 +26,8 @@ #include "table/block.h" #include "table/filter_block.h" +#include "table/block_based_filter_block.h" +#include "table/full_filter_block.h" #include "table/block_hash_index.h" #include "table/block_prefix_index.h" #include "table/format.h" @@ -46,7 +48,6 @@ using std::unique_ptr; typedef BlockBasedTable::IndexReader IndexReader; namespace { - // The longest the prefix of the cache key used to identify blocks can be. // We are using the fact that we know for Posix files the unique ID is three // varints. @@ -65,7 +66,7 @@ Status ReadBlockFromFile(RandomAccessFile* file, const Footer& footer, Status s = ReadBlockContents(file, footer, options, handle, &contents, env, do_uncompress); if (s.ok()) { - *result = new Block(contents); + *result = new Block(std::move(contents)); } return s; @@ -137,7 +138,8 @@ class BlockBasedTable::IndexReader { // Create an iterator for index access. // An iter is passed in, if it is not null, update this one and return it // If it is null, create a new Iterator - virtual Iterator* NewIterator(BlockIter* iter = nullptr) = 0; + virtual Iterator* NewIterator( + BlockIter* iter = nullptr, bool total_order_seek = true) = 0; // The size of the index. 
virtual size_t size() const = 0; @@ -174,8 +176,9 @@ class BinarySearchIndexReader : public IndexReader { return s; } - virtual Iterator* NewIterator(BlockIter* iter = nullptr) override { - return index_block_->NewIterator(comparator_, iter); + virtual Iterator* NewIterator( + BlockIter* iter = nullptr, bool dont_care = true) override { + return index_block_->NewIterator(comparator_, iter, true); } virtual size_t size() const override { return index_block_->size(); } @@ -249,9 +252,6 @@ class HashIndexReader : public IndexReader { &prefixes_meta_contents, env, true /* do decompression */); if (!s.ok()) { - if (prefixes_contents.heap_allocated) { - delete[] prefixes_contents.data.data(); - } // TODO: log error return Status::OK(); } @@ -266,7 +266,7 @@ class HashIndexReader : public IndexReader { // TODO: log error if (s.ok()) { new_index_reader->index_block_->SetBlockHashIndex(hash_index); - new_index_reader->OwnPrefixesContents(prefixes_contents); + new_index_reader->OwnPrefixesContents(std::move(prefixes_contents)); } } else { BlockPrefixIndex* prefix_index = nullptr; @@ -280,23 +280,12 @@ class HashIndexReader : public IndexReader { } } - // Always release prefix meta block - if (prefixes_meta_contents.heap_allocated) { - delete[] prefixes_meta_contents.data.data(); - } - - // Release prefix content block if we don't own it. 
- if (!new_index_reader->own_prefixes_contents_) { - if (prefixes_contents.heap_allocated) { - delete[] prefixes_contents.data.data(); - } - } - return Status::OK(); } - virtual Iterator* NewIterator(BlockIter* iter = nullptr) override { - return index_block_->NewIterator(comparator_, iter); + virtual Iterator* NewIterator( + BlockIter* iter = nullptr, bool total_order_seek = true) override { + return index_block_->NewIterator(comparator_, iter, total_order_seek); } virtual size_t size() const override { return index_block_->size(); } @@ -309,36 +298,35 @@ class HashIndexReader : public IndexReader { private: HashIndexReader(const Comparator* comparator, Block* index_block) - : IndexReader(comparator), - index_block_(index_block), - own_prefixes_contents_(false) { + : IndexReader(comparator), index_block_(index_block) { assert(index_block_ != nullptr); } ~HashIndexReader() { - if (own_prefixes_contents_ && prefixes_contents_.heap_allocated) { - delete[] prefixes_contents_.data.data(); - } } - void OwnPrefixesContents(const BlockContents& prefixes_contents) { - prefixes_contents_ = prefixes_contents; - own_prefixes_contents_ = true; + void OwnPrefixesContents(BlockContents&& prefixes_contents) { + prefixes_contents_ = std::move(prefixes_contents); } std::unique_ptr index_block_; - bool own_prefixes_contents_; BlockContents prefixes_contents_; }; struct BlockBasedTable::Rep { - Rep(const EnvOptions& storage_options, + Rep(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, + const BlockBasedTableOptions& table_opt, const InternalKeyComparator& internal_comparator) - : soptions(storage_options), internal_comparator(internal_comparator) {} - - Options options; - const EnvOptions& soptions; + : ioptions(ioptions), env_options(env_options), table_options(table_opt), + filter_policy(table_opt.filter_policy.get()), + internal_comparator(internal_comparator) {} + + const ImmutableCFOptions& ioptions; + const EnvOptions& env_options; + const 
BlockBasedTableOptions& table_options; + const FilterPolicy* const filter_policy; const InternalKeyComparator& internal_comparator; Status status; unique_ptr file; @@ -398,13 +386,13 @@ void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { assert(kMaxCacheKeyPrefixSize >= 10); rep->cache_key_prefix_size = 0; rep->compressed_cache_key_prefix_size = 0; - if (rep->options.block_cache != nullptr) { - GenerateCachePrefix(rep->options.block_cache.get(), rep->file.get(), + if (rep->table_options.block_cache != nullptr) { + GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file.get(), &rep->cache_key_prefix[0], &rep->cache_key_prefix_size); } - if (rep->options.block_cache_compressed != nullptr) { - GenerateCachePrefix(rep->options.block_cache_compressed.get(), + if (rep->table_options.block_cache_compressed != nullptr) { + GenerateCachePrefix(rep->table_options.block_cache_compressed.get(), rep->file.get(), &rep->compressed_cache_key_prefix[0], &rep->compressed_cache_key_prefix_size); } @@ -438,7 +426,8 @@ void BlockBasedTable::GenerateCachePrefix(Cache* cc, } } -Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, +Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, @@ -452,8 +441,8 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, // We've successfully read the footer and the index block: we're // ready to serve requests. 
- Rep* rep = new BlockBasedTable::Rep(soptions, internal_comparator); - rep->options = options; + Rep* rep = new BlockBasedTable::Rep( + ioptions, env_options, table_options, internal_comparator); rep->file = std::move(file); rep->footer = footer; rep->index_type = table_options.index_type; @@ -475,7 +464,7 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, TableProperties* table_properties = nullptr; if (s.ok()) { s = ReadProperties(meta_iter->value(), rep->file.get(), rep->footer, - rep->options.env, rep->options.info_log.get(), + rep->ioptions.env, rep->ioptions.info_log, &table_properties); } @@ -483,17 +472,18 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, auto err_msg = "[Warning] Encountered error while reading data from properties " "block " + s.ToString(); - Log(rep->options.info_log, "%s", err_msg.c_str()); + Log(rep->ioptions.info_log, "%s", err_msg.c_str()); } else { rep->table_properties.reset(table_properties); } } else { - Log(WARN_LEVEL, rep->options.info_log, + Log(WARN_LEVEL, rep->ioptions.info_log, "Cannot find Properties block from file."); } // Will use block cache for index/filter blocks access? 
- if (options.block_cache && table_options.cache_index_and_filter_blocks) { + if (table_options.block_cache && + table_options.cache_index_and_filter_blocks) { // Hack: Call NewIndexIterator() to implicitly add index to the block_cache unique_ptr iter(new_table->NewIndexIterator(ReadOptions())); s = iter->status(); @@ -501,26 +491,32 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, if (s.ok()) { // Hack: Call GetFilter() to implicitly add filter to the block_cache auto filter_entry = new_table->GetFilter(); - filter_entry.Release(options.block_cache.get()); + filter_entry.Release(table_options.block_cache.get()); } } else { // If we don't use block cache for index/filter blocks access, we'll // pre-load these blocks, which will kept in member variables in Rep // and with a same life-time as this table object. IndexReader* index_reader = nullptr; - // TODO: we never really verify check sum for index block s = new_table->CreateIndexReader(&index_reader, meta_iter.get()); if (s.ok()) { rep->index_reader.reset(index_reader); // Set filter block - if (rep->options.filter_policy) { - std::string key = kFilterBlockPrefix; - key.append(rep->options.filter_policy->Name()); - BlockHandle handle; - if (FindMetaBlock(meta_iter.get(), key, &handle).ok()) { - rep->filter.reset(ReadFilter(handle, rep)); + if (rep->filter_policy) { + // First try reading full_filter, then reading block_based_filter + for (auto filter_block_prefix : { kFullFilterBlockPrefix, + kFilterBlockPrefix }) { + std::string key = filter_block_prefix; + key.append(rep->filter_policy->Name()); + + BlockHandle handle; + if (FindMetaBlock(meta_iter.get(), key, &handle).ok()) { + rep->filter.reset(ReadFilter(handle, rep, + filter_block_prefix, nullptr)); + break; + } } } } else { @@ -536,7 +532,7 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, } void BlockBasedTable::SetupForCompaction() { - switch 
(rep_->options.access_hint_on_compaction_start) { + switch (rep_->ioptions.access_hint_on_compaction_start) { case Options::NONE: break; case Options::NORMAL: @@ -586,13 +582,13 @@ Status BlockBasedTable::ReadMetaBlock( ReadOptions(), rep->footer.metaindex_handle(), &meta, - rep->options.env); + rep->ioptions.env); if (!s.ok()) { auto err_msg = "[Warning] Encountered error while reading data from properties" "block " + s.ToString(); - Log(rep->options.info_log, "%s", err_msg.c_str()); + Log(rep->ioptions.info_log, "%s", err_msg.c_str()); } if (!s.ok()) { delete meta; @@ -656,7 +652,7 @@ Status BlockBasedTable::GetDataBlockFromCache( // Insert uncompressed block into block cache if (s.ok()) { - block->value = new Block(contents); // uncompressed block + block->value = new Block(std::move(contents)); // uncompressed block assert(block->value->compression_type() == kNoCompression); if (block_cache != nullptr && block->value->cachable() && read_options.fill_cache) { @@ -694,7 +690,7 @@ Status BlockBasedTable::PutDataBlockToCache( } if (raw_block->compression_type() != kNoCompression) { - block->value = new Block(contents); // uncompressed block + block->value = new Block(std::move(contents)); // uncompressed block } else { block->value = raw_block; raw_block = nullptr; @@ -728,15 +724,15 @@ Status BlockBasedTable::PutDataBlockToCache( return s; } -FilterBlockReader* BlockBasedTable::ReadFilter(const BlockHandle& filter_handle, - BlockBasedTable::Rep* rep, - size_t* filter_size) { +FilterBlockReader* BlockBasedTable::ReadFilter( + const BlockHandle& filter_handle, BlockBasedTable::Rep* rep, + const std::string& filter_block_prefix, size_t* filter_size) { // TODO: We might want to unify with ReadBlockFromFile() if we start // requiring checksum verification in Table::Open. 
ReadOptions opt; BlockContents block; if (!ReadBlockContents(rep->file.get(), rep->footer, opt, filter_handle, - &block, rep->options.env, false).ok()) { + &block, rep->ioptions.env, false).ok()) { return nullptr; } @@ -744,41 +740,52 @@ FilterBlockReader* BlockBasedTable::ReadFilter(const BlockHandle& filter_handle, *filter_size = block.data.size(); } - return new FilterBlockReader( - rep->options, block.data, block.heap_allocated); + assert(rep->filter_policy); + if (kFilterBlockPrefix == filter_block_prefix) { + return new BlockBasedFilterBlockReader( + rep->ioptions.prefix_extractor, rep->table_options, std::move(block)); + } else if (kFullFilterBlockPrefix == filter_block_prefix) { + auto filter_bits_reader = rep->filter_policy-> + GetFilterBitsReader(block.data); + + if (filter_bits_reader != nullptr) { + return new FullFilterBlockReader(rep->ioptions.prefix_extractor, + rep->table_options, std::move(block), + filter_bits_reader); + } + } + return nullptr; } BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( - bool no_io) const { + bool no_io) const { // filter pre-populated if (rep_->filter != nullptr) { return {rep_->filter.get(), nullptr /* cache handle */}; } - if (rep_->options.filter_policy == nullptr /* do not use filter at all */ || - rep_->options.block_cache == nullptr /* no block cache at all */) { + Cache* block_cache = rep_->table_options.block_cache.get(); + if (rep_->filter_policy == nullptr /* do not use filter */ || + block_cache == nullptr /* no block cache at all */) { return {nullptr /* filter */, nullptr /* cache handle */}; } // Fetching from the cache - Cache* block_cache = rep_->options.block_cache.get(); char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = GetCacheKey( - rep_->cache_key_prefix, - rep_->cache_key_prefix_size, - rep_->footer.metaindex_handle(), - cache_key + auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + rep_->footer.metaindex_handle(), + cache_key ); - 
Statistics* statistics = rep_->options.statistics.get(); + Statistics* statistics = rep_->ioptions.statistics; auto cache_handle = GetEntryFromCache(block_cache, key, BLOCK_CACHE_FILTER_MISS, BLOCK_CACHE_FILTER_HIT, statistics); FilterBlockReader* filter = nullptr; if (cache_handle != nullptr) { - filter = reinterpret_cast( - block_cache->Value(cache_handle)); + filter = reinterpret_cast( + block_cache->Value(cache_handle)); } else if (no_io) { // Do not invoke any io. return CachableEntry(); @@ -789,17 +796,22 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( auto s = ReadMetaBlock(rep_, &meta, &iter); if (s.ok()) { - std::string filter_block_key = kFilterBlockPrefix; - filter_block_key.append(rep_->options.filter_policy->Name()); - BlockHandle handle; - if (FindMetaBlock(iter.get(), filter_block_key, &handle).ok()) { - filter = ReadFilter(handle, rep_, &filter_size); - assert(filter); - assert(filter_size > 0); - - cache_handle = block_cache->Insert( - key, filter, filter_size, &DeleteCachedEntry); - RecordTick(statistics, BLOCK_CACHE_ADD); + // First try reading full_filter, then reading block_based_filter + for (auto filter_block_prefix : {kFullFilterBlockPrefix, + kFilterBlockPrefix}) { + std::string filter_block_key = filter_block_prefix; + filter_block_key.append(rep_->filter_policy->Name()); + BlockHandle handle; + if (FindMetaBlock(iter.get(), filter_block_key, &handle).ok()) { + filter = ReadFilter(handle, rep_, filter_block_prefix, &filter_size); + + if (filter == nullptr) break; // err happen in ReadFilter + assert(filter_size > 0); + cache_handle = block_cache->Insert( + key, filter, filter_size, &DeleteCachedEntry); + RecordTick(statistics, BLOCK_CACHE_ADD); + break; + } } } } @@ -811,15 +823,16 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options, BlockIter* input_iter) { // index reader has already been pre-populated. 
if (rep_->index_reader) { - return rep_->index_reader->NewIterator(input_iter); + return rep_->index_reader->NewIterator( + input_iter, read_options.total_order_seek); } bool no_io = read_options.read_tier == kBlockCacheTier; - Cache* block_cache = rep_->options.block_cache.get(); + Cache* block_cache = rep_->table_options.block_cache.get(); char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, rep_->footer.index_handle(), cache_key); - Statistics* statistics = rep_->options.statistics.get(); + Statistics* statistics = rep_->ioptions.statistics; auto cache_handle = GetEntryFromCache(block_cache, key, BLOCK_CACHE_INDEX_MISS, BLOCK_CACHE_INDEX_HIT, statistics); @@ -859,10 +872,9 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options, } assert(cache_handle); - Iterator* iter; - iter = index_reader->NewIterator(input_iter); + auto* iter = index_reader->NewIterator( + input_iter, read_options.total_order_seek); iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle); - return iter; } @@ -874,9 +886,9 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, const ReadOptions& ro, const Slice& index_value, BlockIter* input_iter) { const bool no_io = (ro.read_tier == kBlockCacheTier); - Cache* block_cache = rep->options.block_cache.get(); - Cache* block_cache_compressed = rep->options. - block_cache_compressed.get(); + Cache* block_cache = rep->table_options.block_cache.get(); + Cache* block_cache_compressed = + rep->table_options.block_cache_compressed.get(); CachableEntry block; BlockHandle handle; @@ -896,7 +908,7 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, // If either block cache is enabled, we'll try to read from it. 
if (block_cache != nullptr || block_cache_compressed != nullptr) { - Statistics* statistics = rep->options.statistics.get(); + Statistics* statistics = rep->ioptions.statistics; char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; Slice key, /* key to the block cache */ @@ -904,8 +916,8 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, // create key for block cache if (block_cache != nullptr) { - key = GetCacheKey(rep->cache_key_prefix, - rep->cache_key_prefix_size, handle, cache_key); + key = GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size, + handle, cache_key); } if (block_cache_compressed != nullptr) { @@ -920,9 +932,9 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, if (block.value == nullptr && !no_io && ro.fill_cache) { Block* raw_block = nullptr; { - StopWatch sw(rep->options.env, statistics, READ_BLOCK_GET_MICROS); + StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS); s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle, - &raw_block, rep->options.env, + &raw_block, rep->ioptions.env, block_cache_compressed == nullptr); } @@ -945,7 +957,7 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, } } s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle, - &block.value, rep->options.env); + &block.value, rep->ioptions.env); } Iterator* iter; @@ -972,7 +984,8 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { public: BlockEntryIteratorState(BlockBasedTable* table, const ReadOptions& read_options) - : TwoLevelIteratorState(table->rep_->options.prefix_extractor != nullptr), + : TwoLevelIteratorState( + table->rep_->ioptions.prefix_extractor != nullptr), table_(table), read_options_(read_options) {} @@ -981,6 +994,9 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { } bool PrefixMayMatch(const Slice& internal_key) override { + if 
(read_options_.total_order_seek) { + return true; + } return table_->PrefixMayMatch(internal_key); } @@ -992,8 +1008,8 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { // This will be broken if the user specifies an unusual implementation // of Options.comparator, or if the user specifies an unusual -// definition of prefixes in Options.filter_policy. In particular, we -// require the following three properties: +// definition of prefixes in BlockBasedTableOptions.filter_policy. +// In particular, we require the following three properties: // // 1) key.starts_with(prefix(key)) // 2) Compare(prefix(key), key) <= 0. @@ -1003,12 +1019,12 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { // // REQUIRES: this method shouldn't be called while the DB lock is held. bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { - if (!rep_->options.filter_policy) { + if (!rep_->filter_policy) { return true; } - assert(rep_->options.prefix_extractor != nullptr); - auto prefix = rep_->options.prefix_extractor->Transform( + assert(rep_->ioptions.prefix_extractor != nullptr); + auto prefix = rep_->ioptions.prefix_extractor->Transform( ExtractUserKey(internal_key)); InternalKey internal_key_prefix(prefix, 0, kTypeValue); auto internal_prefix = internal_key_prefix.Encode(); @@ -1021,51 +1037,59 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { // loaded to memory. 
ReadOptions no_io_read_options; no_io_read_options.read_tier = kBlockCacheTier; - unique_ptr iiter(NewIndexIterator(no_io_read_options)); - iiter->Seek(internal_prefix); - - if (!iiter->Valid()) { - // we're past end of file - // if it's incomplete, it means that we avoided I/O - // and we're not really sure that we're past the end - // of the file - may_match = iiter->status().IsIncomplete(); - } else if (ExtractUserKey(iiter->key()).starts_with( - ExtractUserKey(internal_prefix))) { - // we need to check for this subtle case because our only - // guarantee is that "the key is a string >= last key in that data - // block" according to the doc/table_format.txt spec. - // - // Suppose iiter->key() starts with the desired prefix; it is not - // necessarily the case that the corresponding data block will - // contain the prefix, since iiter->key() need not be in the - // block. However, the next data block may contain the prefix, so - // we return true to play it safe. - may_match = true; - } else { - // iiter->key() does NOT start with the desired prefix. Because - // Seek() finds the first key that is >= the seek target, this - // means that iiter->key() > prefix. Thus, any data blocks coming - // after the data block corresponding to iiter->key() cannot - // possibly contain the key. Thus, the corresponding data block - // is the only one which could potentially contain the prefix. 
- Slice handle_value = iiter->value(); - BlockHandle handle; - s = handle.DecodeFrom(&handle_value); - assert(s.ok()); - auto filter_entry = GetFilter(true /* no io */); - may_match = - filter_entry.value == nullptr || - filter_entry.value->PrefixMayMatch(handle.offset(), internal_prefix); - filter_entry.Release(rep_->options.block_cache.get()); + + // First, try check with full filter + auto filter_entry = GetFilter(true /* no io */); + FilterBlockReader* filter = filter_entry.value; + if (filter != nullptr && !filter->IsBlockBased()) { + may_match = filter->PrefixMayMatch(prefix); } - Statistics* statistics = rep_->options.statistics.get(); + // Then, try find it within each block + if (may_match) { + unique_ptr iiter(NewIndexIterator(no_io_read_options)); + iiter->Seek(internal_prefix); + + if (!iiter->Valid()) { + // we're past end of file + // if it's incomplete, it means that we avoided I/O + // and we're not really sure that we're past the end + // of the file + may_match = iiter->status().IsIncomplete(); + } else if (ExtractUserKey(iiter->key()).starts_with( + ExtractUserKey(internal_prefix))) { + // we need to check for this subtle case because our only + // guarantee is that "the key is a string >= last key in that data + // block" according to the doc/table_format.txt spec. + // + // Suppose iiter->key() starts with the desired prefix; it is not + // necessarily the case that the corresponding data block will + // contain the prefix, since iiter->key() need not be in the + // block. However, the next data block may contain the prefix, so + // we return true to play it safe. + may_match = true; + } else if (filter != nullptr && filter->IsBlockBased()) { + // iiter->key() does NOT start with the desired prefix. Because + // Seek() finds the first key that is >= the seek target, this + // means that iiter->key() > prefix. Thus, any data blocks coming + // after the data block corresponding to iiter->key() cannot + // possibly contain the key. 
Thus, the corresponding data block + // is the only on could potentially contain the prefix. + Slice handle_value = iiter->value(); + BlockHandle handle; + s = handle.DecodeFrom(&handle_value); + assert(s.ok()); + may_match = filter->PrefixMayMatch(prefix, handle.offset()); + } + } + + Statistics* statistics = rep_->ioptions.statistics; RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); if (!may_match) { RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); } + filter_entry.Release(rep_->table_options.block_cache.get()); return may_match; } @@ -1081,65 +1105,72 @@ Status BlockBasedTable::Get( const Slice& v), void (*mark_key_may_exist_handler)(void* handle_context)) { Status s; - BlockIter iiter; - NewIndexIterator(read_options, &iiter); - auto filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier); FilterBlockReader* filter = filter_entry.value; - bool done = false; - for (iiter.Seek(key); iiter.Valid() && !done; iiter.Next()) { - Slice handle_value = iiter.value(); - BlockHandle handle; - bool may_not_exist_in_filter = - filter != nullptr && - handle.DecodeFrom(&handle_value).ok() && - !filter->KeyMayMatch(handle.offset(), key); - - if (may_not_exist_in_filter) { - // Not found - // TODO: think about interaction with Merge. If a user key cannot - // cross one data block, we should be fine. 
- RecordTick(rep_->options.statistics.get(), BLOOM_FILTER_USEFUL); - break; - } else { - BlockIter biter; - NewDataBlockIterator(rep_, read_options, iiter.value(), &biter); - - if (read_options.read_tier && biter.status().IsIncomplete()) { - // couldn't get block from block_cache - // Update Saver.state to Found because we are only looking for whether - // we can guarantee the key is not there when "no_io" is set - (*mark_key_may_exist_handler)(handle_context); - break; - } - if (!biter.status().ok()) { - s = biter.status(); - break; - } + // First check the full filter + // If full filter not useful, Then go into each block + if (filter != nullptr && !filter->IsBlockBased() + && !filter->KeyMayMatch(ExtractUserKey(key))) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + } else { + BlockIter iiter; + NewIndexIterator(read_options, &iiter); - // Call the *saver function on each entry/block until it returns false - for (biter.Seek(key); biter.Valid(); biter.Next()) { - ParsedInternalKey parsed_key; - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); - } + bool done = false; + for (iiter.Seek(key); iiter.Valid() && !done; iiter.Next()) { + Slice handle_value = iiter.value(); - if (!(*result_handler)(handle_context, parsed_key, - biter.value())) { - done = true; + BlockHandle handle; + bool not_exist_in_filter = + filter != nullptr && filter->IsBlockBased() == true && + handle.DecodeFrom(&handle_value).ok() && + !filter->KeyMayMatch(ExtractUserKey(key), handle.offset()); + + if (not_exist_in_filter) { + // Not found + // TODO: think about interaction with Merge. If a user key cannot + // cross one data block, we should be fine. 
+ RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + break; + } else { + BlockIter biter; + NewDataBlockIterator(rep_, read_options, iiter.value(), &biter); + + if (read_options.read_tier && biter.status().IsIncomplete()) { + // couldn't get block from block_cache + // Update Saver.state to Found because we are only looking for whether + // we can guarantee the key is not there when "no_io" is set + (*mark_key_may_exist_handler)(handle_context); break; } + if (!biter.status().ok()) { + s = biter.status(); + break; + } + + // Call the *saver function on each entry/block until it returns false + for (biter.Seek(key); biter.Valid(); biter.Next()) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(biter.key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + + if (!(*result_handler)(handle_context, parsed_key, + biter.value())) { + done = true; + break; + } + } + s = biter.status(); } - s = biter.status(); + } + if (s.ok()) { + s = iiter.status(); } } - filter_entry.Release(rep_->options.block_cache.get()); - if (s.ok()) { - s = iiter.status(); - } - + filter_entry.Release(rep_->table_options.block_cache.get()); return s; } @@ -1154,13 +1185,13 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, Slice input = iiter->value(); Status s = handle.DecodeFrom(&input); assert(s.ok()); - Cache* block_cache = rep_->options.block_cache.get(); + Cache* block_cache = rep_->table_options.block_cache.get(); assert(block_cache != nullptr); char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; Slice cache_key = - GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle, - cache_key_storage); + GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + handle, cache_key_storage); Slice ckey; s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr, nullptr, @@ -1194,13 +1225,13 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, } auto file = rep_->file.get(); - auto env = 
rep_->options.env; + auto env = rep_->ioptions.env; auto comparator = &rep_->internal_comparator; const Footer& footer = rep_->footer; if (index_type_on_file == BlockBasedTableOptions::kHashSearch && - rep_->options.prefix_extractor == nullptr) { - Log(rep_->options.info_log, + rep_->ioptions.prefix_extractor == nullptr) { + Log(rep_->ioptions.info_log, "BlockBasedTableOptions::kHashSearch requires " "options.prefix_extractor to be set." " Fall back to binary seach index."); @@ -1221,7 +1252,7 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, if (!s.ok()) { // we simply fall back to binary search in case there is any // problem with prefix hash index loading. - Log(rep_->options.info_log, + Log(rep_->ioptions.info_log, "Unable to read the metaindex block." " Fall back to binary seach index."); return BinarySearchIndexReader::Create( @@ -1233,7 +1264,7 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, // We need to wrap data with internal_prefix_transform to make sure it can // handle prefix correctly. 
rep_->internal_prefix_transform.reset( - new InternalKeySliceTransform(rep_->options.prefix_extractor.get())); + new InternalKeySliceTransform(rep_->ioptions.prefix_extractor)); return HashIndexReader::Create( rep_->internal_prefix_transform.get(), footer, file, env, comparator, footer.index_handle(), meta_index_iter, index_reader, diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 3ff97dda68f..503a91bb38e 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -14,6 +14,7 @@ #include #include +#include "rocksdb/options.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" @@ -27,6 +28,8 @@ class BlockIter; class BlockHandle; class Cache; class FilterBlockReader; +class BlockBasedFilterBlockReader; +class FullFilterBlockReader; class Footer; class InternalKeyComparator; class Iterator; @@ -36,7 +39,6 @@ class TableReader; class WritableFile; struct BlockBasedTableOptions; struct EnvOptions; -struct Options; struct ReadOptions; using std::unique_ptr; @@ -47,6 +49,7 @@ using std::unique_ptr; class BlockBasedTable : public TableReader { public: static const std::string kFilterBlockPrefix; + static const std::string kFullFilterBlockPrefix; // Attempt to open the table that is stored in bytes [0..file_size) // of "file", and read the metadata entries necessary to allow @@ -58,7 +61,8 @@ class BlockBasedTable : public TableReader { // to nullptr and returns a non-ok status. // // *file must remain live while this Table is in use. - static Status Open(const Options& db_options, const EnvOptions& env_options, + static Status Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_key_comparator, unique_ptr&& file, uint64_t file_size, @@ -183,7 +187,9 @@ class BlockBasedTable : public TableReader { // Create the filter from the filter block. 
static FilterBlockReader* ReadFilter(const BlockHandle& filter_handle, - Rep* rep, size_t* filter_size = nullptr); + Rep* rep, + const std::string& filter_block_prefix, + size_t* filter_size = nullptr); static void SetupCacheKeyPrefix(Rep* rep); diff --git a/table/block_builder.cc b/table/block_builder.cc index f812dbae74f..f8627743a02 100644 --- a/table/block_builder.cc +++ b/table/block_builder.cc @@ -41,10 +41,8 @@ namespace rocksdb { -BlockBuilder::BlockBuilder(int block_restart_interval, - const Comparator* comparator) +BlockBuilder::BlockBuilder(int block_restart_interval) : block_restart_interval_(block_restart_interval), - comparator_(comparator), restarts_(), counter_(0), finished_(false) { @@ -52,9 +50,6 @@ BlockBuilder::BlockBuilder(int block_restart_interval, restarts_.push_back(0); // First restart point is at offset 0 } -BlockBuilder::BlockBuilder(const Options& options, const Comparator* comparator) - : BlockBuilder(options.block_restart_interval, comparator) {} - void BlockBuilder::Reset() { buffer_.clear(); restarts_.clear(); @@ -99,8 +94,6 @@ void BlockBuilder::Add(const Slice& key, const Slice& value) { Slice last_key_piece(last_key_); assert(!finished_); assert(counter_ <= block_restart_interval_); - assert(buffer_.empty() // No values yet? 
- || comparator_->Compare(key, last_key_piece) > 0); size_t shared = 0; if (counter_ < block_restart_interval_) { // See how much sharing to do with previous string diff --git a/table/block_builder.h b/table/block_builder.h index ed2f290fd23..c01a23bea93 100644 --- a/table/block_builder.h +++ b/table/block_builder.h @@ -15,13 +15,12 @@ namespace rocksdb { -struct Options; -class Comparator; - class BlockBuilder { public: - BlockBuilder(int block_builder, const Comparator* comparator); - explicit BlockBuilder(const Options& options, const Comparator* comparator); + BlockBuilder(const BlockBuilder&) = delete; + void operator=(const BlockBuilder&) = delete; + + explicit BlockBuilder(int block_restart_interval); // Reset the contents as if the BlockBuilder was just constructed. void Reset(); @@ -49,17 +48,12 @@ class BlockBuilder { private: const int block_restart_interval_; - const Comparator* comparator_; std::string buffer_; // Destination buffer std::vector restarts_; // Restart points int counter_; // Number of entries emitted since restart bool finished_; // Has Finish() been called? 
std::string last_key_; - - // No copying allowed - BlockBuilder(const BlockBuilder&); - void operator=(const BlockBuilder&); }; } // namespace rocksdb diff --git a/table/block_prefix_index.cc b/table/block_prefix_index.cc index f06dcd9fe76..d64b73b9849 100644 --- a/table/block_prefix_index.cc +++ b/table/block_prefix_index.cc @@ -210,8 +210,8 @@ Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor, return s; } -const uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, - uint32_t** blocks) { +uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, + uint32_t** blocks) { Slice prefix = internal_prefix_extractor_->Transform(key); uint32_t bucket = PrefixToBucket(prefix, num_buckets_); diff --git a/table/block_prefix_index.h b/table/block_prefix_index.h index 2afecadd268..662bc09aaeb 100644 --- a/table/block_prefix_index.h +++ b/table/block_prefix_index.h @@ -23,7 +23,7 @@ class BlockPrefixIndex { // the key, based on the prefix. // Returns the total number of relevant blocks, 0 means the key does // not exist. 
- const uint32_t GetBlocks(const Slice& key, uint32_t** blocks); + uint32_t GetBlocks(const Slice& key, uint32_t** blocks); size_t ApproximateMemoryUsage() const { return sizeof(BlockPrefixIndex) + diff --git a/table/block_test.cc b/table/block_test.cc index 8ef4a5a8d43..6b82c4d93ee 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -76,7 +76,7 @@ TEST(BlockTest, SimpleTest) { std::vector keys; std::vector values; - BlockBuilder builder(options, ic.get()); + BlockBuilder builder(16); int num_records = 100000; GenerateRandomKVs(&keys, &values, 0, num_records); @@ -92,8 +92,7 @@ TEST(BlockTest, SimpleTest) { BlockContents contents; contents.data = rawblock; contents.cachable = false; - contents.heap_allocated = false; - Block reader(contents); + Block reader(std::move(contents)); // read contents of block sequentially int count = 0; @@ -132,8 +131,7 @@ BlockContents GetBlockContents(std::unique_ptr *builder, const std::vector &keys, const std::vector &values, const int prefix_group_size = 1) { - builder->reset( - new BlockBuilder(1 /* restart interval */, BytewiseComparator())); + builder->reset(new BlockBuilder(1 /* restart interval */)); // Add only half of the keys for (size_t i = 0; i < keys.size(); ++i) { @@ -144,7 +142,6 @@ BlockContents GetBlockContents(std::unique_ptr *builder, BlockContents contents; contents.data = rawblock; contents.cachable = false; - contents.heap_allocated = false; return contents; } @@ -154,8 +151,10 @@ void CheckBlockContents(BlockContents contents, const int max_key, const std::vector &values) { const size_t prefix_size = 6; // create block reader - Block reader1(contents); - Block reader2(contents); + BlockContents contents_ref(contents.data, contents.cachable, + contents.compression_type); + Block reader1(std::move(contents)); + Block reader2(std::move(contents_ref)); std::unique_ptr prefix_extractor( NewFixedPrefixTransform(prefix_size)); @@ -172,7 +171,7 @@ void CheckBlockContents(BlockContents contents, const int 
max_key, } std::unique_ptr hash_iter( - reader1.NewIterator(BytewiseComparator())); + reader1.NewIterator(BytewiseComparator(), nullptr, false)); std::unique_ptr regular_iter( reader2.NewIterator(BytewiseComparator())); @@ -213,7 +212,7 @@ TEST(BlockTest, SimpleIndexHash) { std::unique_ptr builder; auto contents = GetBlockContents(&builder, keys, values); - CheckBlockContents(contents, kMaxKey, keys, values); + CheckBlockContents(std::move(contents), kMaxKey, keys, values); } TEST(BlockTest, IndexHashWithSharedPrefix) { @@ -232,7 +231,7 @@ TEST(BlockTest, IndexHashWithSharedPrefix) { std::unique_ptr builder; auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup); - CheckBlockContents(contents, kMaxKey, keys, values); + CheckBlockContents(std::move(contents), kMaxKey, keys, values); } } // namespace rocksdb diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index d2f5b7a8d7f..51c80d9df7b 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -16,6 +16,7 @@ #include "rocksdb/env.h" #include "rocksdb/table.h" #include "table/block_builder.h" +#include "table/cuckoo_table_factory.h" #include "table/format.h" #include "table/meta_blocks.h" #include "util/autovector.h" @@ -24,32 +25,41 @@ namespace rocksdb { const std::string CuckooTablePropertyNames::kEmptyKey = "rocksdb.cuckoo.bucket.empty.key"; -const std::string CuckooTablePropertyNames::kNumHashTable = +const std::string CuckooTablePropertyNames::kNumHashFunc = "rocksdb.cuckoo.hash.num"; -const std::string CuckooTablePropertyNames::kMaxNumBuckets = - "rocksdb.cuckoo.bucket.maxnum"; +const std::string CuckooTablePropertyNames::kHashTableSize = + "rocksdb.cuckoo.hash.size"; const std::string CuckooTablePropertyNames::kValueLength = "rocksdb.cuckoo.value.length"; const std::string CuckooTablePropertyNames::kIsLastLevel = "rocksdb.cuckoo.file.islastlevel"; +const std::string CuckooTablePropertyNames::kCuckooBlockSize = + 
"rocksdb.cuckoo.hash.cuckooblocksize"; +const std::string CuckooTablePropertyNames::kIdentityAsFirstHash = + "rocksdb.cuckoo.hash.identityfirst"; // Obtained by running echo rocksdb.table.cuckoo | sha1sum extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull; CuckooTableBuilder::CuckooTableBuilder( - WritableFile* file, double hash_table_ratio, + WritableFile* file, double max_hash_table_ratio, uint32_t max_num_hash_table, uint32_t max_search_depth, + const Comparator* user_comparator, uint32_t cuckoo_block_size, + bool identity_as_first_hash, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) - : num_hash_table_(2), + : num_hash_func_(2), file_(file), - hash_table_ratio_(hash_table_ratio), - max_num_hash_table_(max_num_hash_table), + max_hash_table_ratio_(max_hash_table_ratio), + max_num_hash_func_(max_num_hash_table), max_search_depth_(max_search_depth), + cuckoo_block_size_(std::max(1U, cuckoo_block_size)), + hash_table_size_(2), is_last_level_file_(false), has_seen_first_key_(false), + ucomp_(user_comparator), + identity_as_first_hash_(identity_as_first_hash), get_slice_hash_(get_slice_hash), closed_(false) { - properties_.num_entries = 0; // Data is in a huge block. properties_.num_data_blocks = 1; properties_.index_size = 0; @@ -57,7 +67,7 @@ CuckooTableBuilder::CuckooTableBuilder( } void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { - if (properties_.num_entries >= kMaxVectorIdx - 1) { + if (kvs_.size() >= kMaxVectorIdx - 1) { status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1"); return; } @@ -73,6 +83,8 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { if (!has_seen_first_key_) { is_last_level_file_ = ikey.sequence == 0; has_seen_first_key_ = true; + smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); + largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); } // Even if one sequence number is non-zero, then it is not last level. 
assert(!is_last_level_file_ || ikey.sequence == 0); @@ -83,28 +95,24 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { kvs_.emplace_back(std::make_pair(key.ToString(), value.ToString())); } - properties_.num_entries++; - - // We assume that the keys are inserted in sorted order as determined by - // Byte-wise comparator. To identify an unused key, which will be used in - // filling empty buckets in the table, we try to find gaps between successive - // keys inserted (ie, latest key and previous in kvs_). - if (unused_user_key_.empty() && kvs_.size() > 1) { - std::string prev_key = is_last_level_file_ ? kvs_[kvs_.size()-1].first - : ExtractUserKey(kvs_[kvs_.size()-1].first).ToString(); - std::string new_user_key = prev_key; - new_user_key.back()++; - // We ignore carry-overs and check that it is larger than previous key. - if (Slice(new_user_key).compare(Slice(prev_key)) > 0 && - Slice(new_user_key).compare(ikey.user_key) < 0) { - unused_user_key_ = new_user_key; - } + // In order to fill the empty buckets in the hash table, we identify a + // key which is not used so far (unused_user_key). We determine this by + // maintaining smallest and largest keys inserted so far in bytewise order + // and use them to find a key outside this range in Finish() operation. + // Note that this strategy is independent of user comparator used here. 
+ if (ikey.user_key.compare(smallest_user_key_) < 0) { + smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); + } else if (ikey.user_key.compare(largest_user_key_) > 0) { + largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); + } + if (hash_table_size_ < kvs_.size() / max_hash_table_ratio_) { + hash_table_size_ *= 2; } } Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { - uint64_t num_buckets = kvs_.size() / hash_table_ratio_; - buckets->resize(num_buckets); + uint64_t hash_table_size_minus_one = hash_table_size_ - 1; + buckets->resize(hash_table_size_minus_one + cuckoo_block_size_); uint64_t make_space_for_key_call_id = 0; for (uint32_t vector_idx = 0; vector_idx < kvs_.size(); vector_idx++) { uint64_t bucket_id; @@ -112,39 +120,50 @@ Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { autovector hash_vals; Slice user_key = is_last_level_file_ ? kvs_[vector_idx].first : ExtractUserKey(kvs_[vector_idx].first); - for (uint32_t hash_cnt = 0; hash_cnt < num_hash_table_; ++hash_cnt) { - uint64_t hash_val = get_slice_hash_(user_key, hash_cnt, num_buckets); - if ((*buckets)[hash_val].vector_idx == kMaxVectorIdx) { - bucket_id = hash_val; - bucket_found = true; - break; - } else { - if (user_key.compare(is_last_level_file_ - ? Slice(kvs_[(*buckets)[hash_val].vector_idx].first) - : ExtractUserKey( - kvs_[(*buckets)[hash_val].vector_idx].first)) == 0) { - return Status::NotSupported("Same key is being inserted again."); + for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found; + ++hash_cnt) { + uint64_t hash_val = CuckooHash(user_key, hash_cnt, + hash_table_size_minus_one, identity_as_first_hash_, get_slice_hash_); + // If there is a collision, check next cuckoo_block_size_ locations for + // empty locations. While checking, if we reach end of the hash table, + // stop searching and proceed for next hash function. 
+ for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; + ++block_idx, ++hash_val) { + if ((*buckets)[hash_val].vector_idx == kMaxVectorIdx) { + bucket_id = hash_val; + bucket_found = true; + break; + } else { + if (ucomp_->Compare(user_key, is_last_level_file_ + ? Slice(kvs_[(*buckets)[hash_val].vector_idx].first) + : ExtractUserKey( + kvs_[(*buckets)[hash_val].vector_idx].first)) == 0) { + return Status::NotSupported("Same key is being inserted again."); + } + hash_vals.push_back(hash_val); } - hash_vals.push_back(hash_val); } } while (!bucket_found && !MakeSpaceForKey(hash_vals, ++make_space_for_key_call_id, buckets, &bucket_id)) { // Rehash by increashing number of hash tables. - if (num_hash_table_ >= max_num_hash_table_) { - return Status::NotSupported("Too many collissions. Unable to hash."); + if (num_hash_func_ >= max_num_hash_func_) { + return Status::NotSupported("Too many collisions. Unable to hash."); } // We don't really need to rehash the entire table because old hashes are // still valid and we only increased the number of hash functions. 
- uint64_t hash_val = get_slice_hash_(user_key, - num_hash_table_, num_buckets); - ++num_hash_table_; - if ((*buckets)[hash_val].vector_idx == kMaxVectorIdx) { - bucket_found = true; - bucket_id = hash_val; - break; - } else { - hash_vals.push_back(hash_val); + uint64_t hash_val = CuckooHash(user_key, num_hash_func_, + hash_table_size_minus_one, identity_as_first_hash_, get_slice_hash_); + ++num_hash_func_; + for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; + ++block_idx, ++hash_val) { + if ((*buckets)[hash_val].vector_idx == kMaxVectorIdx) { + bucket_found = true; + bucket_id = hash_val; + break; + } else { + hash_vals.push_back(hash_val); + } } } (*buckets)[bucket_id].vector_idx = vector_idx; @@ -156,38 +175,46 @@ Status CuckooTableBuilder::Finish() { assert(!closed_); closed_ = true; std::vector buckets; - Status s = MakeHashTable(&buckets); - if (!s.ok()) { - return s; - } - if (unused_user_key_.empty() && !kvs_.empty()) { - // Try to find the key next to last key by handling carryovers. - std::string last_key = - is_last_level_file_ ? kvs_[kvs_.size()-1].first - : ExtractUserKey(kvs_[kvs_.size()-1].first).ToString(); - std::string new_user_key = last_key; - int curr_pos = new_user_key.size() - 1; + Status s; + std::string unused_bucket; + if (!kvs_.empty()) { + s = MakeHashTable(&buckets); + if (!s.ok()) { + return s; + } + // Determine unused_user_key to fill empty buckets. + std::string unused_user_key = smallest_user_key_; + int curr_pos = unused_user_key.size() - 1; while (curr_pos >= 0) { - ++new_user_key[curr_pos]; - if (new_user_key > last_key) { - unused_user_key_ = new_user_key; + --unused_user_key[curr_pos]; + if (Slice(unused_user_key).compare(smallest_user_key_) < 0) { break; } --curr_pos; } + if (curr_pos < 0) { + // Try using the largest key to identify an unused key. 
+ unused_user_key = largest_user_key_; + curr_pos = unused_user_key.size() - 1; + while (curr_pos >= 0) { + ++unused_user_key[curr_pos]; + if (Slice(unused_user_key).compare(largest_user_key_) > 0) { + break; + } + --curr_pos; + } + } if (curr_pos < 0) { return Status::Corruption("Unable to find unused key"); } - } - std::string unused_bucket; - if (!kvs_.empty()) { if (is_last_level_file_) { - unused_bucket = unused_user_key_; + unused_bucket = unused_user_key; } else { - ParsedInternalKey ikey(unused_user_key_, 0, kTypeValue); + ParsedInternalKey ikey(unused_user_key, 0, kTypeValue); AppendInternalKey(&unused_bucket, ikey); } } + properties_.num_entries = kvs_.size(); properties_.fixed_key_len = unused_bucket.size(); uint32_t value_length = kvs_.empty() ? 0 : kvs_[0].second.size(); uint32_t bucket_size = value_length + properties_.fixed_key_len; @@ -222,16 +249,26 @@ Status CuckooTableBuilder::Finish() { properties_.user_collected_properties[ CuckooTablePropertyNames::kEmptyKey] = unused_bucket; properties_.user_collected_properties[ - CuckooTablePropertyNames::kNumHashTable].assign( - reinterpret_cast(&num_hash_table_), sizeof(num_hash_table_)); - uint64_t num_buckets = buckets.size(); + CuckooTablePropertyNames::kNumHashFunc].assign( + reinterpret_cast(&num_hash_func_), sizeof(num_hash_func_)); + + uint64_t hash_table_size = buckets.size() - cuckoo_block_size_ + 1; properties_.user_collected_properties[ - CuckooTablePropertyNames::kMaxNumBuckets].assign( - reinterpret_cast(&num_buckets), sizeof(num_buckets)); + CuckooTablePropertyNames::kHashTableSize].assign( + reinterpret_cast(&hash_table_size), + sizeof(hash_table_size)); properties_.user_collected_properties[ CuckooTablePropertyNames::kIsLastLevel].assign( reinterpret_cast(&is_last_level_file_), sizeof(is_last_level_file_)); + properties_.user_collected_properties[ + CuckooTablePropertyNames::kCuckooBlockSize].assign( + reinterpret_cast(&cuckoo_block_size_), + sizeof(cuckoo_block_size_)); + 
properties_.user_collected_properties[ + CuckooTablePropertyNames::kIdentityAsFirstHash].assign( + reinterpret_cast(&identity_as_first_hash_), + sizeof(identity_as_first_hash_)); // Write meta blocks. MetaIndexBuilder meta_index_builder; @@ -275,20 +312,26 @@ void CuckooTableBuilder::Abandon() { } uint64_t CuckooTableBuilder::NumEntries() const { - return properties_.num_entries; + return kvs_.size(); } uint64_t CuckooTableBuilder::FileSize() const { if (closed_) { return file_->GetFileSize(); - } else if (properties_.num_entries == 0) { + } else if (kvs_.size() == 0) { return 0; } - // This is not the actual size of the file as we need to account for - // hash table ratio. This returns the size of filled buckets in the table - // scaled up by a factor of 1/hash_table_ratio. - return ((kvs_[0].first.size() + kvs_[0].second.size()) * - properties_.num_entries) / hash_table_ratio_; + + // Account for buckets being a power of two. + // As elements are added, file size remains constant for a while and doubles + // its size. Since compaction algorithm stops adding elements only after it + // exceeds the file limit, we account for the extra element being added here. + uint64_t expected_hash_table_size = hash_table_size_; + if (expected_hash_table_size < (kvs_.size() + 1) / max_hash_table_ratio_) { + expected_hash_table_size *= 2; + } + return (kvs_[0].first.size() + kvs_[0].second.size()) * + expected_hash_table_size - 1; } // This method is invoked when there is no place to insert the target key. @@ -318,17 +361,19 @@ bool CuckooTableBuilder::MakeSpaceForKey( std::vector tree; // We want to identify already visited buckets in the current method call so // that we don't add same buckets again for exploration in the tree. - // We do this by maintaining a count of current method call, which acts as a - // unique id for this invocation of the method. We store this number into - // the nodes that we explore in current method call. 
+ // We do this by maintaining a count of current method call in + // make_space_for_key_call_id, which acts as a unique id for this invocation + // of the method. We store this number into the nodes that we explore in + // current method call. // It is unlikely for the increment operation to overflow because the maximum - // no. of times this will be called is <= max_num_hash_table_ + kvs_.size(). - for (uint32_t hash_cnt = 0; hash_cnt < num_hash_table_; ++hash_cnt) { + // no. of times this will be called is <= max_num_hash_func_ + kvs_.size(). + for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { uint64_t bucket_id = hash_vals[hash_cnt]; (*buckets)[bucket_id].make_space_for_key_call_id = make_space_for_key_call_id; tree.push_back(CuckooNode(bucket_id, 0, 0)); } + uint64_t hash_table_size_minus_one = hash_table_size_ - 1; bool null_found = false; uint32_t curr_pos = 0; while (!null_found && curr_pos < tree.size()) { @@ -338,22 +383,28 @@ bool CuckooTableBuilder::MakeSpaceForKey( break; } CuckooBucket& curr_bucket = (*buckets)[curr_node.bucket_id]; - for (uint32_t hash_cnt = 0; hash_cnt < num_hash_table_; ++hash_cnt) { - uint64_t child_bucket_id = get_slice_hash_( - is_last_level_file_ ? kvs_[curr_bucket.vector_idx].first - : ExtractUserKey(Slice(kvs_[curr_bucket.vector_idx].first)), - hash_cnt, buckets->size()); - if ((*buckets)[child_bucket_id].make_space_for_key_call_id == - make_space_for_key_call_id) { - continue; - } - (*buckets)[child_bucket_id].make_space_for_key_call_id = - make_space_for_key_call_id; - tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1, - curr_pos)); - if ((*buckets)[child_bucket_id].vector_idx == kMaxVectorIdx) { - null_found = true; - break; + for (uint32_t hash_cnt = 0; + hash_cnt < num_hash_func_ && !null_found; ++hash_cnt) { + uint64_t child_bucket_id = CuckooHash( + (is_last_level_file_ ? 
kvs_[curr_bucket.vector_idx].first : + ExtractUserKey(Slice(kvs_[curr_bucket.vector_idx].first))), + hash_cnt, hash_table_size_minus_one, identity_as_first_hash_, + get_slice_hash_); + // Iterate inside Cuckoo Block. + for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; + ++block_idx, ++child_bucket_id) { + if ((*buckets)[child_bucket_id].make_space_for_key_call_id == + make_space_for_key_call_id) { + continue; + } + (*buckets)[child_bucket_id].make_space_for_key_call_id = + make_space_for_key_call_id; + tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1, + curr_pos)); + if ((*buckets)[child_bucket_id].vector_idx == kMaxVectorIdx) { + null_found = true; + break; + } } } ++curr_pos; @@ -363,10 +414,10 @@ bool CuckooTableBuilder::MakeSpaceForKey( // There is an empty node in tree.back(). Now, traverse the path from this // empty node to top of the tree and at every node in the path, replace // child with the parent. Stop when first level is reached in the tree - // (happens when 0 <= bucket_to_replace_pos < num_hash_table_) and return + // (happens when 0 <= bucket_to_replace_pos < num_hash_func_) and return // this location in first level for target key to be inserted. 
uint32_t bucket_to_replace_pos = tree.size()-1; - while (bucket_to_replace_pos >= num_hash_table_) { + while (bucket_to_replace_pos >= num_hash_func_) { CuckooNode& curr_node = tree[bucket_to_replace_pos]; (*buckets)[curr_node.bucket_id] = (*buckets)[tree[curr_node.parent_pos].bucket_id]; diff --git a/table/cuckoo_table_builder.h b/table/cuckoo_table_builder.h index 7bc9f1d89a4..45cf49315ca 100644 --- a/table/cuckoo_table_builder.h +++ b/table/cuckoo_table_builder.h @@ -21,8 +21,10 @@ namespace rocksdb { class CuckooTableBuilder: public TableBuilder { public: CuckooTableBuilder( - WritableFile* file, double hash_table_ratio, uint32_t max_num_hash_table, - uint32_t max_search_depth, + WritableFile* file, double max_hash_table_ratio, + uint32_t max_num_hash_func, uint32_t max_search_depth, + const Comparator* user_comparator, uint32_t cuckoo_block_size, + bool identity_as_first_hash, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)); // REQUIRES: Either Finish() or Abandon() has been called. @@ -60,7 +62,7 @@ class CuckooTableBuilder: public TableBuilder { CuckooBucket() : vector_idx(kMaxVectorIdx), make_space_for_key_call_id(0) {} uint32_t vector_idx; - // This number will not exceed kvs_.size() + max_num_hash_table_. + // This number will not exceed kvs_.size() + max_num_hash_func_. // We assume number of items is <= 2^32. 
uint32_t make_space_for_key_call_id; }; @@ -73,19 +75,24 @@ class CuckooTableBuilder: public TableBuilder { uint64_t* bucket_id); Status MakeHashTable(std::vector* buckets); - uint32_t num_hash_table_; + uint32_t num_hash_func_; WritableFile* file_; - const double hash_table_ratio_; - const uint32_t max_num_hash_table_; + const double max_hash_table_ratio_; + const uint32_t max_num_hash_func_; const uint32_t max_search_depth_; + const uint32_t cuckoo_block_size_; + uint64_t hash_table_size_; bool is_last_level_file_; Status status_; std::vector> kvs_; TableProperties properties_; bool has_seen_first_key_; + const Comparator* ucomp_; + bool identity_as_first_hash_; uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index, uint64_t max_num_buckets); - std::string unused_user_key_ = ""; + std::string largest_user_key_ = ""; + std::string smallest_user_key_ = ""; bool closed_; // Either Finish() or Abandon() has been called. diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo_table_builder_test.cc index 64c6f76531a..d259507282f 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo_table_builder_test.cc @@ -37,8 +37,9 @@ class CuckooBuilderTest { void CheckFileContents(const std::vector& keys, const std::vector& values, const std::vector& expected_locations, - std::string expected_unused_bucket, uint64_t expected_max_buckets, - uint32_t expected_num_hash_fun, bool expected_is_last_level) { + std::string expected_unused_bucket, uint64_t expected_table_size, + uint32_t expected_num_hash_func, bool expected_is_last_level, + uint32_t expected_cuckoo_block_size = 1) { // Read file unique_ptr read_file; ASSERT_OK(env_->NewRandomAccessFile(fname, &read_file, env_options_)); @@ -51,7 +52,8 @@ class CuckooBuilderTest { kCuckooTableMagicNumber, env_, nullptr, &props)); ASSERT_EQ(props->num_entries, keys.size()); ASSERT_EQ(props->fixed_key_len, keys.empty() ? 
0 : keys[0].size()); - ASSERT_EQ(props->data_size, keys.size()*expected_unused_bucket.size()); + ASSERT_EQ(props->data_size, expected_unused_bucket.size() * + (expected_table_size + expected_cuckoo_block_size - 1)); ASSERT_EQ(props->raw_key_size, keys.size()*props->fixed_key_len); // Check unused bucket. @@ -65,14 +67,18 @@ class CuckooBuilderTest { CuckooTablePropertyNames::kValueLength].data()); ASSERT_EQ(values.empty() ? 0 : values[0].size(), value_len_found); ASSERT_EQ(props->raw_value_size, values.size()*value_len_found); - const uint64_t max_buckets = + const uint64_t table_size = *reinterpret_cast(props->user_collected_properties[ - CuckooTablePropertyNames::kMaxNumBuckets].data()); - ASSERT_EQ(expected_max_buckets, max_buckets); - const uint32_t num_hash_fun_found = + CuckooTablePropertyNames::kHashTableSize].data()); + ASSERT_EQ(expected_table_size, table_size); + const uint32_t num_hash_func_found = *reinterpret_cast(props->user_collected_properties[ - CuckooTablePropertyNames::kNumHashTable].data()); - ASSERT_EQ(expected_num_hash_fun, num_hash_fun_found); + CuckooTablePropertyNames::kNumHashFunc].data()); + ASSERT_EQ(expected_num_hash_func, num_hash_func_found); + const uint32_t cuckoo_block_size = + *reinterpret_cast(props->user_collected_properties[ + CuckooTablePropertyNames::kCuckooBlockSize].data()); + ASSERT_EQ(expected_cuckoo_block_size, cuckoo_block_size); const bool is_last_level_found = *reinterpret_cast(props->user_collected_properties[ CuckooTablePropertyNames::kIsLastLevel].data()); @@ -82,7 +88,7 @@ class CuckooBuilderTest { // Check contents of the bucket. 
std::vector keys_found(keys.size(), false); uint32_t bucket_size = expected_unused_bucket.size(); - for (uint32_t i = 0; i < max_buckets; ++i) { + for (uint32_t i = 0; i < table_size + cuckoo_block_size - 1; ++i) { Slice read_slice; ASSERT_OK(read_file->Read(i*bucket_size, bucket_size, &read_slice, nullptr)); @@ -108,6 +114,14 @@ class CuckooBuilderTest { return ikey.GetKey().ToString(); } + uint64_t NextPowOf2(uint64_t num) { + uint64_t n = 2; + while (n <= num) { + n *= 2; + } + return n; + } + Env* env_; EnvOptions env_options_; std::string fname; @@ -116,11 +130,12 @@ class CuckooBuilderTest { TEST(CuckooBuilderTest, SuccessWithEmptyFile) { unique_ptr writable_file; - fname = test::TmpDir() + "/NoCollisionFullKey"; + fname = test::TmpDir() + "/EmptyFile"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - 4, 100, GetSliceHash); + 4, 100, BytewiseComparator(), 1, false, GetSliceHash); ASSERT_OK(builder.status()); + ASSERT_EQ(0UL, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); CheckFileContents({}, {}, {}, "", 0, 2, false); @@ -141,26 +156,29 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/NoCollisionFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = keys[0].size() + values[0].size(); + 
ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_max_buckets = keys.size() / kHashTableRatio; - std::string expected_unused_bucket = GetInternalKey("key05", true); + std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, - expected_unused_bucket, expected_max_buckets, 2, false); + expected_unused_bucket, expected_table_size, 2, false); } TEST(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { @@ -178,26 +196,71 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_max_buckets = keys.size() / kHashTableRatio; - std::string expected_unused_bucket = GetInternalKey("key05", true); + std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); 
CheckFileContents(keys, values, expected_locations, - expected_unused_bucket, expected_max_buckets, 4, false); + expected_unused_bucket, expected_table_size, 4, false); +} + +TEST(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { + uint32_t num_hash_fun = 4; + std::vector user_keys = {"key01", "key02", "key03", "key04"}; + std::vector values = {"v01", "v02", "v03", "v04"}; + hash_map = { + {user_keys[0], {0, 1, 2, 3}}, + {user_keys[1], {0, 1, 2, 3}}, + {user_keys[2], {0, 1, 2, 3}}, + {user_keys[3], {0, 1, 2, 3}}, + }; + std::vector expected_locations = {0, 1, 2, 3}; + std::vector keys; + for (auto& user_key : user_keys) { + keys.push_back(GetInternalKey(user_key, false)); + } + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); + + unique_ptr writable_file; + uint32_t cuckoo_block_size = 2; + fname = test::TmpDir() + "/WithCollisionFullKey2"; + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); + CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, + num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, false, + GetSliceHash); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(keys[i]), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + uint32_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = GetInternalKey("key00", true); + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(keys, values, expected_locations, + expected_unused_bucket, expected_table_size, 3, false, cuckoo_block_size); } TEST(CuckooBuilderTest, WithCollisionPathFullKey) { @@ -220,26 +283,71 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKey) { for (auto& 
user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, GetSliceHash); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(keys[i]), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + uint32_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = GetInternalKey("key00", true); + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(keys, values, expected_locations, + expected_unused_bucket, expected_table_size, 2, false); +} + +TEST(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { + uint32_t num_hash_fun = 2; + std::vector user_keys = {"key01", "key02", "key03", + "key04", "key05"}; + std::vector values = {"v01", "v02", "v03", "v04", "v05"}; + hash_map = { + {user_keys[0], {0, 1}}, + {user_keys[1], {1, 2}}, + {user_keys[2], {3, 4}}, + {user_keys[3], {4, 5}}, + {user_keys[4], {0, 3}}, + }; + std::vector expected_locations = {2, 1, 3, 4, 0}; + std::vector keys; + for (auto& user_key : user_keys) { + keys.push_back(GetInternalKey(user_key, false)); + } + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); + + unique_ptr writable_file; + fname = test::TmpDir() + "/WithCollisionPathFullKeyAndCuckooBlock"; + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); + 
CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, + num_hash_fun, 100, BytewiseComparator(), 2, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_max_buckets = keys.size() / kHashTableRatio; - std::string expected_unused_bucket = GetInternalKey("key06", true); + std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, - expected_unused_bucket, expected_max_buckets, 2, false); + expected_unused_bucket, expected_table_size, 2, false, 2); } TEST(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { @@ -253,26 +361,29 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { {user_keys[3], {3, 4, 5, 6}} }; std::vector expected_locations = {0, 1, 2, 3}; + uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/NoCollisionUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = user_keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, 
builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_max_buckets = user_keys.size() / kHashTableRatio; - std::string expected_unused_bucket = "key05"; + std::string expected_unused_bucket = "key00"; expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(user_keys, values, expected_locations, - expected_unused_bucket, expected_max_buckets, 2, true); + expected_unused_bucket, expected_table_size, 2, true); } TEST(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { @@ -286,26 +397,29 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { {user_keys[3], {0, 1, 2, 3}}, }; std::vector expected_locations = {0, 1, 2, 3}; + uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = user_keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_max_buckets = user_keys.size() / kHashTableRatio; - std::string expected_unused_bucket = "key05"; + std::string expected_unused_bucket = "key00"; expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(user_keys, values, expected_locations, - expected_unused_bucket, expected_max_buckets, 
4, true); + expected_unused_bucket, expected_table_size, 4, true); } TEST(CuckooBuilderTest, WithCollisionPathUserKey) { @@ -321,26 +435,29 @@ TEST(CuckooBuilderTest, WithCollisionPathUserKey) { {user_keys[4], {0, 2}}, }; std::vector expected_locations = {0, 1, 3, 4, 2}; + uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 2, GetSliceHash); + num_hash_fun, 2, BytewiseComparator(), 1, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = user_keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_max_buckets = user_keys.size() / kHashTableRatio; - std::string expected_unused_bucket = "key06"; + std::string expected_unused_bucket = "key00"; expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(user_keys, values, expected_locations, - expected_unused_bucket, expected_max_buckets, 2, true); + expected_unused_bucket, expected_table_size, 2, true); } TEST(CuckooBuilderTest, FailWhenCollisionPathTooLong) { @@ -362,7 +479,7 @@ TEST(CuckooBuilderTest, FailWhenCollisionPathTooLong) { fname = test::TmpDir() + "/WithCollisionPathUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 2, GetSliceHash); + num_hash_fun, 2, BytewiseComparator(), 1, false, GetSliceHash); 
ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], false)), Slice("value")); @@ -382,7 +499,7 @@ TEST(CuckooBuilderTest, FailWhenSameKeyInserted) { fname = test::TmpDir() + "/FailWhenSameKeyInserted"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, GetSliceHash); ASSERT_OK(builder.status()); builder.Add(Slice(GetInternalKey(user_key, false)), Slice("value1")); diff --git a/table/cuckoo_table_factory.cc b/table/cuckoo_table_factory.cc index fac201cbdfa..18db54ed785 100644 --- a/table/cuckoo_table_factory.cc +++ b/table/cuckoo_table_factory.cc @@ -9,34 +9,15 @@ #include "db/dbformat.h" #include "table/cuckoo_table_builder.h" #include "table/cuckoo_table_reader.h" -#include "util/murmurhash.h" namespace rocksdb { -extern const uint32_t kMaxNumHashTable = 64; - -extern uint64_t GetSliceMurmurHash(const Slice& s, uint32_t index, - uint64_t max_num_buckets) { - static constexpr uint32_t seeds[kMaxNumHashTable] = { - 816922183, 506425713, 949485004, 22513986, 421427259, 500437285, - 888981693, 847587269, 511007211, 722295391, 934013645, 566947683, - 193618736, 428277388, 770956674, 819994962, 755946528, 40807421, - 263144466, 241420041, 444294464, 731606396, 304158902, 563235655, - 968740453, 336996831, 462831574, 407970157, 985877240, 637708754, - 736932700, 205026023, 755371467, 729648411, 807744117, 46482135, - 847092855, 620960699, 102476362, 314094354, 625838942, 550889395, - 639071379, 834567510, 397667304, 151945969, 443634243, 196618243, - 421986347, 407218337, 964502417, 327741231, 493359459, 452453139, - 692216398, 108161624, 816246924, 234779764, 618949448, 496133787, - 156374056, 316589799, 982915425, 553105889 }; - return MurmurHash(s.data(), s.size(), seeds[index]) % max_num_buckets; -} -Status 
CuckooTableFactory::NewTableReader(const Options& options, - const EnvOptions& soptions, const InternalKeyComparator& icomp, +Status CuckooTableFactory::NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& icomp, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table) const { - std::unique_ptr new_reader(new CuckooTableReader(options, - std::move(file), file_size, GetSliceMurmurHash)); + std::unique_ptr new_reader(new CuckooTableReader(ioptions, + std::move(file), file_size, icomp.user_comparator(), nullptr)); Status s = new_reader->status(); if (s.ok()) { *table = std::move(new_reader); @@ -45,15 +26,39 @@ Status CuckooTableFactory::NewTableReader(const Options& options, } TableBuilder* CuckooTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const { - return new CuckooTableBuilder(file, hash_table_ratio_, kMaxNumHashTable, - max_search_depth_, GetSliceMurmurHash); + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType, + const CompressionOptions&) const { + return new CuckooTableBuilder(file, table_options_.hash_table_ratio, 64, + table_options_.max_search_depth, internal_comparator.user_comparator(), + table_options_.cuckoo_block_size, table_options_.identity_as_first_hash, + nullptr); +} + +std::string CuckooTableFactory::GetPrintableTableOptions() const { + std::string ret; + ret.reserve(2000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n", + table_options_.hash_table_ratio); + ret.append(buffer); + snprintf(buffer, kBufferSize, " max_search_depth: %u\n", + table_options_.max_search_depth); + ret.append(buffer); + snprintf(buffer, kBufferSize, " cuckoo_block_size: %u\n", + table_options_.cuckoo_block_size); + ret.append(buffer); 
+ snprintf(buffer, kBufferSize, " identity_as_first_hash: %d\n", + table_options_.identity_as_first_hash); + ret.append(buffer); + return ret; } -TableFactory* NewCuckooTableFactory(double hash_table_ratio, - uint32_t max_search_depth) { - return new CuckooTableFactory(hash_table_ratio, max_search_depth); +TableFactory* NewCuckooTableFactory(const CuckooTableOptions& table_options) { + return new CuckooTableFactory(table_options); } } // namespace rocksdb diff --git a/table/cuckoo_table_factory.h b/table/cuckoo_table_factory.h index a4b670a1fd2..7b2f32ce327 100644 --- a/table/cuckoo_table_factory.h +++ b/table/cuckoo_table_factory.h @@ -6,12 +6,31 @@ #pragma once #ifndef ROCKSDB_LITE +#include #include "rocksdb/table.h" +#include "util/murmurhash.h" +#include "rocksdb/options.h" namespace rocksdb { -extern uint64_t GetSliceMurmurHash(const Slice& s, uint32_t index, - uint64_t max_num_buckets); +const uint32_t kCuckooMurmurSeedMultiplier = 816922183; +static inline uint64_t CuckooHash( + const Slice& user_key, uint32_t hash_cnt, uint64_t table_size_minus_one, + bool identity_as_first_hash, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) { +#ifndef NDEBUG + // This part is used only in unit tests. + if (get_slice_hash != nullptr) { + return get_slice_hash(user_key, hash_cnt, table_size_minus_one + 1); + } +#endif + if (hash_cnt == 0 && identity_as_first_hash) { + return (*reinterpret_cast(user_key.data())) & + table_size_minus_one; + } + return MurmurHash(user_key.data(), user_key.size(), + kCuckooMurmurSeedMultiplier * hash_cnt) & table_size_minus_one; +} // Cuckoo Table is designed for applications that require fast point lookups // but not fast range scans. @@ -20,29 +39,33 @@ extern uint64_t GetSliceMurmurHash(const Slice& s, uint32_t index, // - Key length and Value length are fixed. // - Does not support Snapshot. // - Does not support Merge operations. -// - Only supports Bytewise comparators. 
class CuckooTableFactory : public TableFactory { public: - CuckooTableFactory(double hash_table_ratio, uint32_t max_search_depth) - : hash_table_ratio_(hash_table_ratio), - max_search_depth_(max_search_depth) {} + explicit CuckooTableFactory(const CuckooTableOptions& table_options) + : table_options_(table_options) {} ~CuckooTableFactory() {} const char* Name() const override { return "CuckooTable"; } Status NewTableReader( - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const override; - TableBuilder* NewTableBuilder(const Options& options, + TableBuilder* NewTableBuilder(const ImmutableCFOptions& options, const InternalKeyComparator& icomparator, WritableFile* file, - CompressionType compression_type) const override; + const CompressionType, const CompressionOptions&) const override; + + // Sanitizes the specified DB Options. 
+ Status SanitizeDBOptions(const DBOptions* db_opts) const override { + return Status::OK(); + } + + std::string GetPrintableTableOptions() const override; private: - const double hash_table_ratio_; - const uint32_t max_search_depth_; + const CuckooTableOptions table_options_; }; } // namespace rocksdb diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index bdf58c6794a..63b8a2c8c14 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -17,40 +17,46 @@ #include #include "rocksdb/iterator.h" #include "table/meta_blocks.h" +#include "table/cuckoo_table_factory.h" #include "util/arena.h" #include "util/coding.h" namespace rocksdb { +namespace { + static const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1); +} extern const uint64_t kCuckooTableMagicNumber; CuckooTableReader::CuckooTableReader( - const Options& options, + const ImmutableCFOptions& ioptions, std::unique_ptr&& file, uint64_t file_size, + const Comparator* comparator, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) : file_(std::move(file)), + ucomp_(comparator), get_slice_hash_(get_slice_hash) { - if (!options.allow_mmap_reads) { + if (!ioptions.allow_mmap_reads) { status_ = Status::InvalidArgument("File is not mmaped"); } TableProperties* props = nullptr; status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, - options.env, options.info_log.get(), &props); + ioptions.env, ioptions.info_log, &props); if (!status_.ok()) { return; } table_props_.reset(props); auto& user_props = props->user_collected_properties; - auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashTable); + auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc); if (hash_funs == user_props.end()) { - status_ = Status::InvalidArgument("Number of hash functions not found"); + status_ = Status::Corruption("Number of hash functions not found"); return; } - num_hash_fun_ = *reinterpret_cast(hash_funs->second.data()); 
+ num_hash_func_ = *reinterpret_cast(hash_funs->second.data()); auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey); if (unused_key == user_props.end()) { - status_ = Status::InvalidArgument("Empty bucket value not found"); + status_ = Status::Corruption("Empty bucket value not found"); return; } unused_key_ = unused_key->second; @@ -58,25 +64,46 @@ CuckooTableReader::CuckooTableReader( key_length_ = props->fixed_key_len; auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength); if (value_length == user_props.end()) { - status_ = Status::InvalidArgument("Value length not found"); + status_ = Status::Corruption("Value length not found"); return; } value_length_ = *reinterpret_cast( value_length->second.data()); bucket_length_ = key_length_ + value_length_; - auto num_buckets = user_props.find(CuckooTablePropertyNames::kMaxNumBuckets); - if (num_buckets == user_props.end()) { - status_ = Status::InvalidArgument("Num buckets not found"); + auto hash_table_size = user_props.find( + CuckooTablePropertyNames::kHashTableSize); + if (hash_table_size == user_props.end()) { + status_ = Status::Corruption("Hash table size not found"); return; } - num_buckets_ = *reinterpret_cast(num_buckets->second.data()); + table_size_minus_one_ = *reinterpret_cast( + hash_table_size->second.data()) - 1; auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel); if (is_last_level == user_props.end()) { - status_ = Status::InvalidArgument("Is last level not found"); + status_ = Status::Corruption("Is last level not found"); return; } is_last_level_ = *reinterpret_cast(is_last_level->second.data()); + + auto identity_as_first_hash = user_props.find( + CuckooTablePropertyNames::kIdentityAsFirstHash); + if (identity_as_first_hash == user_props.end()) { + status_ = Status::Corruption("identity as first hash not found"); + return; + } + identity_as_first_hash_ = *reinterpret_cast( + identity_as_first_hash->second.data()); + + auto 
cuckoo_block_size = user_props.find( + CuckooTablePropertyNames::kCuckooBlockSize); + if (cuckoo_block_size == user_props.end()) { + status_ = Status::Corruption("Cuckoo block size not found"); + return; + } + cuckoo_block_size_ = *reinterpret_cast( + cuckoo_block_size->second.data()); + cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1; status_ = file_->Read(0, file_size, &file_data_, nullptr); } @@ -85,42 +112,53 @@ Status CuckooTableReader::Get( bool (*result_handler)(void* arg, const ParsedInternalKey& k, const Slice& v), void (*mark_key_may_exist_handler)(void* handle_context)) { - ParsedInternalKey ikey; - if (!ParseInternalKey(key, &ikey)) { - return Status::Corruption("Unable to parse key into inernal key."); - } - if ((is_last_level_ && key.size() != key_length_ + 8) || - (!is_last_level_ && key.size() != key_length_)) { - return Status::InvalidArgument("Length of key is invalid."); - } - for (uint32_t hash_cnt = 0; hash_cnt < num_hash_fun_; ++hash_cnt) { - uint64_t hash_val = get_slice_hash_(ikey.user_key, hash_cnt, num_buckets_); - assert(hash_val < num_buckets_); - uint64_t offset = hash_val * bucket_length_; + assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0)); + Slice user_key = ExtractUserKey(key); + for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { + uint64_t offset = bucket_length_ * CuckooHash( + user_key, hash_cnt, table_size_minus_one_, identity_as_first_hash_, + get_slice_hash_); const char* bucket = &file_data_.data()[offset]; - if (unused_key_.compare(0, key_length_, bucket, key_length_) == 0) { - return Status::OK(); - } - // Here, we compare only the user key part as we support only one entry - // per user key and we don't support sanpshot. 
- if (ikey.user_key.compare(Slice(bucket, ikey.user_key.size())) == 0) { - Slice value = Slice(&bucket[key_length_], value_length_); - if (is_last_level_) { - ParsedInternalKey found_ikey(Slice(bucket, key_length_), 0, kTypeValue); - result_handler(handle_context, found_ikey, value); - } else { - Slice full_key(bucket, key_length_); - ParsedInternalKey found_ikey; - ParseInternalKey(full_key, &found_ikey); - result_handler(handle_context, found_ikey, value); + for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; + ++block_idx, bucket += bucket_length_) { + if (ucomp_->Compare(Slice(unused_key_.data(), user_key.size()), + Slice(bucket, user_key.size())) == 0) { + return Status::OK(); + } + // Here, we compare only the user key part as we support only one entry + // per user key and we don't support sanpshot. + if (ucomp_->Compare(user_key, Slice(bucket, user_key.size())) == 0) { + Slice value(bucket + key_length_, value_length_); + if (is_last_level_) { + ParsedInternalKey found_ikey( + Slice(bucket, key_length_), 0, kTypeValue); + result_handler(handle_context, found_ikey, value); + } else { + Slice full_key(bucket, key_length_); + ParsedInternalKey found_ikey; + ParseInternalKey(full_key, &found_ikey); + result_handler(handle_context, found_ikey, value); + } + // We don't support merge operations. So, we return here. + return Status::OK(); } - // We don't support merge operations. So, we return here. - return Status::OK(); } } return Status::OK(); } +void CuckooTableReader::Prepare(const Slice& key) { + // Prefetch the first Cuckoo Block. 
+ Slice user_key = ExtractUserKey(key); + uint64_t addr = reinterpret_cast(file_data_.data()) + + bucket_length_ * CuckooHash(user_key, 0, table_size_minus_one_, + identity_as_first_hash_, nullptr); + uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_; + for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) { + PREFETCH(reinterpret_cast(addr), 0, 3); + } +} + class CuckooTableIterator : public Iterator { public: explicit CuckooTableIterator(CuckooTableReader* reader); @@ -137,17 +175,29 @@ class CuckooTableIterator : public Iterator { void LoadKeysFromReader(); private: - struct { + struct CompareKeys { + CompareKeys(const Comparator* ucomp, const bool last_level) + : ucomp_(ucomp), + is_last_level_(last_level) {} bool operator()(const std::pair& first, const std::pair& second) const { - return first.first.compare(second.first) < 0; + if (is_last_level_) { + return ucomp_->Compare(first.first, second.first) < 0; + } else { + return ucomp_->Compare(ExtractUserKey(first.first), + ExtractUserKey(second.first)) < 0; + } } - } CompareKeys; + + private: + const Comparator* ucomp_; + const bool is_last_level_; + }; + const CompareKeys comparator_; void PrepareKVAtCurrIdx(); CuckooTableReader* reader_; Status status_; // Contains a map of keys to bucket_id sorted in key order. - // We assume byte-wise comparison for key ordering. std::vector> key_to_bucket_id_; // We assume that the number of items can be stored in uint32 (4 Billion). 
uint32_t curr_key_idx_; @@ -159,7 +209,8 @@ class CuckooTableIterator : public Iterator { }; CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader) - : reader_(reader), + : comparator_(reader->ucomp_, reader->is_last_level_), + reader_(reader), curr_key_idx_(std::numeric_limits::max()) { key_to_bucket_id_.clear(); curr_value_.clear(); @@ -168,7 +219,9 @@ CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader) void CuckooTableIterator::LoadKeysFromReader() { key_to_bucket_id_.reserve(reader_->GetTableProperties()->num_entries); - for (uint32_t bucket_id = 0; bucket_id < reader_->num_buckets_; bucket_id++) { + uint64_t num_buckets = reader_->table_size_minus_one_ + + reader_->cuckoo_block_size_; + for (uint32_t bucket_id = 0; bucket_id < num_buckets; bucket_id++) { Slice read_key; status_ = reader_->file_->Read(bucket_id * reader_->bucket_length_, reader_->key_length_, &read_key, nullptr); @@ -178,7 +231,7 @@ void CuckooTableIterator::LoadKeysFromReader() { } assert(key_to_bucket_id_.size() == reader_->GetTableProperties()->num_entries); - std::sort(key_to_bucket_id_.begin(), key_to_bucket_id_.end(), CompareKeys); + std::sort(key_to_bucket_id_.begin(), key_to_bucket_id_.end(), comparator_); curr_key_idx_ = key_to_bucket_id_.size(); } @@ -200,7 +253,7 @@ void CuckooTableIterator::Seek(const Slice& target) { auto seek_it = std::lower_bound(key_to_bucket_id_.begin(), key_to_bucket_id_.end(), std::make_pair(target_to_search, 0), - CompareKeys); + comparator_); curr_key_idx_ = std::distance(key_to_bucket_id_.begin(), seek_it); PrepareKVAtCurrIdx(); } @@ -263,10 +316,17 @@ Slice CuckooTableIterator::value() const { return curr_value_; } -Iterator* CuckooTableReader::NewIterator(const ReadOptions&, Arena* arena) { +extern Iterator* NewErrorIterator(const Status& status, Arena* arena); + +Iterator* CuckooTableReader::NewIterator( + const ReadOptions& read_options, Arena* arena) { if (!status().ok()) { return NewErrorIterator( - 
Status::Corruption("CuckooTableReader status is not okay.")); + Status::Corruption("CuckooTableReader status is not okay."), arena); + } + if (read_options.total_order_seek) { + return NewErrorIterator( + Status::InvalidArgument("total_order_seek is not supported."), arena); } CuckooTableIterator* iter; if (arena == nullptr) { diff --git a/table/cuckoo_table_reader.h b/table/cuckoo_table_reader.h index d252531cb5b..f9e93abf4e2 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo_table_reader.h @@ -16,6 +16,7 @@ #include "db/dbformat.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" #include "table/table_reader.h" namespace rocksdb { @@ -26,9 +27,10 @@ class TableReader; class CuckooTableReader: public TableReader { public: CuckooTableReader( - const Options& options, + const ImmutableCFOptions& ioptions, std::unique_ptr&& file, uint64_t file_size, + const Comparator* user_comparator, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)); ~CuckooTableReader() {} @@ -39,13 +41,14 @@ class CuckooTableReader: public TableReader { Status status() const { return status_; } Status Get( - const ReadOptions& readOptions, const Slice& key, void* handle_context, + const ReadOptions& read_options, const Slice& key, void* handle_context, bool (*result_handler)(void* arg, const ParsedInternalKey& k, const Slice& v), void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) override; Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override; + void Prepare(const Slice& target) override; // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const override; @@ -53,7 +56,6 @@ class CuckooTableReader: public TableReader { // Following methods are not implemented for Cuckoo Table Reader uint64_t ApproximateOffsetOf(const Slice& key) override { return 0; } void SetupForCompaction() override {} - void Prepare(const Slice& target) override {} // End of methods not implemented. 
private: @@ -62,14 +64,18 @@ class CuckooTableReader: public TableReader { std::unique_ptr file_; Slice file_data_; bool is_last_level_; + bool identity_as_first_hash_; std::shared_ptr table_props_; Status status_; - uint32_t num_hash_fun_; + uint32_t num_hash_func_; std::string unused_key_; uint32_t key_length_; uint32_t value_length_; uint32_t bucket_length_; - uint64_t num_buckets_; + uint32_t cuckoo_block_size_; + uint32_t cuckoo_block_bytes_minus_one_; + uint64_t table_size_minus_one_; + const Comparator* ucomp_; uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index, uint64_t max_num_buckets); }; diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index e9bf7bb10fa..6dd5e552554 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -11,7 +11,10 @@ int main() { } #else +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include @@ -33,12 +36,12 @@ using GFLAGS::SetUsageMessage; DEFINE_string(file_dir, "", "Directory where the files will be created" " for benchmark. Added for using tmpfs."); DEFINE_bool(enable_perf, false, "Run Benchmark Tests too."); +DEFINE_bool(write, false, + "Should write new values to file in performance tests?"); +DEFINE_bool(identity_as_first_hash, true, "use identity as first hash"); namespace rocksdb { -extern const uint64_t kCuckooTableMagicNumber; -extern const uint64_t kMaxNumHashTable; - namespace { const uint32_t kNumHashFunc = 10; // Methods, variables related to Hash functions. 
@@ -102,11 +105,13 @@ class CuckooReaderTest { return std::string(reinterpret_cast(&i), sizeof(i)); } - void CreateCuckooFileAndCheckReader() { - unique_ptr writable_file; + void CreateCuckooFileAndCheckReader( + const Comparator* ucomp = BytewiseComparator()) { + std::unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); CuckooTableBuilder builder( - writable_file.get(), 0.9, kNumHashFunc, 100, GetSliceHash); + writable_file.get(), 0.9, kNumHashFunc, 100, ucomp, 2, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t key_idx = 0; key_idx < num_items; ++key_idx) { builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); @@ -119,12 +124,14 @@ class CuckooReaderTest { ASSERT_OK(writable_file->Close()); // Check reader now. - unique_ptr read_file; + std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, + ioptions, std::move(read_file), file_size, + ucomp, GetSliceHash); ASSERT_OK(reader.status()); for (uint32_t i = 0; i < num_items; ++i) { @@ -143,13 +150,15 @@ class CuckooReaderTest { } } - void CheckIterator() { - unique_ptr read_file; + void CheckIterator(const Comparator* ucomp = BytewiseComparator()) { + std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, + ioptions, std::move(read_file), file_size, + ucomp, GetSliceHash); ASSERT_OK(reader.status()); Iterator* it = reader.NewIterator(ReadOptions(), nullptr); @@ -241,12 +250,40 @@ TEST(CuckooReaderTest, WhenKeyExists) { CreateCuckooFileAndCheckReader(); } +TEST(CuckooReaderTest, WhenKeyExistsWithUint64Comparator) { + SetUp(kNumHashFunc); + fname = test::TmpDir() + "/CuckooReaderUint64_WhenKeyExists"; + for (uint64_t i = 0; i < num_items; i++) { + user_keys[i].resize(8); + memcpy(&user_keys[i][0], 
static_cast(&i), 8); + ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue); + AppendInternalKey(&keys[i], ikey); + values[i] = "value" + NumToStr(i); + // Give disjoint hash values. + AddHashLookups(user_keys[i], i, kNumHashFunc); + } + CreateCuckooFileAndCheckReader(test::Uint64Comparator()); + // Last level file. + UpdateKeys(true); + CreateCuckooFileAndCheckReader(test::Uint64Comparator()); + // Test with collision. Make all hash values collide. + hash_map.clear(); + for (uint32_t i = 0; i < num_items; i++) { + AddHashLookups(user_keys[i], 0, kNumHashFunc); + } + UpdateKeys(false); + CreateCuckooFileAndCheckReader(test::Uint64Comparator()); + // Last level file. + UpdateKeys(true); + CreateCuckooFileAndCheckReader(test::Uint64Comparator()); +} + TEST(CuckooReaderTest, CheckIterator) { SetUp(2*kNumHashFunc); fname = test::TmpDir() + "/CuckooReader_CheckIterator"; for (uint64_t i = 0; i < num_items; i++) { user_keys[i] = "key" + NumToStr(i); - ParsedInternalKey ikey(user_keys[i], 0, kTypeValue); + ParsedInternalKey ikey(user_keys[i], 1000, kTypeValue); AppendInternalKey(&keys[i], ikey); values[i] = "value" + NumToStr(i); // Give disjoint hash values, in reverse order. @@ -260,6 +297,26 @@ TEST(CuckooReaderTest, CheckIterator) { CheckIterator(); } +TEST(CuckooReaderTest, CheckIteratorUint64) { + SetUp(2*kNumHashFunc); + fname = test::TmpDir() + "/CuckooReader_CheckIterator"; + for (uint64_t i = 0; i < num_items; i++) { + user_keys[i].resize(8); + memcpy(&user_keys[i][0], static_cast(&i), 8); + ParsedInternalKey ikey(user_keys[i], 1000, kTypeValue); + AppendInternalKey(&keys[i], ikey); + values[i] = "value" + NumToStr(i); + // Give disjoint hash values, in reverse order. + AddHashLookups(user_keys[i], num_items-i-1, kNumHashFunc); + } + CreateCuckooFileAndCheckReader(test::Uint64Comparator()); + CheckIterator(test::Uint64Comparator()); + // Last level file. 
+ UpdateKeys(true); + CreateCuckooFileAndCheckReader(test::Uint64Comparator()); + CheckIterator(test::Uint64Comparator()); +} + TEST(CuckooReaderTest, WhenKeyNotFound) { // Add keys with colliding hash values. SetUp(kNumHashFunc); @@ -273,12 +330,14 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { AddHashLookups(user_keys[i], 0, kNumHashFunc); } CreateCuckooFileAndCheckReader(); - unique_ptr read_file; + std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, + ioptions, std::move(read_file), file_size, + BytewiseComparator(), GetSliceHash); ASSERT_OK(reader.status()); // Search for a key with colliding hash values. @@ -303,31 +362,6 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { ASSERT_EQ(0, v.call_count); ASSERT_OK(reader.status()); - // Test read with corrupted key. - Slice corrupt_key("corrupt_ikey"); - ASSERT_TRUE(!ParseInternalKey(corrupt_key, &ikey)); - ASSERT_TRUE(reader.Get( - ReadOptions(), corrupt_key, &v, - AssertValues, nullptr).IsCorruption()); - ASSERT_EQ(0, v.call_count); - ASSERT_OK(reader.status()); - - // Test read with key of invalid length. - IterKey k; - k.SetInternalKey("very_long_key", 0, kTypeValue); - ASSERT_TRUE(reader.Get( - ReadOptions(), k.GetKey(), &v, - AssertValues, nullptr).IsInvalidArgument()); - ASSERT_EQ(0, v.call_count); - ASSERT_OK(reader.status()); - k.Clear(); - k.SetInternalKey("s", 0, kTypeValue); - ASSERT_TRUE(reader.Get( - ReadOptions(), k.GetKey(), &v, - AssertValues, nullptr).IsInvalidArgument()); - ASSERT_EQ(0, v.call_count); - ASSERT_OK(reader.status()); - // Test read when key is unused key. 
std::string unused_key = reader.GetTableProperties()->user_collected_properties.at( @@ -343,8 +377,15 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { // Performance tests namespace { +int64_t found_count = 0; +std::string value; bool DoNothing(void* arg, const ParsedInternalKey& k, const Slice& v) { // Deliberately empty. + if (*reinterpret_cast(k.user_key.data()) == + *reinterpret_cast(v.data())) { + ++found_count; + value.assign(v.data(), v.size()); + } return false; } @@ -356,94 +397,164 @@ bool CheckValue(void* cnt_ptr, const ParsedInternalKey& k, const Slice& v) { return false; } +void GetKeys(uint64_t num, std::vector* keys) { + keys->clear(); + IterKey k; + k.SetInternalKey("", 0, kTypeValue); + std::string internal_key_suffix = k.GetKey().ToString(); + ASSERT_EQ(static_cast(8), internal_key_suffix.size()); + for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { + uint64_t value = 2 * key_idx; + std::string new_key(reinterpret_cast(&value), sizeof(value)); + new_key += internal_key_suffix; + keys->push_back(new_key); + } +} + +std::string GetFileName(uint64_t num) { + if (FLAGS_file_dir.empty()) { + FLAGS_file_dir = test::TmpDir(); + } + return FLAGS_file_dir + "/cuckoo_read_benchmark" + + std::to_string(num/1000000) + "Mkeys"; +} + // Create last level file as we are interested in measuring performance of // last level file only. 
-void BM_CuckooRead(uint64_t num, uint32_t key_length, - uint32_t value_length, uint64_t num_reads, double hash_ratio) { - assert(value_length <= key_length); - assert(8 <= key_length); - std::vector keys; +void WriteFile(const std::vector& keys, + const uint64_t num, double hash_ratio) { Options options; options.allow_mmap_reads = true; Env* env = options.env; EnvOptions env_options = EnvOptions(options); - uint64_t file_size; - if (FLAGS_file_dir.empty()) { - FLAGS_file_dir = test::TmpDir(); - } - std::string fname = FLAGS_file_dir + "/cuckoo_read_benchmark"; + std::string fname = GetFileName(num); - unique_ptr writable_file; + std::unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); CuckooTableBuilder builder( writable_file.get(), hash_ratio, - kMaxNumHashTable, 1000, GetSliceMurmurHash); + 64, 1000, test::Uint64Comparator(), 5, + FLAGS_identity_as_first_hash, nullptr); ASSERT_OK(builder.status()); for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { // Value is just a part of key. 
- std::string new_key(reinterpret_cast(&key_idx), sizeof(key_idx)); - new_key = std::string(key_length - new_key.size(), 'k') + new_key; - ParsedInternalKey ikey(new_key, 0, kTypeValue); - std::string full_key; - AppendInternalKey(&full_key, ikey); - builder.Add(Slice(full_key), Slice(&full_key[0], value_length)); + builder.Add(Slice(keys[key_idx]), Slice(&keys[key_idx][0], 4)); ASSERT_EQ(builder.NumEntries(), key_idx + 1); ASSERT_OK(builder.status()); - keys.push_back(full_key); } ASSERT_OK(builder.Finish()); ASSERT_EQ(num, builder.NumEntries()); - file_size = builder.FileSize(); ASSERT_OK(writable_file->Close()); - unique_ptr read_file; + + uint64_t file_size; + env->GetFileSize(fname, &file_size); + std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, - std::move(read_file), - file_size, - GetSliceMurmurHash); + ioptions, std::move(read_file), file_size, + test::Uint64Comparator(), nullptr); + ASSERT_OK(reader.status()); + ReadOptions r_options; + for (uint64_t i = 0; i < num; ++i) { + int cnt = 0; + ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &cnt, CheckValue, nullptr)); + if (cnt != 1) { + fprintf(stderr, "%" PRIu64 " not found.\n", i); + ASSERT_EQ(1, cnt); + } + } +} + +void ReadKeys(uint64_t num, uint32_t batch_size) { + Options options; + options.allow_mmap_reads = true; + Env* env = options.env; + EnvOptions env_options = EnvOptions(options); + std::string fname = GetFileName(num); + + uint64_t file_size; + env->GetFileSize(fname, &file_size); + std::unique_ptr read_file; + ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + + const ImmutableCFOptions ioptions(options); + CuckooTableReader reader( + ioptions, std::move(read_file), file_size, test::Uint64Comparator(), + nullptr); ASSERT_OK(reader.status()); const UserCollectedProperties user_props = reader.GetTableProperties()->user_collected_properties; const 
uint32_t num_hash_fun = *reinterpret_cast( - user_props.at(CuckooTablePropertyNames::kNumHashTable).data()); - fprintf(stderr, "With %" PRIu64 " items and hash table ratio %f, number of" - " hash functions used: %u.\n", num, hash_ratio, num_hash_fun); + user_props.at(CuckooTablePropertyNames::kNumHashFunc).data()); + const uint64_t table_size = *reinterpret_cast( + user_props.at(CuckooTablePropertyNames::kHashTableSize).data()); + fprintf(stderr, "With %" PRIu64 " items, utilization is %.2f%%, number of" + " hash functions: %u.\n", num, num * 100.0 / (table_size), num_hash_fun); ReadOptions r_options; - for (auto& key : keys) { - int cnt = 0; - ASSERT_OK(reader.Get(r_options, Slice(key), &cnt, CheckValue, nullptr)); - ASSERT_EQ(1, cnt); + + std::vector keys; + keys.reserve(num); + for (uint64_t i = 0; i < num; ++i) { + keys.push_back(2 * i); } - // Shuffle Keys. std::random_shuffle(keys.begin(), keys.end()); - uint64_t time_now = env->NowMicros(); - reader.NewIterator(ReadOptions(), nullptr); - fprintf(stderr, "Time taken for preparing iterator for %" PRIu64 " items: %" PRIu64 " ms.\n", - num, (env->NowMicros() - time_now)/1000); - time_now = env->NowMicros(); - for (uint64_t i = 0; i < num_reads; ++i) { - reader.Get(r_options, Slice(keys[i % num]), nullptr, DoNothing, nullptr); + found_count = 0; + uint64_t start_time = env->NowMicros(); + if (batch_size > 0) { + for (uint64_t i = 0; i < num; i += batch_size) { + for (uint64_t j = i; j < i+batch_size && j < num; ++j) { + reader.Prepare(Slice(reinterpret_cast(&keys[j]), 16)); + } + for (uint64_t j = i; j < i+batch_size && j < num; ++j) { + reader.Get(r_options, Slice(reinterpret_cast(&keys[j]), 16), + nullptr, DoNothing, nullptr); + } + } + } else { + for (uint64_t i = 0; i < num; i++) { + reader.Get(r_options, Slice(reinterpret_cast(&keys[i]), 16), + nullptr, DoNothing, nullptr); + } } - fprintf(stderr, "Time taken per op is %.3fus\n", - (env->NowMicros() - time_now)*1.0/num_reads); + float time_per_op = 
(env->NowMicros() - start_time) * 1.0 / num; + fprintf(stderr, + "Time taken per op is %.3fus (%.1f Mqps) with batch size of %u, " + "# of found keys %" PRId64 "\n", + time_per_op, 1.0 / time_per_op, batch_size, found_count); } } // namespace. -TEST(CuckooReaderTest, Performance) { - // In all these tests, num_reads = 10*num_items. +TEST(CuckooReaderTest, TestReadPerformance) { if (!FLAGS_enable_perf) { return; } - BM_CuckooRead(100000, 8, 4, 1000000, 0.9); - BM_CuckooRead(1000000, 8, 4, 10000000, 0.9); - BM_CuckooRead(1000000, 8, 4, 10000000, 0.7); - BM_CuckooRead(10000000, 8, 4, 100000000, 0.9); - BM_CuckooRead(10000000, 8, 4, 100000000, 0.7); + double hash_ratio = 0.95; + // These numbers are chosen to have a hash utilizaiton % close to + // 0.9, 0.75, 0.6 and 0.5 respectively. + // They all create 128 M buckets. + std::vector nums = {120*1024*1024, 100*1024*1024, 80*1024*1024, + 70*1024*1024}; +#ifndef NDEBUG + fprintf(stdout, + "WARNING: Not compiled with DNDEBUG. Performance tests may be slow.\n"); +#endif + std::vector keys; + for (uint64_t num : nums) { + if (FLAGS_write || !Env::Default()->FileExists(GetFileName(num))) { + GetKeys(num, &keys); + WriteFile(keys, num, hash_ratio); + } + ReadKeys(num, 0); + ReadKeys(num, 10); + ReadKeys(num, 25); + ReadKeys(num, 50); + ReadKeys(num, 100); + fprintf(stderr, "\n"); + } } - } // namespace rocksdb int main(int argc, char** argv) { diff --git a/table/filter_block.h b/table/filter_block.h index 000e106b26d..19767682768 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -10,6 +10,11 @@ // A filter block is stored near the end of a Table file. It contains // filters (e.g., bloom filters) for all data blocks in the table combined // into a single filter block. +// +// It is a base class for BlockBasedFilter and FullFilter. +// These two are both used in BlockBasedTable. The first one contain filter +// For a part of keys in sst file, the second contain filter for all keys +// in sst file. 
#pragma once @@ -21,10 +26,13 @@ #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" #include "util/hash.h" +#include "format.h" namespace rocksdb { +const uint64_t kNotValid = ULLONG_MAX; class FilterPolicy; // A FilterBlockBuilder is used to construct all of the filters for a @@ -32,62 +40,45 @@ class FilterPolicy; // a special block in the Table. // // The sequence of calls to FilterBlockBuilder must match the regexp: -// (StartBlock AddKey*)* Finish +// (StartBlock Add*)* Finish +// +// BlockBased/Full FilterBlock would be called in the same way. class FilterBlockBuilder { public: - explicit FilterBlockBuilder(const Options& opt, - const Comparator* internal_comparator); + explicit FilterBlockBuilder() {} + virtual ~FilterBlockBuilder() {} - void StartBlock(uint64_t block_offset); - void AddKey(const Slice& key); - Slice Finish(); + virtual bool IsBlockBased() = 0; // If is blockbased filter + virtual void StartBlock(uint64_t block_offset) = 0; // Start new block filter + virtual void Add(const Slice& key) = 0; // Add a key to current filter + virtual Slice Finish() = 0; // Generate Filter private: - bool SamePrefix(const Slice &key1, const Slice &key2) const; - void GenerateFilter(); - - // important: all of these might point to invalid addresses - // at the time of destruction of this filter block. destructor - // should NOT dereference them. 
- const FilterPolicy* policy_; - const SliceTransform* prefix_extractor_; - bool whole_key_filtering_; - const Comparator* comparator_; - - std::string entries_; // Flattened entry contents - std::vector start_; // Starting index in entries_ of each entry - std::string result_; // Filter data computed so far - std::vector tmp_entries_; // policy_->CreateFilter() argument - std::vector filter_offsets_; - // No copying allowed FilterBlockBuilder(const FilterBlockBuilder&); void operator=(const FilterBlockBuilder&); }; +// A FilterBlockReader is used to parse filter from SST table. +// KeyMayMatch and PrefixMayMatch would trigger filter checking +// +// BlockBased/Full FilterBlock would be called in the same way. class FilterBlockReader { public: - // REQUIRES: "contents" and *policy must stay live while *this is live. - FilterBlockReader( - const Options& opt, - const Slice& contents, - bool delete_contents_after_use = false); - bool KeyMayMatch(uint64_t block_offset, const Slice& key); - bool PrefixMayMatch(uint64_t block_offset, const Slice& prefix); - size_t ApproximateMemoryUsage() const; - - private: - const FilterPolicy* policy_; - const SliceTransform* prefix_extractor_; - bool whole_key_filtering_; - const char* data_; // Pointer to filter data (at block-start) - const char* offset_; // Pointer to beginning of offset array (at block-end) - size_t num_; // Number of entries in offset array - size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file) - std::unique_ptr filter_data; + explicit FilterBlockReader() {} + virtual ~FilterBlockReader() {} + virtual bool IsBlockBased() = 0; // If is blockbased filter + virtual bool KeyMayMatch(const Slice& key, + uint64_t block_offset = kNotValid) = 0; + virtual bool PrefixMayMatch(const Slice& prefix, + uint64_t block_offset = kNotValid) = 0; + virtual size_t ApproximateMemoryUsage() const = 0; - bool MayMatch(uint64_t block_offset, const Slice& entry); + private: + // No copying allowed + 
FilterBlockReader(const FilterBlockReader&); + void operator=(const FilterBlockReader&); }; -} +} // namespace rocksdb diff --git a/table/filter_block_test.cc b/table/filter_block_test.cc deleted file mode 100644 index 1703d59d179..00000000000 --- a/table/filter_block_test.cc +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2012 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "table/filter_block.h" - -#include "rocksdb/filter_policy.h" -#include "util/coding.h" -#include "util/hash.h" -#include "util/logging.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace rocksdb { - -// For testing: emit an array with one hash value per key -class TestHashFilter : public FilterPolicy { - public: - virtual const char* Name() const { - return "TestHashFilter"; - } - - virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { - for (int i = 0; i < n; i++) { - uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); - PutFixed32(dst, h); - } - } - - virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { - uint32_t h = Hash(key.data(), key.size(), 1); - for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { - if (h == DecodeFixed32(filter.data() + i)) { - return true; - } - } - return false; - } -}; - -class FilterBlockTest { - public: - TestHashFilter policy_; - Options options_; - - FilterBlockTest() { - options_ = Options(); - options_.filter_policy = &policy_; - } -}; - -TEST(FilterBlockTest, EmptyBuilder) { - FilterBlockBuilder builder(options_, options_.comparator); - 
Slice block = builder.Finish(); - ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); - FilterBlockReader reader(options_, block); - ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); - ASSERT_TRUE(reader.KeyMayMatch(100000, "foo")); -} - -TEST(FilterBlockTest, SingleChunk) { - FilterBlockBuilder builder(options_, options_.comparator); - builder.StartBlock(100); - builder.AddKey("foo"); - builder.AddKey("bar"); - builder.AddKey("box"); - builder.StartBlock(200); - builder.AddKey("box"); - builder.StartBlock(300); - builder.AddKey("hello"); - Slice block = builder.Finish(); - FilterBlockReader reader(options_, block); - ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); - ASSERT_TRUE(reader.KeyMayMatch(100, "bar")); - ASSERT_TRUE(reader.KeyMayMatch(100, "box")); - ASSERT_TRUE(reader.KeyMayMatch(100, "hello")); - ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); - ASSERT_TRUE(! reader.KeyMayMatch(100, "missing")); - ASSERT_TRUE(! reader.KeyMayMatch(100, "other")); -} - -TEST(FilterBlockTest, MultiChunk) { - FilterBlockBuilder builder(options_, options_.comparator); - - // First filter - builder.StartBlock(0); - builder.AddKey("foo"); - builder.StartBlock(2000); - builder.AddKey("bar"); - - // Second filter - builder.StartBlock(3100); - builder.AddKey("box"); - - // Third filter is empty - - // Last filter - builder.StartBlock(9000); - builder.AddKey("box"); - builder.AddKey("hello"); - - Slice block = builder.Finish(); - FilterBlockReader reader(options_, block); - - // Check first filter - ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); - ASSERT_TRUE(reader.KeyMayMatch(2000, "bar")); - ASSERT_TRUE(! reader.KeyMayMatch(0, "box")); - ASSERT_TRUE(! reader.KeyMayMatch(0, "hello")); - - // Check second filter - ASSERT_TRUE(reader.KeyMayMatch(3100, "box")); - ASSERT_TRUE(! reader.KeyMayMatch(3100, "foo")); - ASSERT_TRUE(! reader.KeyMayMatch(3100, "bar")); - ASSERT_TRUE(! reader.KeyMayMatch(3100, "hello")); - - // Check third filter (empty) - ASSERT_TRUE(! 
reader.KeyMayMatch(4100, "foo")); - ASSERT_TRUE(! reader.KeyMayMatch(4100, "bar")); - ASSERT_TRUE(! reader.KeyMayMatch(4100, "box")); - ASSERT_TRUE(! reader.KeyMayMatch(4100, "hello")); - - // Check last filter - ASSERT_TRUE(reader.KeyMayMatch(9000, "box")); - ASSERT_TRUE(reader.KeyMayMatch(9000, "hello")); - ASSERT_TRUE(! reader.KeyMayMatch(9000, "foo")); - ASSERT_TRUE(! reader.KeyMayMatch(9000, "bar")); -} - -} // namespace rocksdb - -int main(int argc, char** argv) { - return rocksdb::test::RunAllTests(); -} diff --git a/table/flush_block_policy.cc b/table/flush_block_policy.cc index 4e2235205fa..4c12b30bb2b 100644 --- a/table/flush_block_policy.cc +++ b/table/flush_block_policy.cc @@ -62,9 +62,11 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy { }; FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( - const Options& options, const BlockBuilder& data_block_builder) const { + const BlockBasedTableOptions& table_options, + const BlockBuilder& data_block_builder) const { return new FlushBlockBySizePolicy( - options.block_size, options.block_size_deviation, data_block_builder); + table_options.block_size, table_options.block_size_deviation, + data_block_builder); } } // namespace rocksdb diff --git a/table/format.cc b/table/format.cc index a642965d551..db11f9d4a85 100644 --- a/table/format.cc +++ b/table/format.cc @@ -135,7 +135,7 @@ Status Footer::DecodeFrom(Slice* input) { snprintf(buffer, sizeof(buffer) - 1, "not an sstable (bad magic number --- %lx)", (long)magic); - return Status::InvalidArgument(buffer); + return Status::Corruption(buffer); } } else { set_table_magic_number(magic); @@ -156,7 +156,7 @@ Status Footer::DecodeFrom(Slice* input) { // It consists of the checksum type, two block handles, padding, // a version number, and a magic number if (input->size() < kVersion1EncodedLength) { - return Status::InvalidArgument("input is too short to be an sstable"); + return Status::Corruption("input is too short to be an sstable"); } 
else { input->remove_prefix(input->size() - kVersion1EncodedLength); } @@ -183,7 +183,7 @@ Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size, Footer* footer) { if (file_size < Footer::kMinEncodedLength) { - return Status::InvalidArgument("file is too short to be an sstable"); + return Status::Corruption("file is too short to be an sstable"); } char footer_space[Footer::kMaxEncodedLength]; @@ -198,7 +198,7 @@ Status ReadFooterFromFile(RandomAccessFile* file, // Check that we actually read the whole footer from the file. It may be // that size isn't correct. if (footer_input.size() < Footer::kMinEncodedLength) { - return Status::InvalidArgument("file is too short to be an sstable"); + return Status::Corruption("file is too short to be an sstable"); } return footer->DecodeFrom(&footer_input); @@ -211,10 +211,13 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, Slice* contents, /* result of reading */ char* buf) { size_t n = static_cast(handle.size()); + Status s; + + { + PERF_TIMER_GUARD(block_read_time); + s = file->Read(handle.offset(), n + kBlockTrailerSize, contents, buf); + } - PERF_TIMER_AUTO(block_read_time); - Status s = file->Read(handle.offset(), n + kBlockTrailerSize, contents, buf); - PERF_TIMER_MEASURE(block_read_time); PERF_COUNTER_ADD(block_read_count, 1); PERF_COUNTER_ADD(block_read_byte, n + kBlockTrailerSize); @@ -228,6 +231,7 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer, // Check the crc of the type and the block contents const char* data = contents->data(); // Pointer to where Read put the data if (options.verify_checksums) { + PERF_TIMER_GUARD(block_checksum_time); uint32_t value = DecodeFixed32(data + n + 1); uint32_t actual = 0; switch (footer.checksum()) { @@ -247,118 +251,58 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer, if (!s.ok()) { return s; } - PERF_TIMER_STOP(block_checksum_time); } return s; } -// Decompress a 
block according to params -// May need to malloc a space for cache usage -Status DecompressBlock(BlockContents* result, size_t block_size, - bool do_uncompress, const char* buf, - const Slice& contents, bool use_stack_buf) { - Status s; - size_t n = block_size; - const char* data = contents.data(); - - result->data = Slice(); - result->cachable = false; - result->heap_allocated = false; - - PERF_TIMER_AUTO(block_decompress_time); - rocksdb::CompressionType compression_type = - static_cast(data[n]); - // If the caller has requested that the block not be uncompressed - if (!do_uncompress || compression_type == kNoCompression) { - if (data != buf) { - // File implementation gave us pointer to some other data. - // Use it directly under the assumption that it will be live - // while the file is open. - result->data = Slice(data, n); - result->heap_allocated = false; - result->cachable = false; // Do not double-cache - } else { - if (use_stack_buf) { - // Need to allocate space in heap for cache usage - char* new_buf = new char[n]; - memcpy(new_buf, buf, n); - result->data = Slice(new_buf, n); - } else { - result->data = Slice(buf, n); - } - - result->heap_allocated = true; - result->cachable = true; - } - result->compression_type = compression_type; - s = Status::OK(); +Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, + const ReadOptions& options, const BlockHandle& handle, + BlockContents* contents, Env* env, + bool decompression_requested) { + Status status; + Slice slice; + size_t n = static_cast(handle.size()); + std::unique_ptr heap_buf; + char stack_buf[DefaultStackBufferSize]; + char* used_buf = nullptr; + rocksdb::CompressionType compression_type; + + if (decompression_requested && + n + kBlockTrailerSize < DefaultStackBufferSize) { + // If we've got a small enough hunk of data, read it in to the + // trivially allocated stack buffer instead of needing a full malloc() + used_buf = &stack_buf[0]; } else { - s = UncompressBlockContents(data, 
n, result); + heap_buf = std::unique_ptr(new char[n + kBlockTrailerSize]); + used_buf = heap_buf.get(); } - PERF_TIMER_STOP(block_decompress_time); - return s; -} -// Read and Decompress block -// Use buf in stack as temp reading buffer -Status ReadAndDecompressFast(RandomAccessFile* file, const Footer& footer, - const ReadOptions& options, - const BlockHandle& handle, BlockContents* result, - Env* env, bool do_uncompress) { - Status s; - Slice contents; - size_t n = static_cast(handle.size()); - char buf[DefaultStackBufferSize]; + status = ReadBlock(file, footer, options, handle, &slice, used_buf); - s = ReadBlock(file, footer, options, handle, &contents, buf); - if (!s.ok()) { - return s; - } - s = DecompressBlock(result, n, do_uncompress, buf, contents, true); - if (!s.ok()) { - return s; + if (!status.ok()) { + return status; } - return s; -} -// Read and Decompress block -// Use buf in heap as temp reading buffer -Status ReadAndDecompress(RandomAccessFile* file, const Footer& footer, - const ReadOptions& options, const BlockHandle& handle, - BlockContents* result, Env* env, bool do_uncompress) { - Status s; - Slice contents; - size_t n = static_cast(handle.size()); - char* buf = new char[n + kBlockTrailerSize]; + PERF_TIMER_GUARD(block_decompress_time); - s = ReadBlock(file, footer, options, handle, &contents, buf); - if (!s.ok()) { - delete[] buf; - return s; - } - s = DecompressBlock(result, n, do_uncompress, buf, contents, false); - if (!s.ok()) { - delete[] buf; - return s; + compression_type = static_cast(slice.data()[n]); + + if (decompression_requested && compression_type != kNoCompression) { + return UncompressBlockContents(slice.data(), n, contents); } - if (result->data.data() != buf) { - delete[] buf; + if (slice.data() != used_buf) { + *contents = BlockContents(Slice(slice.data(), n), false, compression_type); + return status; } - return s; -} -Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, - const ReadOptions& options, 
const BlockHandle& handle, - BlockContents* result, Env* env, bool do_uncompress) { - size_t n = static_cast(handle.size()); - if (do_uncompress && n + kBlockTrailerSize < DefaultStackBufferSize) { - return ReadAndDecompressFast(file, footer, options, handle, result, env, - do_uncompress); - } else { - return ReadAndDecompress(file, footer, options, handle, result, env, - do_uncompress); + if (used_buf == &stack_buf[0]) { + heap_buf = std::unique_ptr(new char[n]); + memcpy(heap_buf.get(), stack_buf, n); } + + *contents = BlockContents(std::move(heap_buf), n, true, compression_type); + return status; } // @@ -368,8 +312,8 @@ Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, // buffer is returned via 'result' and it is upto the caller to // free this buffer. Status UncompressBlockContents(const char* data, size_t n, - BlockContents* result) { - char* ubuf = nullptr; + BlockContents* contents) { + std::unique_ptr ubuf; int decompress_size = 0; assert(data[n] != kNoCompression); switch (data[n]) { @@ -380,64 +324,60 @@ Status UncompressBlockContents(const char* data, size_t n, if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) { return Status::Corruption(snappy_corrupt_msg); } - ubuf = new char[ulength]; - if (!port::Snappy_Uncompress(data, n, ubuf)) { - delete[] ubuf; + ubuf = std::unique_ptr(new char[ulength]); + if (!port::Snappy_Uncompress(data, n, ubuf.get())) { return Status::Corruption(snappy_corrupt_msg); } - result->data = Slice(ubuf, ulength); - result->heap_allocated = true; - result->cachable = true; + *contents = BlockContents(std::move(ubuf), ulength, true, kNoCompression); break; } case kZlibCompression: - ubuf = port::Zlib_Uncompress(data, n, &decompress_size); + ubuf = std::unique_ptr( + port::Zlib_Uncompress(data, n, &decompress_size)); static char zlib_corrupt_msg[] = "Zlib not supported or corrupted Zlib compressed block contents"; if (!ubuf) { return Status::Corruption(zlib_corrupt_msg); } - result->data = Slice(ubuf, 
decompress_size); - result->heap_allocated = true; - result->cachable = true; + *contents = + BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kBZip2Compression: - ubuf = port::BZip2_Uncompress(data, n, &decompress_size); + ubuf = std::unique_ptr( + port::BZip2_Uncompress(data, n, &decompress_size)); static char bzip2_corrupt_msg[] = "Bzip2 not supported or corrupted Bzip2 compressed block contents"; if (!ubuf) { return Status::Corruption(bzip2_corrupt_msg); } - result->data = Slice(ubuf, decompress_size); - result->heap_allocated = true; - result->cachable = true; + *contents = + BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kLZ4Compression: - ubuf = port::LZ4_Uncompress(data, n, &decompress_size); + ubuf = std::unique_ptr( + port::LZ4_Uncompress(data, n, &decompress_size)); static char lz4_corrupt_msg[] = "LZ4 not supported or corrupted LZ4 compressed block contents"; if (!ubuf) { return Status::Corruption(lz4_corrupt_msg); } - result->data = Slice(ubuf, decompress_size); - result->heap_allocated = true; - result->cachable = true; + *contents = + BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kLZ4HCCompression: - ubuf = port::LZ4_Uncompress(data, n, &decompress_size); + ubuf = std::unique_ptr( + port::LZ4_Uncompress(data, n, &decompress_size)); static char lz4hc_corrupt_msg[] = "LZ4HC not supported or corrupted LZ4HC compressed block contents"; if (!ubuf) { return Status::Corruption(lz4hc_corrupt_msg); } - result->data = Slice(ubuf, decompress_size); - result->heap_allocated = true; - result->cachable = true; + *contents = + BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; default: return Status::Corruption("bad block type"); } - result->compression_type = kNoCompression; // not compressed any more return Status::OK(); } diff --git a/table/format.h b/table/format.h index a971c1a67c9..986164d81be 100644 --- a/table/format.h +++ 
b/table/format.h @@ -160,18 +160,29 @@ static const size_t kBlockTrailerSize = 5; struct BlockContents { Slice data; // Actual contents of data bool cachable; // True iff data can be cached - bool heap_allocated; // True iff caller should delete[] data.data() CompressionType compression_type; + std::unique_ptr allocation; + + BlockContents() : cachable(false), compression_type(kNoCompression) {} + + BlockContents(const Slice& _data, bool _cachable, + CompressionType _compression_type) + : data(_data), cachable(_cachable), compression_type(_compression_type) {} + + BlockContents(std::unique_ptr&& _data, size_t _size, bool _cachable, + CompressionType _compression_type) + : data(_data.get(), _size), + cachable(_cachable), + compression_type(_compression_type), + allocation(std::move(_data)) {} }; // Read the block identified by "handle" from "file". On failure // return non-OK. On success fill *result and return OK. -extern Status ReadBlockContents(RandomAccessFile* file, - const Footer& footer, +extern Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, - BlockContents* result, - Env* env, + BlockContents* contents, Env* env, bool do_uncompress); // The 'data' points to the raw block contents read in from file. @@ -179,9 +190,8 @@ extern Status ReadBlockContents(RandomAccessFile* file, // contents are uncompresed into this buffer. This buffer is // returned via 'result' and it is upto the caller to // free this buffer. -extern Status UncompressBlockContents(const char* data, - size_t n, - BlockContents* result); +extern Status UncompressBlockContents(const char* data, size_t n, + BlockContents* contents); // Implementation details follow. Clients should ignore, diff --git a/table/full_filter_block.cc b/table/full_filter_block.cc new file mode 100644 index 00000000000..4113ec57a9f --- /dev/null +++ b/table/full_filter_block.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2014, Facebook, Inc. 
All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "table/full_filter_block.h" + +#include "rocksdb/filter_policy.h" +#include "port/port.h" +#include "util/coding.h" + +namespace rocksdb { + +FullFilterBlockBuilder::FullFilterBlockBuilder( + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + FilterBitsBuilder* filter_bits_builder) + : prefix_extractor_(prefix_extractor), + whole_key_filtering_(table_opt.whole_key_filtering), + num_added_(0) { + assert(filter_bits_builder != nullptr); + filter_bits_builder_.reset(filter_bits_builder); +} + +void FullFilterBlockBuilder::Add(const Slice& key) { + if (whole_key_filtering_) { + AddKey(key); + } + if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { + AddPrefix(key); + } +} + +// Add key to filter if needed +inline void FullFilterBlockBuilder::AddKey(const Slice& key) { + filter_bits_builder_->AddKey(key); + num_added_++; +} + +// Add prefix to filter if needed +inline void FullFilterBlockBuilder::AddPrefix(const Slice& key) { + Slice prefix = prefix_extractor_->Transform(key); + filter_bits_builder_->AddKey(prefix); + num_added_++; +} + +Slice FullFilterBlockBuilder::Finish() { + if (num_added_ != 0) { + num_added_ = 0; + return filter_bits_builder_->Finish(&filter_data_); + } + return Slice(); +} + +FullFilterBlockReader::FullFilterBlockReader( + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, const Slice& contents, + FilterBitsReader* filter_bits_reader) + : prefix_extractor_(prefix_extractor), + whole_key_filtering_(table_opt.whole_key_filtering), + contents_(contents) { + assert(filter_bits_reader != nullptr); + filter_bits_reader_.reset(filter_bits_reader); +} + +FullFilterBlockReader::FullFilterBlockReader( + const 
SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, BlockContents&& contents, + FilterBitsReader* filter_bits_reader) + : FullFilterBlockReader(prefix_extractor, table_opt, contents.data, + filter_bits_reader) { + block_contents_ = std::move(contents); +} + +bool FullFilterBlockReader::KeyMayMatch(const Slice& key, + uint64_t block_offset) { + assert(block_offset == kNotValid); + if (!whole_key_filtering_) { + return true; + } + return MayMatch(key); +} + +bool FullFilterBlockReader::PrefixMayMatch(const Slice& prefix, + uint64_t block_offset) { + assert(block_offset == kNotValid); + if (!prefix_extractor_) { + return true; + } + return MayMatch(prefix); +} + +bool FullFilterBlockReader::MayMatch(const Slice& entry) { + if (contents_.size() != 0) { + return filter_bits_reader_->MayMatch(entry); + } + return true; // remain the same with block_based filter +} + +size_t FullFilterBlockReader::ApproximateMemoryUsage() const { + return contents_.size(); +} +} // namespace rocksdb diff --git a/table/full_filter_block.h b/table/full_filter_block.h new file mode 100644 index 00000000000..6d6294cf245 --- /dev/null +++ b/table/full_filter_block.h @@ -0,0 +1,111 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include +#include +#include +#include +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "db/dbformat.h" +#include "util/hash.h" +#include "table/filter_block.h" + +namespace rocksdb { + +class FilterPolicy; +class FilterBitsBuilder; +class FilterBitsReader; + +// A FullFilterBlockBuilder is used to construct a full filter for a +// particular Table. 
It generates a single string which is stored as +// a special block in the Table. +// The format of full filter block is: +// +----------------------------------------------------------------+ +// | full filter for all keys in sst file | +// +----------------------------------------------------------------+ +// The full filter can be very large. At the end of it, we put +// num_probes: how many hash functions are used in bloom filter +// +class FullFilterBlockBuilder : public FilterBlockBuilder { + public: + explicit FullFilterBlockBuilder(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + FilterBitsBuilder* filter_bits_builder); + // bits_builder is created in filter_policy, it should be passed in here + // directly. and be deleted here + ~FullFilterBlockBuilder() {} + + virtual bool IsBlockBased() override { return false; } + virtual void StartBlock(uint64_t block_offset) override {} + virtual void Add(const Slice& key) override; + virtual Slice Finish() override; + + private: + // important: all of these might point to invalid addresses + // at the time of destruction of this filter block. destructor + // should NOT dereference them. + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + + uint32_t num_added_; + std::unique_ptr filter_bits_builder_; + std::unique_ptr filter_data_; + + void AddKey(const Slice& key); + void AddPrefix(const Slice& key); + + // No copying allowed + FullFilterBlockBuilder(const FullFilterBlockBuilder&); + void operator=(const FullFilterBlockBuilder&); +}; + +// A FilterBlockReader is used to parse filter from SST table. +// KeyMayMatch and PrefixMayMatch would trigger filter checking +class FullFilterBlockReader : public FilterBlockReader { + public: + // REQUIRES: "contents" and filter_bits_reader must stay live + // while *this is live. 
+ explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + const Slice& contents, + FilterBitsReader* filter_bits_reader); + explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + BlockContents&& contents, + FilterBitsReader* filter_bits_reader); + + // bits_reader is created in filter_policy, it should be passed in here + // directly. and be deleted here + ~FullFilterBlockReader() {} + + virtual bool IsBlockBased() override { return false; } + virtual bool KeyMayMatch(const Slice& key, + uint64_t block_offset = kNotValid) override; + virtual bool PrefixMayMatch(const Slice& prefix, + uint64_t block_offset = kNotValid) override; + virtual size_t ApproximateMemoryUsage() const override; + + private: + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + + std::unique_ptr filter_bits_reader_; + Slice contents_; + BlockContents block_contents_; + std::unique_ptr filter_data_; + + bool MayMatch(const Slice& entry); + + // No copying allowed + FullFilterBlockReader(const FullFilterBlockReader&); + void operator=(const FullFilterBlockReader&); +}; + +} // namespace rocksdb diff --git a/table/full_filter_block_test.cc b/table/full_filter_block_test.cc new file mode 100644 index 00000000000..7bf61f2383b --- /dev/null +++ b/table/full_filter_block_test.cc @@ -0,0 +1,182 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "table/full_filter_block.h" + +#include "rocksdb/filter_policy.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class TestFilterBitsBuilder : public FilterBitsBuilder { + public: + explicit TestFilterBitsBuilder() {} + + // Add Key to filter + virtual void AddKey(const Slice& key) override { + hash_entries_.push_back(Hash(key.data(), key.size(), 1)); + } + + // Generate the filter using the keys that are added + virtual Slice Finish(std::unique_ptr* buf) override { + uint32_t len = hash_entries_.size() * 4; + char* data = new char[len]; + for (size_t i = 0; i < hash_entries_.size(); i++) { + EncodeFixed32(data + i * 4, hash_entries_[i]); + } + const char* const_data = data; + buf->reset(const_data); + return Slice(data, len); + } + + private: + std::vector hash_entries_; +}; + +class TestFilterBitsReader : public FilterBitsReader { + public: + explicit TestFilterBitsReader(const Slice& contents) + : data_(contents.data()), len_(contents.size()) {} + + virtual bool MayMatch(const Slice& entry) override { + uint32_t h = Hash(entry.data(), entry.size(), 1); + for (size_t i = 0; i + 4 <= len_; i += 4) { + if (h == DecodeFixed32(data_ + i)) { + return true; + } + } + return false; + } + + private: + const char* data_; + uint32_t len_; +}; + + +class TestHashFilter : public FilterPolicy { + public: + virtual const char* Name() const { + return "TestHashFilter"; + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { + uint32_t h = Hash(key.data(), key.size(), 1); + for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return false; + } + + virtual 
FilterBitsBuilder* GetFilterBitsBuilder() const override { + return new TestFilterBitsBuilder(); + } + + virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents) + const override { + return new TestFilterBitsReader(contents); + } +}; + +class PluginFullFilterBlockTest { + public: + BlockBasedTableOptions table_options_; + + PluginFullFilterBlockTest() { + table_options_.filter_policy.reset(new TestHashFilter()); + } +}; + +TEST(PluginFullFilterBlockTest, PluginEmptyBuilder) { + FullFilterBlockBuilder builder(nullptr, table_options_, + table_options_.filter_policy->GetFilterBitsBuilder()); + Slice block = builder.Finish(); + ASSERT_EQ("", EscapeString(block)); + + FullFilterBlockReader reader(nullptr, table_options_, block, + table_options_.filter_policy->GetFilterBitsReader(block)); + // Remain same symantic with blockbased filter + ASSERT_TRUE(reader.KeyMayMatch("foo")); +} + +TEST(PluginFullFilterBlockTest, PluginSingleChunk) { + FullFilterBlockBuilder builder(nullptr, table_options_, + table_options_.filter_policy->GetFilterBitsBuilder()); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.Add("box"); + builder.Add("hello"); + Slice block = builder.Finish(); + FullFilterBlockReader reader(nullptr, table_options_, block, + table_options_.filter_policy->GetFilterBitsReader(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo")); + ASSERT_TRUE(reader.KeyMayMatch("bar")); + ASSERT_TRUE(reader.KeyMayMatch("box")); + ASSERT_TRUE(reader.KeyMayMatch("hello")); + ASSERT_TRUE(reader.KeyMayMatch("foo")); + ASSERT_TRUE(!reader.KeyMayMatch("missing")); + ASSERT_TRUE(!reader.KeyMayMatch("other")); +} + +class FullFilterBlockTest { + public: + BlockBasedTableOptions table_options_; + + FullFilterBlockTest() { + table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); + } + + ~FullFilterBlockTest() {} +}; + +TEST(FullFilterBlockTest, EmptyBuilder) { + FullFilterBlockBuilder builder(nullptr, table_options_, + 
table_options_.filter_policy->GetFilterBitsBuilder()); + Slice block = builder.Finish(); + ASSERT_EQ("", EscapeString(block)); + + FullFilterBlockReader reader(nullptr, table_options_, block, + table_options_.filter_policy->GetFilterBitsReader(block)); + // Remain same symantic with blockbased filter + ASSERT_TRUE(reader.KeyMayMatch("foo")); +} + +TEST(FullFilterBlockTest, SingleChunk) { + FullFilterBlockBuilder builder(nullptr, table_options_, + table_options_.filter_policy->GetFilterBitsBuilder()); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.Add("box"); + builder.Add("hello"); + Slice block = builder.Finish(); + FullFilterBlockReader reader(nullptr, table_options_, block, + table_options_.filter_policy->GetFilterBitsReader(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo")); + ASSERT_TRUE(reader.KeyMayMatch("bar")); + ASSERT_TRUE(reader.KeyMayMatch("box")); + ASSERT_TRUE(reader.KeyMayMatch("hello")); + ASSERT_TRUE(reader.KeyMayMatch("foo")); + ASSERT_TRUE(!reader.KeyMayMatch("missing")); + ASSERT_TRUE(!reader.KeyMayMatch("other")); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/table/merger.cc b/table/merger.cc index 611480cec02..a53376cebce 100644 --- a/table/merger.cc +++ b/table/merger.cc @@ -116,12 +116,12 @@ class MergingIterator : public Iterator { // Invalidate the heap. use_heap_ = false; IteratorWrapper* first_child = nullptr; - PERF_TIMER_DECLARE(); for (auto& child : children_) { - PERF_TIMER_START(seek_child_seek_time); - child.Seek(target); - PERF_TIMER_STOP(seek_child_seek_time); + { + PERF_TIMER_GUARD(seek_child_seek_time); + child.Seek(target); + } PERF_COUNTER_ADD(seek_child_seek_count, 1); if (child.Valid()) { @@ -134,24 +134,21 @@ class MergingIterator : public Iterator { } else { // We have more than one children with valid keys. Initialize // the heap and put the first child into the heap. 
- PERF_TIMER_START(seek_min_heap_time); + PERF_TIMER_GUARD(seek_min_heap_time); ClearHeaps(); minHeap_.push(first_child); - PERF_TIMER_STOP(seek_min_heap_time); } } if (use_heap_) { - PERF_TIMER_START(seek_min_heap_time); + PERF_TIMER_GUARD(seek_min_heap_time); minHeap_.push(&child); - PERF_TIMER_STOP(seek_min_heap_time); } } } if (use_heap_) { // If heap is valid, need to put the smallest key to curent_. - PERF_TIMER_START(seek_min_heap_time); + PERF_TIMER_GUARD(seek_min_heap_time); FindSmallest(); - PERF_TIMER_STOP(seek_min_heap_time); } else { // The heap is not valid, then the current_ iterator is the first // one, or null if there is no first child. diff --git a/table/merger_test.cc b/table/merger_test.cc new file mode 100644 index 00000000000..3a10527f447 --- /dev/null +++ b/table/merger_test.cc @@ -0,0 +1,197 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include +#include +#include + +#include "rocksdb/iterator.h" +#include "table/merger.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class VectorIterator : public Iterator { + public: + explicit VectorIterator(const std::vector& keys) + : keys_(keys), current_(keys.size()) { + std::sort(keys_.begin(), keys_.end()); + } + + virtual bool Valid() const { return current_ < keys_.size(); } + + virtual void SeekToFirst() { current_ = 0; } + virtual void SeekToLast() { current_ = keys_.size() - 1; } + + virtual void Seek(const Slice& target) { + current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) - + keys_.begin(); + } + + virtual void Next() { current_++; } + virtual void Prev() { current_--; } + + virtual Slice key() const { return Slice(keys_[current_]); } + virtual Slice value() const { return Slice(); } + + virtual Status status() const { return Status::OK(); } + + private: + std::vector keys_; + size_t current_; +}; + +class MergerTest { + public: + MergerTest() + : rnd_(3), merging_iterator_(nullptr), single_iterator_(nullptr) {} + ~MergerTest() = default; + std::vector GenerateStrings(int len, int string_len) { + std::vector ret; + for (int i = 0; i < len; ++i) { + ret.push_back(test::RandomHumanReadableString(&rnd_, string_len)); + } + return ret; + } + + void AssertEquivalence() { + auto a = merging_iterator_.get(); + auto b = single_iterator_.get(); + if (!a->Valid()) { + ASSERT_TRUE(!b->Valid()); + } else { + ASSERT_TRUE(b->Valid()); + ASSERT_EQ(b->key().ToString(), a->key().ToString()); + ASSERT_EQ(b->value().ToString(), a->value().ToString()); + } + } + + void SeekToRandom() { Seek(test::RandomHumanReadableString(&rnd_, 5)); } + + void Seek(std::string target) { + merging_iterator_->Seek(target); + single_iterator_->Seek(target); + } + + void SeekToFirst() { + merging_iterator_->SeekToFirst(); + single_iterator_->SeekToFirst(); + } + + void SeekToLast() { + merging_iterator_->SeekToLast(); + 
single_iterator_->SeekToLast(); + } + + void Next(int times) { + for (int i = 0; i < times && merging_iterator_->Valid(); ++i) { + AssertEquivalence(); + merging_iterator_->Next(); + single_iterator_->Next(); + } + AssertEquivalence(); + } + + void Prev(int times) { + for (int i = 0; i < times && merging_iterator_->Valid(); ++i) { + AssertEquivalence(); + merging_iterator_->Prev(); + single_iterator_->Prev(); + } + AssertEquivalence(); + } + + void NextAndPrev(int times) { + for (int i = 0; i < times && merging_iterator_->Valid(); ++i) { + AssertEquivalence(); + if (rnd_.OneIn(2)) { + merging_iterator_->Prev(); + single_iterator_->Prev(); + } else { + merging_iterator_->Next(); + single_iterator_->Next(); + } + } + AssertEquivalence(); + } + + void Generate(size_t num_iterators, size_t strings_per_iterator, + size_t letters_per_string) { + std::vector small_iterators; + for (size_t i = 0; i < num_iterators; ++i) { + auto strings = GenerateStrings(strings_per_iterator, letters_per_string); + small_iterators.push_back(new VectorIterator(strings)); + all_keys_.insert(all_keys_.end(), strings.begin(), strings.end()); + } + + merging_iterator_.reset(NewMergingIterator( + BytewiseComparator(), &small_iterators[0], small_iterators.size())); + single_iterator_.reset(new VectorIterator(all_keys_)); + } + + Random rnd_; + std::unique_ptr merging_iterator_; + std::unique_ptr single_iterator_; + std::vector all_keys_; +}; + +TEST(MergerTest, SeekToRandomNextTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToRandom(); + AssertEquivalence(); + Next(50000); + } +} + +TEST(MergerTest, SeekToRandomNextSmallStringsTest) { + Generate(1000, 50, 2); + for (int i = 0; i < 10; ++i) { + SeekToRandom(); + AssertEquivalence(); + Next(50000); + } +} + +TEST(MergerTest, SeekToRandomPrevTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToRandom(); + AssertEquivalence(); + Prev(50000); + } +} + +TEST(MergerTest, SeekToRandomRandomTest) { + 
Generate(200, 50, 50); + for (int i = 0; i < 3; ++i) { + SeekToRandom(); + AssertEquivalence(); + NextAndPrev(5000); + } +} + +TEST(MergerTest, SeekToFirstTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToFirst(); + AssertEquivalence(); + Next(50000); + } +} + +TEST(MergerTest, SeekToLastTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToLast(); + AssertEquivalence(); + Prev(50000); + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index a95f4c119d3..5aabffcb0a7 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -16,9 +16,7 @@ namespace rocksdb { MetaIndexBuilder::MetaIndexBuilder() - : meta_index_block_( - new BlockBuilder(1 /* restart interval */, BytewiseComparator())) { -} + : meta_index_block_(new BlockBuilder(1 /* restart interval */)) {} void MetaIndexBuilder::Add(const std::string& key, const BlockHandle& handle) { @@ -35,9 +33,7 @@ Slice MetaIndexBuilder::Finish() { } PropertyBlockBuilder::PropertyBlockBuilder() - : properties_block_( - new BlockBuilder(1 /* restart interval */, BytewiseComparator())) { -} + : properties_block_(new BlockBuilder(1 /* restart interval */)) {} void PropertyBlockBuilder::Add(const std::string& name, const std::string& val) { @@ -145,14 +141,15 @@ Status ReadProperties(const Slice &handle_value, RandomAccessFile *file, BlockContents block_contents; ReadOptions read_options; read_options.verify_checksums = false; - Status s = ReadBlockContents(file, footer, read_options, handle, - &block_contents, env, false); + Status s; + s = ReadBlockContents(file, footer, read_options, handle, &block_contents, + env, false); if (!s.ok()) { return s; } - Block properties_block(block_contents); + Block properties_block(std::move(block_contents)); std::unique_ptr iter( properties_block.NewIterator(BytewiseComparator())); @@ -237,7 +234,7 @@ Status 
ReadTableProperties(RandomAccessFile* file, uint64_t file_size, if (!s.ok()) { return s; } - Block metaindex_block(metaindex_contents); + Block metaindex_block(std::move(metaindex_contents)); std::unique_ptr meta_iter( metaindex_block.NewIterator(BytewiseComparator())); @@ -291,7 +288,7 @@ Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size, if (!s.ok()) { return s; } - Block metaindex_block(metaindex_contents); + Block metaindex_block(std::move(metaindex_contents)); std::unique_ptr meta_iter; meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator())); @@ -303,10 +300,11 @@ Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size, uint64_t table_magic_number, Env* env, const std::string& meta_block_name, BlockContents* contents) { + Status status; Footer footer(table_magic_number); - auto s = ReadFooterFromFile(file, file_size, &footer); - if (!s.ok()) { - return s; + status = ReadFooterFromFile(file, file_size, &footer); + if (!status.ok()) { + return status; } // Reading metaindex block @@ -314,30 +312,28 @@ Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size, BlockContents metaindex_contents; ReadOptions read_options; read_options.verify_checksums = false; - s = ReadBlockContents(file, footer, read_options, metaindex_handle, - &metaindex_contents, env, false); - if (!s.ok()) { - return s; + status = ReadBlockContents(file, footer, read_options, metaindex_handle, + &metaindex_contents, env, false); + if (!status.ok()) { + return status; } // Finding metablock - Block metaindex_block(metaindex_contents); + Block metaindex_block(std::move(metaindex_contents)); std::unique_ptr meta_iter; meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator())); BlockHandle block_handle; - s = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); + status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); - if (!s.ok()) { - return s; + if (!status.ok()) { + return status; } // Reading metablock - s = 
ReadBlockContents(file, footer, read_options, block_handle, contents, env, - false); - - return s; + return ReadBlockContents(file, footer, read_options, block_handle, contents, + env, false); } } // namespace rocksdb diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index 4f3b62ad4eb..b5914554bd7 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -20,7 +20,6 @@ #include "table/block_builder.h" #include "table/bloom_block.h" #include "table/plain_table_index.h" -#include "table/filter_block.h" #include "table/format.h" #include "table/meta_blocks.h" #include "util/coding.h" @@ -58,24 +57,24 @@ extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull; extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; PlainTableBuilder::PlainTableBuilder( - const Options& options, WritableFile* file, uint32_t user_key_len, - EncodingType encoding_type, size_t index_sparseness, + const ImmutableCFOptions& ioptions, WritableFile* file, + uint32_t user_key_len, EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, uint32_t num_probes, size_t huge_page_tlb_size, double hash_table_ratio, bool store_index_in_file) - : options_(options), + : ioptions_(ioptions), bloom_block_(num_probes), file_(file), bloom_bits_per_key_(bloom_bits_per_key), huge_page_tlb_size_(huge_page_tlb_size), - encoder_(encoding_type, user_key_len, options.prefix_extractor.get(), + encoder_(encoding_type, user_key_len, ioptions.prefix_extractor, index_sparseness), store_index_in_file_(store_index_in_file), - prefix_extractor_(options.prefix_extractor.get()) { + prefix_extractor_(ioptions.prefix_extractor) { // Build index block and save it in the file if hash_table_ratio > 0 if (store_index_in_file_) { assert(hash_table_ratio > 0 || IsTotalOrderMode()); index_builder_.reset( - new PlainTableIndexBuilder(&arena_, options, index_sparseness, + new PlainTableIndexBuilder(&arena_, ioptions, index_sparseness, 
hash_table_ratio, huge_page_tlb_size_)); assert(bloom_bits_per_key_ > 0); properties_.user_collected_properties @@ -93,10 +92,10 @@ PlainTableBuilder::PlainTableBuilder( // plain encoding. properties_.format_version = (encoding_type == kPlain) ? 0 : 1; - if (options_.prefix_extractor) { + if (ioptions_.prefix_extractor) { properties_.user_collected_properties [PlainTablePropertyNames::kPrefixExtractorName] = - options_.prefix_extractor->Name(); + ioptions_.prefix_extractor->Name(); } std::string val; @@ -105,7 +104,7 @@ PlainTableBuilder::PlainTableBuilder( [PlainTablePropertyNames::kEncodingType] = val; for (auto& collector_factories : - options.table_properties_collector_factories) { + ioptions.table_properties_collector_factories) { table_properties_collectors_.emplace_back( collector_factories->CreateTablePropertiesCollector()); } @@ -124,11 +123,11 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { // Store key hash if (store_index_in_file_) { - if (options_.prefix_extractor.get() == nullptr) { + if (ioptions_.prefix_extractor == nullptr) { keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key)); } else { Slice prefix = - options_.prefix_extractor->Transform(internal_key.user_key); + ioptions_.prefix_extractor->Transform(internal_key.user_key); keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix)); } } @@ -160,7 +159,7 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { // notify property collectors NotifyCollectTableCollectorsOnAdd(key, value, table_properties_collectors_, - options_.info_log.get()); + ioptions_.info_log); } Status PlainTableBuilder::status() const { return status_; } @@ -183,7 +182,8 @@ Status PlainTableBuilder::Finish() { if (store_index_in_file_ && (properties_.num_entries > 0)) { bloom_block_.SetTotalBits( &arena_, properties_.num_entries * bloom_bits_per_key_, - options_.bloom_locality, huge_page_tlb_size_, options_.info_log.get()); + ioptions_.bloom_locality, huge_page_tlb_size_, 
+ ioptions_.info_log); PutVarint32(&properties_.user_collected_properties [PlainTablePropertyNames::kNumBloomBlocks], @@ -224,7 +224,7 @@ Status PlainTableBuilder::Finish() { // -- Add user collected properties NotifyCollectTableCollectorsOnFinish(table_properties_collectors_, - options_.info_log.get(), + ioptions_.info_log, &property_block_builder); // -- Write property block diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h index 2871d887e82..c3af0807279 100644 --- a/table/plain_table_builder.h +++ b/table/plain_table_builder.h @@ -30,7 +30,7 @@ class PlainTableBuilder: public TableBuilder { // caller to close the file after calling Finish(). The output file // will be part of level specified by 'level'. A value of -1 means // that the caller does not know which level the output file will reside. - PlainTableBuilder(const Options& options, WritableFile* file, + PlainTableBuilder(const ImmutableCFOptions& ioptions, WritableFile* file, uint32_t user_key_size, EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, uint32_t num_probes = 6, size_t huge_page_tlb_size = 0, @@ -71,7 +71,7 @@ class PlainTableBuilder: public TableBuilder { private: Arena arena_; - Options options_; + const ImmutableCFOptions& ioptions_; std::vector> table_properties_collectors_; diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index bd9d91d1cd7..de23cc902b0 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -14,27 +14,62 @@ namespace rocksdb { -Status PlainTableFactory::NewTableReader(const Options& options, - const EnvOptions& soptions, +Status PlainTableFactory::NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& icomp, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const { - return PlainTableReader::Open(options, soptions, icomp, std::move(file), + return PlainTableReader::Open(ioptions, env_options, icomp, 
std::move(file), file_size, table, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, huge_page_tlb_size_, full_scan_mode_); } TableBuilder* PlainTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const { - return new PlainTableBuilder(options, file, user_key_len_, encoding_type_, + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType, + const CompressionOptions&) const { + return new PlainTableBuilder(ioptions, file, user_key_len_, encoding_type_, index_sparseness_, bloom_bits_per_key_, 6, huge_page_tlb_size_, hash_table_ratio_, store_index_in_file_); } +std::string PlainTableFactory::GetPrintableTableOptions() const { + std::string ret; + ret.reserve(20000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + snprintf(buffer, kBufferSize, " user_key_len: %u\n", + user_key_len_); + ret.append(buffer); + snprintf(buffer, kBufferSize, " bloom_bits_per_key: %d\n", + bloom_bits_per_key_); + ret.append(buffer); + snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n", + hash_table_ratio_); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_sparseness: %zd\n", + index_sparseness_); + ret.append(buffer); + snprintf(buffer, kBufferSize, " huge_page_tlb_size: %zd\n", + huge_page_tlb_size_); + ret.append(buffer); + snprintf(buffer, kBufferSize, " encoding_type: %d\n", + encoding_type_); + ret.append(buffer); + snprintf(buffer, kBufferSize, " full_scan_mode: %d\n", + full_scan_mode_); + ret.append(buffer); + snprintf(buffer, kBufferSize, " store_index_in_file: %d\n", + store_index_in_file_); + ret.append(buffer); + return ret; +} + extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) { return new PlainTableFactory(options); } diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index ed54c4d103b..e79475221d3 100644 
--- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -14,7 +14,6 @@ namespace rocksdb { -struct Options; struct EnvOptions; using std::unique_ptr; @@ -128,7 +127,7 @@ class TableBuilder; class PlainTableFactory : public TableFactory { public: ~PlainTableFactory() {} - // user_key_size is the length of the user key. If it is set to be + // user_key_len is the length of the user key. If it is set to be // kPlainTableVariableLength, then it means variable length. Otherwise, all // the keys need to have the fix length of this value. bloom_bits_per_key is // number of bits used for bloom filer per key. hash_table_ratio is @@ -154,18 +153,31 @@ class PlainTableFactory : public TableFactory { full_scan_mode_(options.full_scan_mode), store_index_in_file_(options.store_index_in_file) {} const char* Name() const override { return "PlainTable"; } - Status NewTableReader(const Options& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table) const override; - TableBuilder* NewTableBuilder(const Options& options, - const InternalKeyComparator& icomparator, - WritableFile* file, - CompressionType compression_type) const - override; + Status NewTableReader( + const ImmutableCFOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const override; + TableBuilder* NewTableBuilder( + const ImmutableCFOptions& options, + const InternalKeyComparator& icomparator, + WritableFile* file, + const CompressionType, + const CompressionOptions&) const override; + + std::string GetPrintableTableOptions() const override; static const char kValueTypeSeqId0 = 0xFF; + // Sanitizes the specified DB Options. 
+ Status SanitizeDBOptions(const DBOptions* db_opts) const override { + if (db_opts->allow_mmap_reads == false) { + return Status::NotSupported( + "PlainTable with allow_mmap_reads == false is not supported."); + } + return Status::OK(); + } + private: uint32_t user_key_len_; int bloom_bits_per_key_; diff --git a/table/plain_table_index.cc b/table/plain_table_index.cc index efba9b71dac..61f9e335b4d 100644 --- a/table/plain_table_index.cc +++ b/table/plain_table_index.cc @@ -93,7 +93,7 @@ Slice PlainTableIndexBuilder::Finish() { BucketizeIndexes(&hash_to_offsets, &entries_per_bucket); keys_per_prefix_hist_.Add(num_keys_per_prefix_); - Log(options_.info_log, "Number of Keys per prefix Histogram: %s", + Log(ioptions_.info_log, "Number of Keys per prefix Histogram: %s", keys_per_prefix_hist_.ToString().c_str()); // From the temp data structure, populate indexes. @@ -147,11 +147,11 @@ void PlainTableIndexBuilder::BucketizeIndexes( Slice PlainTableIndexBuilder::FillIndexes( const std::vector& hash_to_offsets, const std::vector& entries_per_bucket) { - Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index", + Log(ioptions_.info_log, "Reserving %zu bytes for plain table's sub_index", sub_index_size_); auto total_allocate_size = GetTotalSize(); char* allocated = arena_->AllocateAligned( - total_allocate_size, huge_page_tlb_size_, options_.info_log.get()); + total_allocate_size, huge_page_tlb_size_, ioptions_.info_log); auto temp_ptr = EncodeVarint32(allocated, index_size_); uint32_t* index = @@ -191,7 +191,7 @@ Slice PlainTableIndexBuilder::FillIndexes( } assert(sub_index_offset == sub_index_size_); - Log(options_.info_log, "hash table size: %d, suffix_map length %zu", + Log(ioptions_.info_log, "hash table size: %d, suffix_map length %zu", index_size_, sub_index_size_); return Slice(allocated, GetTotalSize()); } diff --git a/table/plain_table_index.h b/table/plain_table_index.h index f63bbd0d522..0b26ecd0d0e 100644 --- a/table/plain_table_index.h +++ 
b/table/plain_table_index.h @@ -108,11 +108,11 @@ class PlainTableIndex { // #wiki-in-memory-index-format class PlainTableIndexBuilder { public: - PlainTableIndexBuilder(Arena* arena, const Options& options, + PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, uint32_t index_sparseness, double hash_table_ratio, double huge_page_tlb_size) : arena_(arena), - options_(options), + ioptions_(ioptions), record_list_(kRecordsPerGroup), is_first_record_(true), due_index_(false), @@ -120,7 +120,7 @@ class PlainTableIndexBuilder { num_keys_per_prefix_(0), prev_key_prefix_hash_(0), index_sparseness_(index_sparseness), - prefix_extractor_(options.prefix_extractor.get()), + prefix_extractor_(ioptions.prefix_extractor), hash_table_ratio_(hash_table_ratio), huge_page_tlb_size_(huge_page_tlb_size) {} @@ -196,7 +196,7 @@ class PlainTableIndexBuilder { const std::vector& entries_per_bucket); Arena* arena_; - Options options_; + const ImmutableCFOptions ioptions_; HistogramImpl keys_per_prefix_hist_; IndexRecordList record_list_; bool is_first_record_; diff --git a/table/plain_table_key_coding.cc b/table/plain_table_key_coding.cc index eedf58aeaa1..c553752e175 100644 --- a/table/plain_table_key_coding.cc +++ b/table/plain_table_key_coding.cc @@ -30,7 +30,7 @@ const unsigned char kSizeInlineLimit = 0x3F; size_t EncodeSize(EntryType type, uint32_t key_size, char* out_buffer) { out_buffer[0] = type << 6; - if (key_size < 0x3F) { + if (key_size < static_cast(kSizeInlineLimit)) { // size inlined out_buffer[0] |= static_cast(key_size); return 1; @@ -97,9 +97,9 @@ Status PlainTableKeyEncoder::AppendKey(const Slice& key, WritableFile* file, Slice prefix = prefix_extractor_->Transform(Slice(key.data(), user_key_size)); - if (key_count_for_prefix == 0 || prefix != pre_prefix_.GetKey() || - key_count_for_prefix % index_sparseness_ == 0) { - key_count_for_prefix = 1; + if (key_count_for_prefix_ == 0 || prefix != pre_prefix_.GetKey() || + key_count_for_prefix_ % 
index_sparseness_ == 0) { + key_count_for_prefix_ = 1; pre_prefix_.SetKey(prefix); size_bytes_pos += EncodeSize(kFullKey, user_key_size, size_bytes); Status s = file->Append(Slice(size_bytes, size_bytes_pos)); @@ -108,8 +108,8 @@ Status PlainTableKeyEncoder::AppendKey(const Slice& key, WritableFile* file, } *offset += size_bytes_pos; } else { - key_count_for_prefix++; - if (key_count_for_prefix == 2) { + key_count_for_prefix_++; + if (key_count_for_prefix_ == 2) { // For second key within a prefix, need to encode prefix length size_bytes_pos += EncodeSize(kPrefixFromPreviousKey, pre_prefix_.GetKey().size(), diff --git a/table/plain_table_key_coding.h b/table/plain_table_key_coding.h index ba66c26452a..9047087aed0 100644 --- a/table/plain_table_key_coding.h +++ b/table/plain_table_key_coding.h @@ -26,7 +26,7 @@ class PlainTableKeyEncoder { fixed_user_key_len_(user_key_len), prefix_extractor_(prefix_extractor), index_sparseness_((index_sparseness > 1) ? index_sparseness : 1), - key_count_for_prefix(0) {} + key_count_for_prefix_(0) {} // key: the key to write out, in the format of internal key. // file: the output file to write out // offset: offset in the file. 
Needs to be updated after appending bytes @@ -45,7 +45,7 @@ class PlainTableKeyEncoder { uint32_t fixed_user_key_len_; const SliceTransform* prefix_extractor_; const size_t index_sparseness_; - size_t key_count_for_prefix; + size_t key_count_for_prefix_; IterKey pre_prefix_; }; diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index 8728eb1d39f..3a6d48be82e 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -87,7 +87,7 @@ class PlainTableIterator : public Iterator { }; extern const uint64_t kPlainTableMagicNumber; -PlainTableReader::PlainTableReader(const Options& options, +PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions, unique_ptr&& file, const EnvOptions& storage_options, const InternalKeyComparator& icomparator, @@ -99,10 +99,10 @@ PlainTableReader::PlainTableReader(const Options& options, full_scan_mode_(false), data_end_offset_(table_properties->data_size), user_key_len_(table_properties->fixed_key_len), - prefix_extractor_(options.prefix_extractor.get()), + prefix_extractor_(ioptions.prefix_extractor), enable_bloom_(false), bloom_(6, nullptr), - options_(options), + ioptions_(ioptions), file_(std::move(file)), file_size_(file_size), table_properties_(nullptr) {} @@ -110,8 +110,8 @@ PlainTableReader::PlainTableReader(const Options& options, PlainTableReader::~PlainTableReader() { } -Status PlainTableReader::Open(const Options& options, - const EnvOptions& soptions, +Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, @@ -119,14 +119,14 @@ Status PlainTableReader::Open(const Options& options, const int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, bool full_scan_mode) { - assert(options.allow_mmap_reads); + assert(ioptions.allow_mmap_reads); if (file_size > PlainTableIndex::kMaxFileSize) { return 
Status::NotSupported("File is too large for PlainTableReader!"); } TableProperties* props = nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - options.env, options.info_log.get(), &props); + ioptions.env, ioptions.info_log, &props); if (!s.ok()) { return s; } @@ -137,12 +137,12 @@ Status PlainTableReader::Open(const Options& options, user_props.find(PlainTablePropertyNames::kPrefixExtractorName); if (!full_scan_mode && prefix_extractor_in_file != user_props.end()) { - if (!options.prefix_extractor) { + if (!ioptions.prefix_extractor) { return Status::InvalidArgument( "Prefix extractor is missing when opening a PlainTable built " "using a prefix extractor"); } else if (prefix_extractor_in_file->second.compare( - options.prefix_extractor->Name()) != 0) { + ioptions.prefix_extractor->Name()) != 0) { return Status::InvalidArgument( "Prefix extractor given doesn't match the one used to build " "PlainTable"); @@ -158,8 +158,8 @@ Status PlainTableReader::Open(const Options& options, } std::unique_ptr new_reader(new PlainTableReader( - options, std::move(file), soptions, internal_comparator, encoding_type, - file_size, props)); + ioptions, std::move(file), env_options, internal_comparator, + encoding_type, file_size, props)); s = new_reader->MmapDataFile(); if (!s.ok()) { @@ -187,6 +187,10 @@ void PlainTableReader::SetupForCompaction() { Iterator* PlainTableReader::NewIterator(const ReadOptions& options, Arena* arena) { + if (options.total_order_seek && !IsTotalOrderMode()) { + return NewErrorIterator( + Status::InvalidArgument("total_order_seek not supported"), arena); + } if (arena == nullptr) { return new PlainTableIterator(this, prefix_extractor_ != nullptr); } else { @@ -203,7 +207,7 @@ Status PlainTableReader::PopulateIndexRecordList( bool is_first_record = true; Slice key_prefix_slice; PlainTableKeyDecoder decoder(encoding_type_, user_key_len_, - options_.prefix_extractor.get()); + ioptions_.prefix_extractor); while (pos < 
data_end_offset_) { uint32_t key_offset = pos; ParsedInternalKey key; @@ -248,8 +252,8 @@ void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key, uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key; if (bloom_total_bits > 0) { enable_bloom_ = true; - bloom_.SetTotalBits(&arena_, bloom_total_bits, options_.bloom_locality, - huge_page_tlb_size, options_.info_log.get()); + bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, + huge_page_tlb_size, ioptions_.info_log); FillBloom(prefix_hashes); } } @@ -277,14 +281,14 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, BlockContents bloom_block_contents; auto s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber, - options_.env, BloomBlockBuilder::kBloomBlock, + ioptions_.env, BloomBlockBuilder::kBloomBlock, &bloom_block_contents); bool index_in_file = s.ok(); BlockContents index_block_contents; s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber, - options_.env, PlainTableIndexBuilder::kPlainTableIndexBlock, - &index_block_contents); + ioptions_.env, PlainTableIndexBuilder::kPlainTableIndexBlock, + &index_block_contents); index_in_file &= s.ok(); @@ -306,8 +310,9 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, index_block = nullptr; } - if ((options_.prefix_extractor.get() == nullptr) && (hash_table_ratio != 0)) { - // options.prefix_extractor is requried for a hash-based look-up. + if ((ioptions_.prefix_extractor == nullptr) && + (hash_table_ratio != 0)) { + // ioptions.prefix_extractor is requried for a hash-based look-up. 
return Status::NotSupported( "PlainTable requires a prefix extractor enable prefix hash mode."); } @@ -324,8 +329,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, table_properties_->num_entries * bloom_bits_per_key; if (num_bloom_bits > 0) { enable_bloom_ = true; - bloom_.SetTotalBits(&arena_, num_bloom_bits, options_.bloom_locality, - huge_page_tlb_size, options_.info_log.get()); + bloom_.SetTotalBits(&arena_, num_bloom_bits, ioptions_.bloom_locality, + huge_page_tlb_size, ioptions_.info_log); } } } else { @@ -347,7 +352,7 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, bloom_block->size() * 8, num_blocks); } - PlainTableIndexBuilder index_builder(&arena_, options_, index_sparseness, + PlainTableIndexBuilder index_builder(&arena_, ioptions_, index_sparseness, hash_table_ratio, huge_page_tlb_size); std::vector prefix_hashes; @@ -418,7 +423,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, uint32_t file_offset = GetFixed32Element(base_ptr, mid); size_t tmp; Status s = PlainTableKeyDecoder(encoding_type_, user_key_len_, - options_.prefix_extractor.get()) + ioptions_.prefix_extractor) .NextKey(file_data_.data() + file_offset, file_data_.data() + data_end_offset_, &mid_key, nullptr, &tmp); @@ -447,7 +452,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, size_t tmp; uint32_t low_key_offset = GetFixed32Element(base_ptr, low); Status s = PlainTableKeyDecoder(encoding_type_, user_key_len_, - options_.prefix_extractor.get()) + ioptions_.prefix_extractor) .NextKey(file_data_.data() + low_key_offset, file_data_.data() + data_end_offset_, &low_key, nullptr, &tmp); @@ -561,7 +566,7 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, } Slice found_value; PlainTableKeyDecoder decoder(encoding_type_, user_key_len_, - options_.prefix_extractor.get()); + ioptions_.prefix_extractor); while (offset < data_end_offset_) { Status s = Next(&decoder, &offset, 
&found_key, nullptr, &found_value); if (!s.ok()) { diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index 4a626979a83..fcc94a53e15 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -52,7 +52,8 @@ extern const uint32_t kPlainTableVariableLength; // The implementation of IndexedTableReader requires output file is mmaped class PlainTableReader: public TableReader { public: - static Status Open(const Options& options, const EnvOptions& soptions, + static Status Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table, @@ -82,8 +83,9 @@ class PlainTableReader: public TableReader { return arena_.MemoryAllocatedBytes(); } - PlainTableReader(const Options& options, unique_ptr&& file, - const EnvOptions& storage_options, + PlainTableReader(const ImmutableCFOptions& ioptions, + unique_ptr&& file, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, EncodingType encoding_type, uint64_t file_size, const TableProperties* table_properties); @@ -132,7 +134,7 @@ class PlainTableReader: public TableReader { DynamicBloom bloom_; Arena arena_; - const Options& options_; + const ImmutableCFOptions& ioptions_; unique_ptr file_; uint32_t file_size_; std::shared_ptr table_properties_; diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index effa90a0b4c..aa791f4c4f8 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -88,10 +88,12 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, TableBuilder* tb = nullptr; DB* db = nullptr; Status s; + const ImmutableCFOptions ioptions(opts); if (!through_db) { env->NewWritableFile(file_name, &file, env_options); - tb = opts.table_factory->NewTableBuilder(opts, ikc, file.get(), - CompressionType::kNoCompression); + tb = opts.table_factory->NewTableBuilder(ioptions, ikc, file.get(), + 
CompressionType::kNoCompression, + CompressionOptions()); } else { s = DB::Open(opts, dbname, &db); ASSERT_OK(s); @@ -122,7 +124,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, uint64_t file_size; env->GetFileSize(file_name, &file_size); s = opts.table_factory->NewTableReader( - opts, env_options, ikc, std::move(raf), file_size, &table_reader); + ioptions, env_options, ikc, std::move(raf), file_size, &table_reader); } Random rnd(301); @@ -232,7 +234,9 @@ DEFINE_bool(iterator, false, "For test iterator"); DEFINE_bool(through_db, false, "If enable, a DB instance will be created and " "the query will be against DB. Otherwise, will be directly against " "a table reader."); -DEFINE_bool(plain_table, false, "Use PlainTable"); +DEFINE_string(table_factory, "block_based", + "Table factory to use: `block_based` (default), `plain_table` or " + "`cuckoo_hash`."); DEFINE_string(time_unit, "microsecond", "The time unit used for measuring performance. User can specify " "`microsecond` (default) or `nanosecond`"); @@ -242,7 +246,7 @@ int main(int argc, char** argv) { " [OPTIONS]..."); ParseCommandLineFlags(&argc, &argv, true); - rocksdb::TableFactory* tf = new rocksdb::BlockBasedTableFactory(); + std::shared_ptr tf; rocksdb::Options options; if (FLAGS_prefix_len < 16) { options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform( @@ -253,7 +257,13 @@ int main(int argc, char** argv) { options.create_if_missing = true; options.compression = rocksdb::CompressionType::kNoCompression; - if (FLAGS_plain_table) { + if (FLAGS_table_factory == "cuckoo_hash") { + options.allow_mmap_reads = true; + env_options.use_mmap_reads = true; + rocksdb::CuckooTableOptions table_options; + table_options.hash_table_ratio = 0.75; + tf.reset(rocksdb::NewCuckooTableFactory(table_options)); + } else if (FLAGS_table_factory == "plain_table") { options.allow_mmap_reads = true; env_options.use_mmap_reads = true; @@ -262,22 +272,28 @@ int main(int argc, char** argv) { 
plain_table_options.bloom_bits_per_key = (FLAGS_prefix_len == 16) ? 0 : 8; plain_table_options.hash_table_ratio = 0.75; - tf = new rocksdb::PlainTableFactory(plain_table_options); + tf.reset(new rocksdb::PlainTableFactory(plain_table_options)); options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform( FLAGS_prefix_len)); + } else if (FLAGS_table_factory == "block_based") { + tf.reset(new rocksdb::BlockBasedTableFactory()); + } else { + fprintf(stderr, "Invalid table type %s\n", FLAGS_table_factory.c_str()); + } + + if (tf) { + // if user provides invalid options, just fall back to microsecond. + bool measured_by_nanosecond = FLAGS_time_unit == "nanosecond"; + + options.table_factory = tf; + rocksdb::TableReaderBenchmark(options, env_options, ro, FLAGS_num_keys1, + FLAGS_num_keys2, FLAGS_iter, FLAGS_prefix_len, + FLAGS_query_empty, FLAGS_iterator, + FLAGS_through_db, measured_by_nanosecond); } else { - tf = new rocksdb::BlockBasedTableFactory(); + return 1; } - // if user provides invalid options, just fall back to microsecond. - bool measured_by_nanosecond = FLAGS_time_unit == "nanosecond"; - options.table_factory = - std::shared_ptr(tf); - rocksdb::TableReaderBenchmark(options, env_options, ro, FLAGS_num_keys1, - FLAGS_num_keys2, FLAGS_iter, FLAGS_prefix_len, - FLAGS_query_empty, FLAGS_iterator, - FLAGS_through_db, measured_by_nanosecond); - delete tf; return 0; } diff --git a/table/table_test.cc b/table/table_test.cc index 335b33cc909..776490871b2 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -42,6 +42,7 @@ #include "util/statistics.h" #include "util/testharness.h" #include "util/testutil.h" +#include "util/scoped_arena_iterator.h" namespace rocksdb { @@ -194,6 +195,8 @@ class Constructor { // been added so far. 
Returns the keys in sorted order in "*keys" // and stores the key/value pairs in "*kvmap" void Finish(const Options& options, + const ImmutableCFOptions& ioptions, + const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, std::vector* keys, KVMap* kvmap) { last_internal_key_ = &internal_comparator; @@ -205,12 +208,15 @@ class Constructor { keys->push_back(it->first); } data_.clear(); - Status s = FinishImpl(options, internal_comparator, *kvmap); + Status s = FinishImpl(options, ioptions, table_options, + internal_comparator, *kvmap); ASSERT_TRUE(s.ok()) << s.ToString(); } // Construct the data structure from the data in "data" virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, + const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) = 0; @@ -218,8 +224,12 @@ class Constructor { virtual const KVMap& data() { return data_; } + virtual bool IsArenaMode() const { return false; } + virtual DB* db() const { return nullptr; } // Overridden in DBConstructor + virtual bool AnywayDeleteIterator() const { return false; } + protected: const InternalKeyComparator* last_internal_key_; @@ -237,11 +247,13 @@ class BlockConstructor: public Constructor { delete block_; } virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, + const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) { delete block_; block_ = nullptr; - BlockBuilder builder(options, &internal_comparator); + BlockBuilder builder(table_options.block_restart_interval); for (KVMap::const_iterator it = data.begin(); it != data.end(); @@ -253,8 +265,7 @@ class BlockConstructor: public Constructor { BlockContents contents; contents.data = data_; contents.cachable = false; - contents.heap_allocated = false; - block_ = new Block(contents); + block_ = new Block(std::move(contents)); return Status::OK(); } 
virtual Iterator* NewIterator() const { @@ -272,8 +283,15 @@ class BlockConstructor: public Constructor { // A helper class that converts internal format keys into user keys class KeyConvertingIterator: public Iterator { public: - explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } - virtual ~KeyConvertingIterator() { delete iter_; } + KeyConvertingIterator(Iterator* iter, bool arena_mode = false) + : iter_(iter), arena_mode_(arena_mode) {} + virtual ~KeyConvertingIterator() { + if (arena_mode_) { + iter_->~Iterator(); + } else { + delete iter_; + } + } virtual bool Valid() const { return iter_->Valid(); } virtual void Seek(const Slice& target) { ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); @@ -304,6 +322,7 @@ class KeyConvertingIterator: public Iterator { private: mutable Status status_; Iterator* iter_; + bool arena_mode_; // No copying allowed KeyConvertingIterator(const KeyConvertingIterator&); @@ -319,13 +338,16 @@ class TableConstructor: public Constructor { ~TableConstructor() { Reset(); } virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, + const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) { Reset(); sink_.reset(new StringSink()); unique_ptr builder; - builder.reset(options.table_factory->NewTableBuilder( - options, internal_comparator, sink_.get(), options.compression)); + builder.reset(ioptions.table_factory->NewTableBuilder( + ioptions, internal_comparator, sink_.get(), options.compression, + CompressionOptions())); for (KVMap::const_iterator it = data.begin(); it != data.end(); @@ -348,9 +370,9 @@ class TableConstructor: public Constructor { // Open the table uniq_id_ = cur_uniq_id_++; source_.reset(new StringSource(sink_->contents(), uniq_id_, - options.allow_mmap_reads)); - return options.table_factory->NewTableReader( - options, soptions, internal_comparator, std::move(source_), + ioptions.allow_mmap_reads)); + return 
ioptions.table_factory->NewTableReader( + ioptions, soptions, internal_comparator, std::move(source_), sink_->contents().size(), &table_reader_); } @@ -368,19 +390,23 @@ class TableConstructor: public Constructor { return table_reader_->ApproximateOffsetOf(key); } - virtual Status Reopen(const Options& options) { + virtual Status Reopen(const ImmutableCFOptions& ioptions) { source_.reset( new StringSource(sink_->contents(), uniq_id_, - options.allow_mmap_reads)); - return options.table_factory->NewTableReader( - options, soptions, *last_internal_key_, std::move(source_), + ioptions.allow_mmap_reads)); + return ioptions.table_factory->NewTableReader( + ioptions, soptions, *last_internal_key_, std::move(source_), sink_->contents().size(), &table_reader_); } - virtual TableReader* table_reader() { + virtual TableReader* GetTableReader() { return table_reader_.get(); } + virtual bool AnywayDeleteIterator() const override { + return convert_to_internal_key_; + } + private: void Reset() { uniq_id_ = 0; @@ -388,12 +414,12 @@ class TableConstructor: public Constructor { sink_.reset(); source_.reset(); } - bool convert_to_internal_key_; uint64_t uniq_id_; unique_ptr sink_; unique_ptr source_; unique_ptr table_reader_; + bool convert_to_internal_key_; TableConstructor(); @@ -410,19 +436,23 @@ class MemTableConstructor: public Constructor { table_factory_(new SkipListFactory) { Options options; options.memtable_factory = table_factory_; - memtable_ = new MemTable(internal_comparator_, options); + memtable_ = new MemTable(internal_comparator_, ImmutableCFOptions(options), + MemTableOptions(MutableCFOptions(options), options)); memtable_->Ref(); } ~MemTableConstructor() { delete memtable_->Unref(); } - virtual Status FinishImpl(const Options& options, + virtual Status FinishImpl(const Options&, + const ImmutableCFOptions& ioptions, + const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) { delete memtable_->Unref(); - 
Options memtable_options; - memtable_options.memtable_factory = table_factory_; - memtable_ = new MemTable(internal_comparator_, memtable_options); + Options options; + options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, ImmutableCFOptions(options), + MemTableOptions(MutableCFOptions(options), options)); memtable_->Ref(); int seq = 1; for (KVMap::const_iterator it = data.begin(); @@ -434,10 +464,16 @@ class MemTableConstructor: public Constructor { return Status::OK(); } virtual Iterator* NewIterator() const { - return new KeyConvertingIterator(memtable_->NewIterator(ReadOptions())); + return new KeyConvertingIterator( + memtable_->NewIterator(ReadOptions(), &arena_), true); } + virtual bool AnywayDeleteIterator() const override { return true; } + + virtual bool IsArenaMode() const override { return true; } + private: + mutable Arena arena_; InternalKeyComparator internal_comparator_; MemTable* memtable_; std::shared_ptr table_factory_; @@ -455,6 +491,8 @@ class DBConstructor: public Constructor { delete db_; } virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, + const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) { delete db_; @@ -664,18 +702,15 @@ class FixedOrLessPrefixTransform : public SliceTransform { class Harness { public: - Harness() : constructor_(nullptr) { } + Harness() : ioptions_(options_), constructor_(nullptr) {} void Init(const TestArgs& args) { delete constructor_; constructor_ = nullptr; options_ = Options(); - - options_.block_restart_interval = args.restart_interval; options_.compression = args.compression; // Use shorter block size for tests to exercise block boundary // conditions more. 
- options_.block_size = 256; if (args.reverse_compare) { options_.comparator = &reverse_key_comparator; } @@ -685,12 +720,14 @@ class Harness { support_prev_ = true; only_support_prefix_seek_ = false; - BlockBasedTableOptions table_options; switch (args.type) { case BLOCK_BASED_TABLE_TEST: - table_options.flush_block_policy_factory.reset( + table_options_.flush_block_policy_factory.reset( new FlushBlockBySizePolicyFactory()); - options_.table_factory.reset(new BlockBasedTableFactory(table_options)); + table_options_.block_size = 256; + table_options_.block_restart_interval = args.restart_interval; + options_.table_factory.reset( + new BlockBasedTableFactory(table_options_)); constructor_ = new TableConstructor(options_.comparator); break; case PLAIN_TABLE_SEMI_FIXED_PREFIX: @@ -733,15 +770,25 @@ class Harness { new InternalKeyComparator(options_.comparator)); break; case BLOCK_TEST: + table_options_.block_size = 256; + options_.table_factory.reset( + new BlockBasedTableFactory(table_options_)); constructor_ = new BlockConstructor(options_.comparator); break; case MEMTABLE_TEST: + table_options_.block_size = 256; + options_.table_factory.reset( + new BlockBasedTableFactory(table_options_)); constructor_ = new MemTableConstructor(options_.comparator); break; case DB_TEST: + table_options_.block_size = 256; + options_.table_factory.reset( + new BlockBasedTableFactory(table_options_)); constructor_ = new DBConstructor(options_.comparator); break; } + ioptions_ = ImmutableCFOptions(options_); } ~Harness() { @@ -755,7 +802,8 @@ class Harness { void Test(Random* rnd) { std::vector keys; KVMap data; - constructor_->Finish(options_, *internal_comparator_, &keys, &data); + constructor_->Finish(options_, ioptions_, table_options_, + *internal_comparator_, &keys, &data); TestForwardScan(keys, data); if (support_prev_) { @@ -776,7 +824,11 @@ class Harness { iter->Next(); } ASSERT_TRUE(!iter->Valid()); - delete iter; + if (constructor_->IsArenaMode() && 
!constructor_->AnywayDeleteIterator()) { + iter->~Iterator(); + } else { + delete iter; + } } void TestBackwardScan(const std::vector& keys, @@ -791,7 +843,11 @@ class Harness { iter->Prev(); } ASSERT_TRUE(!iter->Valid()); - delete iter; + if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { + iter->~Iterator(); + } else { + delete iter; + } } void TestRandomAccess(Random* rnd, @@ -861,7 +917,11 @@ class Harness { } } } - delete iter; + if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { + iter->~Iterator(); + } else { + delete iter; + } } std::string ToString(const KVMap& data, const KVMap::const_iterator& it) { @@ -924,6 +984,8 @@ class Harness { private: Options options_ = Options(); + ImmutableCFOptions ioptions_; + BlockBasedTableOptions table_options_ = BlockBasedTableOptions(); Constructor* constructor_; bool support_prev_; bool only_support_prefix_seek_; @@ -1018,12 +1080,15 @@ TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) { KVMap kvmap; Options options; options.compression = kNoCompression; - options.block_restart_interval = 1; + BlockBasedTableOptions table_options; + table_options.block_restart_interval = 1; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, - &kvmap); + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); - auto& props = *c.table_reader()->GetTableProperties(); + auto& props = *c.GetTableReader()->GetTableProperties(); ASSERT_EQ(kvmap.size(), props.num_entries); auto raw_key_size = kvmap.size() * 2ul; @@ -1035,7 +1100,7 @@ TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) { ASSERT_EQ("", props.filter_policy_name); // no filter policy is used // Verify data size. 
- BlockBuilder block_builder(options, options.comparator); + BlockBuilder block_builder(1); for (const auto& item : kvmap) { block_builder.Add(item.first, item.second); } @@ -1044,20 +1109,106 @@ TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) { } TEST(BlockBasedTableTest, FilterPolicyNameProperties) { - TableConstructor c(BytewiseComparator()); + TableConstructor c(BytewiseComparator(), true); c.Add("a1", "val1"); std::vector keys; KVMap kvmap; + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); Options options; - std::unique_ptr filter_policy(NewBloomFilterPolicy(10)); - options.filter_policy = filter_policy.get(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, - &kvmap); - auto& props = *c.table_reader()->GetTableProperties(); + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + auto& props = *c.GetTableReader()->GetTableProperties(); ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name); } +TEST(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { + BlockBasedTableOptions table_options; + for (int i = 0; i < 4; ++i) { + Options options; + // Make each key/value an individual block + table_options.block_size = 64; + switch (i) { + case 0: + // Binary search index + table_options.index_type = BlockBasedTableOptions::kBinarySearch; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + break; + case 1: + // Hash search index + table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(4)); + break; + case 2: + // Hash search index with hash_index_allow_collision + table_options.index_type = BlockBasedTableOptions::kHashSearch; + 
table_options.hash_index_allow_collision = true; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(4)); + break; + case 3: + default: + // Hash search index with filter policy + table_options.index_type = BlockBasedTableOptions::kHashSearch; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(4)); + break; + } + + TableConstructor c(BytewiseComparator(), true); + c.Add("aaaa1", std::string('a', 56)); + c.Add("bbaa1", std::string('a', 56)); + c.Add("cccc1", std::string('a', 56)); + c.Add("bbbb1", std::string('a', 56)); + c.Add("baaa1", std::string('a', 56)); + c.Add("abbb1", std::string('a', 56)); + c.Add("cccc2", std::string('a', 56)); + std::vector keys; + KVMap kvmap; + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + auto props = c.GetTableReader()->GetTableProperties(); + ASSERT_EQ(7u, props->num_data_blocks); + auto* reader = c.GetTableReader(); + ReadOptions ro; + ro.total_order_seek = true; + std::unique_ptr iter(reader->NewIterator(ro)); + + iter->Seek(InternalKey("b", 0, kTypeValue).Encode()); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("baaa1", ExtractUserKey(iter->key()).ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bbaa1", ExtractUserKey(iter->key()).ToString()); + + iter->Seek(InternalKey("bb", 0, kTypeValue).Encode()); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bbaa1", ExtractUserKey(iter->key()).ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bbbb1", ExtractUserKey(iter->key()).ToString()); + + iter->Seek(InternalKey("bbb", 0, kTypeValue).Encode()); + 
ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bbbb1", ExtractUserKey(iter->key()).ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("cccc1", ExtractUserKey(iter->key()).ToString()); + } +} + static std::string RandomString(Random* rnd, int len) { std::string r; test::RandomString(rnd, len, &r); @@ -1094,21 +1245,21 @@ TEST(TableTest, HashIndexTest) { std::vector keys; KVMap kvmap; Options options; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); BlockBasedTableOptions table_options; table_options.index_type = BlockBasedTableOptions::kHashSearch; table_options.hash_index_allow_collision = true; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); - - options.prefix_extractor.reset(NewFixedPrefixTransform(3)); - options.block_cache = NewLRUCache(1024); - options.block_size = 1700; + table_options.block_size = 1700; + table_options.block_cache = NewLRUCache(1024); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - c.Finish(options, *comparator, &keys, &kvmap); - auto reader = c.table_reader(); + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, *comparator, &keys, &kvmap); + auto reader = c.GetTableReader(); - auto props = c.table_reader()->GetTableProperties(); + auto props = reader->GetTableProperties(); ASSERT_EQ(5u, props->num_data_blocks); std::unique_ptr hash_iter(reader->NewIterator(ReadOptions())); @@ -1209,11 +1360,14 @@ TEST(BlockBasedTableTest, IndexSizeStat) { KVMap kvmap; Options options; options.compression = kNoCompression; - options.block_restart_interval = 1; + BlockBasedTableOptions table_options; + table_options.block_restart_interval = 1; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - c.Finish(options, GetPlainInternalComparator(options.comparator), &ks, - &kvmap); - auto 
index_size = c.table_reader()->GetTableProperties()->index_size; + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, + GetPlainInternalComparator(options.comparator), &ks, &kvmap); + auto index_size = c.GetTableReader()->GetTableProperties()->index_size; ASSERT_GT(index_size, last_index_size); last_index_size = index_size; } @@ -1224,8 +1378,10 @@ TEST(BlockBasedTableTest, NumBlockStat) { TableConstructor c(BytewiseComparator()); Options options; options.compression = kNoCompression; - options.block_restart_interval = 1; - options.block_size = 1000; + BlockBasedTableOptions table_options; + table_options.block_restart_interval = 1; + table_options.block_size = 1000; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); for (int i = 0; i < 10; ++i) { // the key/val are slightly smaller than block size, so that each block @@ -1235,10 +1391,11 @@ TEST(BlockBasedTableTest, NumBlockStat) { std::vector ks; KVMap kvmap; - c.Finish(options, GetPlainInternalComparator(options.comparator), &ks, - &kvmap); + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, + GetPlainInternalComparator(options.comparator), &ks, &kvmap); ASSERT_EQ(kvmap.size(), - c.table_reader()->GetTableProperties()->num_data_blocks); + c.GetTableReader()->GetTableProperties()->num_data_blocks); } // A simple tool that takes the snapshot of block cache statistics. 
@@ -1300,23 +1457,23 @@ TEST(BlockBasedTableTest, BlockCacheDisabledTest) { Options options; options.create_if_missing = true; options.statistics = CreateDBStatistics(); - options.block_cache = NewLRUCache(1024); - std::unique_ptr filter_policy(NewBloomFilterPolicy(10)); - options.filter_policy = filter_policy.get(); BlockBasedTableOptions table_options; // Intentionally commented out: table_options.cache_index_and_filter_blocks = // true; + table_options.block_cache = NewLRUCache(1024); + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); std::vector keys; KVMap kvmap; - TableConstructor c(BytewiseComparator()); + TableConstructor c(BytewiseComparator(), true); c.Add("key", "value"); - c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, - &kvmap); + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is enabled. 
- auto reader = dynamic_cast(c.table_reader()); + auto reader = dynamic_cast(c.GetTableReader()); ASSERT_TRUE(reader->TEST_filter_block_preloaded()); ASSERT_TRUE(reader->TEST_index_reader_preloaded()); @@ -1343,10 +1500,10 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { Options options; options.create_if_missing = true; options.statistics = CreateDBStatistics(); - options.block_cache = NewLRUCache(1024); // Enable the cache for index/filter blocks BlockBasedTableOptions table_options; + table_options.block_cache = NewLRUCache(1024); table_options.cache_index_and_filter_blocks = true; options.table_factory.reset(new BlockBasedTableFactory(table_options)); std::vector keys; @@ -1354,10 +1511,11 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { TableConstructor c(BytewiseComparator()); c.Add("key", "value"); - c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, - &kvmap); + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is prohibited. 
- auto reader = dynamic_cast(c.table_reader()); + auto reader = dynamic_cast(c.GetTableReader()); ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); ASSERT_TRUE(!reader->TEST_index_reader_preloaded()); @@ -1404,9 +1562,13 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { iter.reset(); // -- PART 2: Open without block cache - options.block_cache.reset(); + table_options.no_block_cache = true; + table_options.block_cache.reset(); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); options.statistics = CreateDBStatistics(); // reset the stats - c.Reopen(options); + const ImmutableCFOptions ioptions1(options); + c.Reopen(ioptions1); + table_options.no_block_cache = false; { iter.reset(c.NewIterator()); @@ -1420,8 +1582,10 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { // -- PART 3: Open with very small block cache // In this test, no block will ever get hit since the block cache is // too small to fit even one entry. - options.block_cache = NewLRUCache(1); - c.Reopen(options); + table_options.block_cache = NewLRUCache(1); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + const ImmutableCFOptions ioptions2(options); + c.Reopen(ioptions2); { BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertEqual(1, // index block miss @@ -1458,11 +1622,12 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { Options opt; unique_ptr ikc; ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); - opt.block_size = 1024; opt.compression = kNoCompression; - opt.block_cache = - NewLRUCache(16 * 1024 * 1024); // big enough so we don't ever - // lose cached values. + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + // big enough so we don't ever lose cached values. 
+ table_options.block_cache = NewLRUCache(16 * 1024 * 1024); + opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); TableConstructor c(BytewiseComparator()); c.Add("k01", "hello"); @@ -1474,7 +1639,8 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { c.Add("k07", std::string(100000, 'x')); std::vector keys; KVMap kvmap; - c.Finish(opt, *ikc, &keys, &kvmap); + const ImmutableCFOptions ioptions(opt); + c.Finish(opt, ioptions, table_options, *ikc, &keys, &kvmap); unique_ptr iter(c.NewIterator()); iter->SeekToFirst(); @@ -1485,16 +1651,19 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { } ASSERT_OK(iter->status()); - ASSERT_OK(c.Reopen(opt)); - auto table_reader = dynamic_cast(c.table_reader()); + const ImmutableCFOptions ioptions1(opt); + ASSERT_OK(c.Reopen(ioptions1)); + auto table_reader = dynamic_cast(c.GetTableReader()); for (const std::string& key : keys) { ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); } // rerun with different block cache - opt.block_cache = NewLRUCache(16 * 1024 * 1024); - ASSERT_OK(c.Reopen(opt)); - table_reader = dynamic_cast(c.table_reader()); + table_options.block_cache = NewLRUCache(16 * 1024 * 1024); + opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); + const ImmutableCFOptions ioptions2(opt); + ASSERT_OK(c.Reopen(ioptions2)); + table_reader = dynamic_cast(c.GetTableReader()); for (const std::string& key : keys) { ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key)); } @@ -1509,9 +1678,11 @@ TEST(PlainTableTest, BasicPlainTableProperties) { PlainTableFactory factory(plain_table_options); StringSink sink; Options options; + const ImmutableCFOptions ioptions(options); InternalKeyComparator ikc(options.comparator); std::unique_ptr builder( - factory.NewTableBuilder(options, ikc, &sink, kNoCompression)); + factory.NewTableBuilder(ioptions, ikc, &sink, kNoCompression, + CompressionOptions())); for (char c = 'a'; c <= 'z'; ++c) { std::string key(8, c); @@ -1551,9 +1722,12 @@ 
TEST(GeneralTableTest, ApproximateOffsetOfPlain) { KVMap kvmap; Options options; test::PlainInternalKeyComparator internal_comparator(options.comparator); - options.block_size = 1024; options.compression = kNoCompression; - c.Finish(options, internal_comparator, &keys, &kvmap); + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, internal_comparator, + &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); @@ -1580,9 +1754,11 @@ static void DoCompressionTest(CompressionType comp) { KVMap kvmap; Options options; test::PlainInternalKeyComparator ikc(options.comparator); - options.block_size = 1024; options.compression = comp; - c.Finish(options, ikc, &keys, &kvmap); + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, ikc, &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); @@ -1684,7 +1860,8 @@ TEST(MemTableTest, Simple) { auto table_factory = std::make_shared(); Options options; options.memtable_factory = table_factory; - MemTable* memtable = new MemTable(cmp, options); + MemTable* memtable = new MemTable(cmp, ImmutableCFOptions(options), + MemTableOptions(MutableCFOptions(options), options)); memtable->Ref(); WriteBatch batch; WriteBatchInternal::SetSequence(&batch, 100); @@ -1695,7 +1872,8 @@ TEST(MemTableTest, Simple) { ColumnFamilyMemTablesDefault cf_mems_default(memtable, &options); ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &cf_mems_default).ok()); - Iterator* iter = memtable->NewIterator(ReadOptions()); + Arena arena; + ScopedArenaIterator iter(memtable->NewIterator(ReadOptions(), &arena)); iter->SeekToFirst(); while (iter->Valid()) { fprintf(stderr, "key: '%s' -> '%s'\n", 
@@ -1704,7 +1882,6 @@ TEST(MemTableTest, Simple) { iter->Next(); } - delete iter; delete memtable->Unref(); } diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index 6af48f58ce0..ae4e46239bf 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -172,8 +172,9 @@ void TwoLevelIterator::InitDataBlock() { SetSecondLevelIterator(nullptr); } else { Slice handle = first_level_iter_.value(); - if (second_level_iter_.iter() != nullptr - && handle.compare(data_block_handle_) == 0) { + if (second_level_iter_.iter() != nullptr && + !second_level_iter_.status().IsIncomplete() && + handle.compare(data_block_handle_) == 0) { // second_level_iter is already constructed with this iterator, so // no need to change anything } else { diff --git a/tools/auto_sanity_test.sh b/tools/auto_sanity_test.sh index 2d63c0a85f5..138c855c08c 100755 --- a/tools/auto_sanity_test.sh +++ b/tools/auto_sanity_test.sh @@ -37,6 +37,11 @@ echo "Running db sanity check with commits $commit_new and $commit_old." echo "=============================================================" echo "Making build $commit_new" +git checkout $commit_new +if [ $? -ne 0 ]; then + echo "[ERROR] Can't checkout $commit_new" + exit 1 +fi makestuff mv db_sanity_test new_db_sanity_test echo "Creating db based on the new commit --- $commit_new" @@ -44,6 +49,11 @@ echo "Creating db based on the new commit --- $commit_new" echo "=============================================================" echo "Making build $commit_old" +git checkout $commit_old +if [ $? 
-ne 0 ]; then + echo "[ERROR] Can't checkout $commit_old" + exit 1 +fi makestuff mv db_sanity_test old_db_sanity_test echo "Creating db based on the old commit --- $commit_old" diff --git a/tools/benchmark.sh b/tools/benchmark.sh new file mode 100755 index 00000000000..cde545801ce --- /dev/null +++ b/tools/benchmark.sh @@ -0,0 +1,205 @@ +#!/bin/bash +# REQUIRE: db_bench binary exists in the current directory + +if [ $# -ne 1 ]; then + echo "./benchmark.sh [bulkload/fillseq/overwrite/filluniquerandom/readrandom/readwhilewriting]" + exit 0 +fi + +# size constants +K=1024 +M=$((1024 * K)) +G=$((1024 * M)) + +if [ -z $DB_DIR ]; then + echo "DB_DIR is not defined" + exit 0 +fi + +if [ -z $WAL_DIR ]; then + echo "WAL_DIR is not defined" + exit 0 +fi + +output_dir=${OUTPUT_DIR:-/tmp/} +if [ ! -d $output_dir ]; then + mkdir -p $output_dir +fi + +num_read_threads=${NUM_READ_THREADS:-16} +writes_per_second=${WRITES_PER_SEC:-$((80 * K))} # (only for readwhilewriting) +cache_size=$((16 * G)) +duration=${DURATION:-0} + +num_keys=${NUM_KEYS:-$((1 * G))} +key_size=20 +value_size=800 + +const_params=" + --db=$DB_DIR \ + --wal_dir=$WAL_DIR \ + \ + --num_levels=6 \ + --key_size=$key_size \ + --value_size=$value_size \ + --block_size=4096 \ + --cache_size=$cache_size \ + --cache_numshardbits=6 \ + --compression_type=snappy \ + --compression_ratio=0.5 \ + \ + --hard_rate_limit=2 \ + --rate_limit_delay_max_milliseconds=1000000 \ + --write_buffer_size=$((128 * M)) \ + --max_write_buffer_number=2 \ + --target_file_size_base=$((128 * M)) \ + --max_bytes_for_level_base=$((1 * G)) \ + \ + --sync=0 \ + --disable_data_sync=1 \ + --verify_checksum=1 \ + --delete_obsolete_files_period_micros=$((60 * M)) \ + --max_grandparent_overlap_factor=10 \ + \ + --statistics=1 \ + --stats_per_interval=1 \ + --stats_interval=$((1 * M)) \ + --histogram=1 \ + \ + --memtablerep=skip_list \ + --bloom_bits=10 \ + --open_files=$((20 * K))" + +l0_config=" + --level0_file_num_compaction_trigger=8 \ + 
--level0_slowdown_writes_trigger=16 \ + --level0_stop_writes_trigger=24" + +if [ $duration -gt 0 ]; then + const_params="$const_params --duration=$duration" +fi + +params_r="$const_params $l0_config --max_background_compactions=4 --max_background_flushes=1" +params_w="$const_params $l0_config --max_background_compactions=16 --max_background_flushes=16" +params_bulkload="$const_params --max_background_compactions=16 --max_background_flushes=16 \ + --level0_file_num_compaction_trigger=$((100 * M)) \ + --level0_slowdown_writes_trigger=$((100 * M)) \ + --level0_stop_writes_trigger=$((100 * M))" + +function run_bulkload { + echo "Bulk loading $num_keys random keys into database..." + cmd="./db_bench $params_bulkload --benchmarks=fillrandom \ + --use_existing_db=0 \ + --num=$num_keys \ + --disable_auto_compactions=1 \ + --disable_data_sync=1 \ + --threads=1 2>&1 | tee $output_dir/benchmark_bulkload_fillrandom.log" + echo $cmd | tee $output_dir/benchmark_bulkload_fillrandom.log + eval $cmd + echo "Compacting..." + cmd="./db_bench $params_w --benchmarks=compact \ + --use_existing_db=1 \ + --num=$num_keys \ + --disable_auto_compactions=1 \ + --disable_data_sync=1 \ + --threads=1 2>&1 | tee $output_dir/benchmark_bulkload_compact.log" + echo $cmd | tee $output_dir/benchmark_bulkload_compact.log + eval $cmd +} + +function run_fillseq { + echo "Loading $num_keys keys sequentially into database..." + cmd="./db_bench $params_w --benchmarks=fillseq \ + --use_existing_db=0 \ + --num=$num_keys \ + --threads=1 2>&1 | tee $output_dir/benchmark_fillseq.log" + echo $cmd | tee $output_dir/benchmark_fillseq.log + eval $cmd +} + +function run_overwrite { + echo "Loading $num_keys keys sequentially into database..." 
+ cmd="./db_bench $params_w --benchmarks=overwrite \ + --use_existing_db=1 \ + --num=$num_keys \ + --threads=1 2>&1 | tee $output_dir/benchmark_overwrite.log" + echo $cmd | tee $output_dir/benchmark_overwrite.log + eval $cmd +} + +function run_filluniquerandom { + echo "Loading $num_keys unique keys randomly into database..." + cmd="./db_bench $params_w --benchmarks=filluniquerandom \ + --use_existing_db=0 \ + --num=$num_keys \ + --threads=1 2>&1 | tee $output_dir/benchmark_filluniquerandom.log" + echo $cmd | tee $output_dir/benchmark_filluniquerandom.log + eval $cmd +} + +function run_readrandom { + echo "Reading $num_keys random keys from database..." + cmd="./db_bench $params_r --benchmarks=readrandom \ + --use_existing_db=1 \ + --num=$num_keys \ + --threads=$num_read_threads \ + --disable_auto_compactions=1 \ + 2>&1 | tee $output_dir/benchmark_readrandom.log" + echo $cmd | tee $output_dir/benchmark_readrandom.log + eval $cmd +} + +function run_readwhilewriting { + echo "Reading $num_keys random keys from database whiling writing.." + cmd="./db_bench $params_r --benchmarks=readwhilewriting \ + --use_existing_db=1 \ + --num=$num_keys \ + --threads=$num_read_threads \ + --writes_per_second=$writes_per_second \ + 2>&1 | tee $output_dir/benchmark_readwhilewriting.log" + echo $cmd | tee $output_dir/benchmark_readwhilewriting.log + eval $cmd +} + +function now() { + echo `date +"%s"` +} + +report="$output_dir/report.txt" + +# print start time +echo "===== Benchmark =====" + +# Run!!! 
+IFS=',' read -a jobs <<< $1 +for job in ${jobs[@]}; do + echo "Start $job at `date`" | tee -a $report + start=$(now) + if [ $job = bulkload ]; then + run_bulkload + elif [ $job = fillseq ]; then + run_fillseq + elif [ $job = overwrite ]; then + run_overwrite + elif [ $job = filluniquerandom ]; then + run_filluniquerandom + elif [ $job = readrandom ]; then + run_readrandom + elif [ $job = readwhilewriting ]; then + run_readwhilewriting + else + echo "unknown job $job" + exit + fi + end=$(now) + + echo "Complete $job in $((end-start)) seconds" | tee -a $report + if [[ $job = readrandom || $job = readwhilewriting ]]; then + qps=$(grep "micros\/op" "$output_dir/benchmark_$job.log" | grep "ops\/sec" | awk '{print $5}') + line=$(grep "rocksdb.db.get.micros" "$output_dir/benchmark_$job.log") + p50=$(echo $line | awk '{print $7}') + p99=$(echo $line | awk '{print $13}') + echo "Read latency p50 = $p50 us, p99 = $p99 us" | tee -a $report + echo "QPS = $qps ops/sec" | tee -a $report + fi +done diff --git a/tools/db_sanity_test.cc b/tools/db_sanity_test.cc index 4ae120c21e6..237ef07d0d4 100644 --- a/tools/db_sanity_test.cc +++ b/tools/db_sanity_test.cc @@ -8,14 +8,15 @@ #include #include -#include "include/rocksdb/db.h" -#include "include/rocksdb/options.h" -#include "include/rocksdb/env.h" -#include "include/rocksdb/slice.h" -#include "include/rocksdb/status.h" -#include "include/rocksdb/comparator.h" -#include "include/rocksdb/table.h" -#include "include/rocksdb/slice_transform.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/comparator.h" +#include "rocksdb/table.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/filter_policy.h" namespace rocksdb { @@ -49,7 +50,7 @@ class SanityTest { return s; } } - return Status::OK(); + return db->Flush(FlushOptions()); } Status Verify() { DB* db; @@ -146,13 +147,29 @@ class SanityTestPlainTableFactory : public 
SanityTest { Options options_; }; +class SanityTestBloomFilter : public SanityTest { + public: + explicit SanityTestBloomFilter(const std::string& path) : SanityTest(path) { + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + } + ~SanityTestBloomFilter() {} + virtual Options GetOptions() const { return options_; } + virtual std::string Name() const { return "BloomFilter"; } + + private: + Options options_; +}; + namespace { bool RunSanityTests(const std::string& command, const std::string& path) { std::vector sanity_tests = { new SanityTestBasic(path), new SanityTestSpecialComparator(path), new SanityTestZlibCompression(path), - new SanityTestPlainTableFactory(path)}; + new SanityTestPlainTableFactory(path), + new SanityTestBloomFilter(path)}; if (command == "create") { fprintf(stderr, "Creating...\n"); diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 05dd3cc88d8..b5c79bf3b3c 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -31,6 +31,7 @@ int main() { #include #include #include +#include #include #include "db/db_impl.h" #include "db/version_set.h" @@ -41,7 +42,6 @@ int main() { #include "rocksdb/write_batch.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "rocksdb/statistics.h" #include "port/port.h" #include "util/coding.h" #include "util/crc32c.h" @@ -154,7 +154,7 @@ DEFINE_int32(level0_stop_writes_trigger, rocksdb::Options().level0_stop_writes_trigger, "Number of files in level-0 that will trigger put stop."); -DEFINE_int32(block_size, rocksdb::Options().block_size, +DEFINE_int32(block_size, rocksdb::BlockBasedTableOptions().block_size, "Number of bytes in a block."); DEFINE_int32(max_background_compactions, @@ -209,6 +209,9 @@ static const bool FLAGS_reopen_dummy __attribute__((unused)) = DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. 
" "Negative means use default settings."); +DEFINE_bool(use_block_based_filter, false, "use block based filter" + "instead of full filter for block based table"); + DEFINE_string(db, "", "Use the db with the following name."); DEFINE_bool(verify_checksum, false, @@ -757,10 +760,12 @@ class StressTest { ? NewLRUCache(FLAGS_compressed_cache_size) : nullptr), filter_policy_(FLAGS_bloom_bits >= 0 - ? NewBloomFilterPolicy(FLAGS_bloom_bits) - : nullptr), + ? FLAGS_use_block_based_filter + ? NewBloomFilterPolicy(FLAGS_bloom_bits, true) + : NewBloomFilterPolicy(FLAGS_bloom_bits, false) + : nullptr), db_(nullptr), - new_column_family_name_(0), + new_column_family_name_(1), num_times_reopened_(0) { if (FLAGS_destroy_db_initially) { std::vector files; @@ -780,7 +785,6 @@ class StressTest { } column_families_.clear(); delete db_; - delete filter_policy_; } bool Run() { @@ -1219,12 +1223,20 @@ class StressTest { Status s __attribute__((unused)); s = db_->DropColumnFamily(column_families_[cf]); delete column_families_[cf]; - assert(s.ok()); + if (!s.ok()) { + fprintf(stderr, "dropping column family error: %s\n", + s.ToString().c_str()); + std::terminate(); + } s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), new_name, &column_families_[cf]); column_family_names_[cf] = new_name; thread->shared->ClearColumnFamily(cf); - assert(s.ok()); + if (!s.ok()) { + fprintf(stderr, "creating column family error: %s\n", + s.ToString().c_str()); + std::terminate(); + } thread->shared->UnlockColumnFamily(cf); } } @@ -1299,10 +1311,15 @@ class StressTest { } } thread->shared->Put(rand_column_family, rand_key, value_base); + Status s; if (FLAGS_use_merge) { - db_->Merge(write_opts, column_family, key, v); + s = db_->Merge(write_opts, column_family, key, v); } else { - db_->Put(write_opts, column_family, key, v); + s = db_->Put(write_opts, column_family, key, v); + } + if (!s.ok()) { + fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str()); + std::terminate(); } 
thread->stats.AddBytesForWrites(1, sz); } else { @@ -1313,8 +1330,12 @@ class StressTest { // OPERATION delete if (!FLAGS_test_batches_snapshots) { thread->shared->Delete(rand_column_family, rand_key); - db_->Delete(write_opts, column_family, key); + Status s = db_->Delete(write_opts, column_family, key); thread->stats.AddDeletes(1); + if (!s.ok()) { + fprintf(stderr, "delete error: %s\n", s.ToString().c_str()); + std::terminate(); + } } else { MultiDelete(thread, write_opts, column_family, key); } @@ -1550,8 +1571,13 @@ class StressTest { void Open() { assert(db_ == nullptr); - options_.block_cache = cache_; - options_.block_cache_compressed = compressed_cache_; + BlockBasedTableOptions block_based_options; + block_based_options.block_cache = cache_; + block_based_options.block_cache_compressed = compressed_cache_; + block_based_options.block_size = FLAGS_block_size; + block_based_options.filter_policy = filter_policy_; + options_.table_factory.reset( + NewBlockBasedTableFactory(block_based_options)); options_.write_buffer_size = FLAGS_write_buffer_size; options_.max_write_buffer_number = FLAGS_max_write_buffer_number; options_.min_write_buffer_number_to_merge = @@ -1560,8 +1586,6 @@ class StressTest { options_.max_background_flushes = FLAGS_max_background_flushes; options_.compaction_style = static_cast(FLAGS_compaction_style); - options_.block_size = FLAGS_block_size; - options_.filter_policy = filter_policy_; options_.prefix_extractor.reset(NewFixedPrefixTransform(FLAGS_prefix_size)); options_.max_open_files = FLAGS_open_files; options_.statistics = dbstats; @@ -1718,9 +1742,9 @@ class StressTest { } private: - shared_ptr cache_; - shared_ptr compressed_cache_; - const FilterPolicy* filter_policy_; + std::shared_ptr cache_; + std::shared_ptr compressed_cache_; + std::shared_ptr filter_policy_; DB* db_; Options options_; std::vector column_families_; diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc index b41f36d0106..b1d58e10ec1 100644 --- 
a/tools/reduce_levels_test.cc +++ b/tools/reduce_levels_test.cc @@ -76,6 +76,7 @@ Status ReduceLevelTest::OpenDB(bool create_if_missing, int num_levels, opt.num_levels = num_levels; opt.create_if_missing = create_if_missing; opt.max_mem_compaction_level = mem_table_compact_level; + opt.max_background_flushes = 0; rocksdb::Status st = rocksdb::DB::Open(opt, dbname_, &db_); if (!st.ok()) { fprintf(stderr, "Can't open the db:%s\n", st.ToString().c_str()); diff --git a/tools/run_flash_bench.sh b/tools/run_flash_bench.sh new file mode 100755 index 00000000000..be7d1631f64 --- /dev/null +++ b/tools/run_flash_bench.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# REQUIRE: benchmark.sh exists in the current directory +# After execution of this script, log files are generated in $output_dir. +# report.txt provides a high level statistics + +# Size constants +K=1024 +M=$((1024 * K)) +G=$((1024 * M)) + +n=$((1 * G)) +wps=$((80 * K)) +duration=$((6 * 60 * 60)) +num_read_threads=24 + +# Update these parameters before execution !!! 
+db_dir="/tmp/rocksdb/" +wal_dir="/tmp/rocksdb/" +output_dir="/tmp/output" + +# Test 1: bulk load +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + ./benchmark.sh bulkload + +# Test 2: sequential fill +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + ./benchmark.sh fillseq + +# Test 3: overwrite +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + ./benchmark.sh overwrite + +# Prepare: populate DB with random data +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + ./benchmark.sh filluniquerandom + +# Test 4: random read +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + DURATION=$duration NUM_READ_THREADS=$num_read_threads \ + ./benchmark.sh readrandom + +# Test 5: random read while writing +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + DURATION=$duration NUM_READ_THREADS=$num_read_threads WRITES_PER_SECOND=$wps \ + ./benchmark.sh readwhilewriting diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index 9b130c7c63d..6c496e8ddb4 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -68,6 +68,7 @@ class SstFileReader { // options_ and internal_comparator_ will also be used in // ReadSequential internally (specifically, seek-related operations) Options options_; + const ImmutableCFOptions ioptions_; InternalKeyComparator internal_comparator_; unique_ptr table_properties_; }; @@ -76,7 +77,8 @@ SstFileReader::SstFileReader(const std::string& file_path, bool verify_checksum, bool output_hex) :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum), - output_hex_(output_hex), internal_comparator_(BytewiseComparator()) { + output_hex_(output_hex), ioptions_(options_), + internal_comparator_(BytewiseComparator()) { fprintf(stdout, "Process %s\n", file_path.c_str()); init_result_ = NewTableReader(file_name_); @@ -123,7 +125,7 @@ Status SstFileReader::NewTableReader(const std::string& file_path) { if (s.ok()) { s = 
options_.table_factory->NewTableReader( - options_, soptions_, internal_comparator_, std::move(file_), file_size, + ioptions_, soptions_, internal_comparator_, std::move(file_), file_size, &table_reader_); } return s; diff --git a/util/bloom.cc b/util/bloom.cc index 723adf843c2..19d8edead27 100644 --- a/util/bloom.cc +++ b/util/bloom.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2014, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -10,42 +10,266 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/slice.h" +#include "table/block_based_filter_block.h" +#include "table/full_filter_block.h" #include "util/hash.h" +#include "util/coding.h" namespace rocksdb { +class BlockBasedFilterBlockBuilder; +class FullFilterBlockBuilder; + namespace { +class FullFilterBitsBuilder : public FilterBitsBuilder { + public: + explicit FullFilterBitsBuilder(const size_t bits_per_key, + const size_t num_probes) + : bits_per_key_(bits_per_key), + num_probes_(num_probes) { + assert(bits_per_key_); + } + + ~FullFilterBitsBuilder() {} + + virtual void AddKey(const Slice& key) override { + uint32_t hash = BloomHash(key); + if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { + hash_entries_.push_back(hash); + } + } + + // Create a filter that for hashes [0, n-1], the filter is allocated here + // When creating filter, it is ensured that + // total_bits = num_lines * CACHE_LINE_SIZE * 8 + // dst len is >= 5, 1 for num_probes, 4 for num_lines + // Then total_bits = (len - 5) * 8, and cache_line_size could be calulated + // +----------------------------------------------------------------+ + // | filter data with length total_bits/8 | + // +----------------------------------------------------------------+ + // | | + 
// | ... | + // | | + // +----------------------------------------------------------------+ + // | ... | num_probes : 1 byte | num_lines : 4 bytes | + // +----------------------------------------------------------------+ + virtual Slice Finish(std::unique_ptr* buf) override { + uint32_t total_bits, num_lines; + char* data = ReserveSpace(hash_entries_.size(), &total_bits, &num_lines); + assert(data); + + if (total_bits != 0 && num_lines != 0) { + for (auto h : hash_entries_) { + AddHash(h, data, num_lines, total_bits); + } + } + data[total_bits/8] = static_cast(num_probes_); + EncodeFixed32(data + total_bits/8 + 1, static_cast(num_lines)); + + const char* const_data = data; + buf->reset(const_data); + hash_entries_.clear(); + + return Slice(data, total_bits / 8 + 5); + } -class BloomFilterPolicy : public FilterPolicy { private: size_t bits_per_key_; - size_t k_; - uint32_t (*hash_func_)(const Slice& key); + size_t num_probes_; + std::vector hash_entries_; - void initialize() { - // We intentionally round down to reduce probing cost a little bit - k_ = static_cast(bits_per_key_ * 0.69); // 0.69 =~ ln(2) - if (k_ < 1) k_ = 1; - if (k_ > 30) k_ = 30; + // Get totalbits that optimized for cpu cache line + uint32_t GetTotalBitsForLocality(uint32_t total_bits); + + // Reserve space for new filter + char* ReserveSpace(const int num_entry, uint32_t* total_bits, + uint32_t* num_lines); + + // Assuming single threaded access to this function. + void AddHash(uint32_t h, char* data, uint32_t num_lines, + uint32_t total_bits); + + // No Copy allowed + FullFilterBitsBuilder(const FullFilterBitsBuilder&); + void operator=(const FullFilterBitsBuilder&); +}; + +uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { + uint32_t num_lines = + (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); + + // Make num_lines an odd number to make sure more bits are involved + // when determining which block. 
+ if (num_lines % 2 == 0) { + num_lines++; + } + return num_lines * (CACHE_LINE_SIZE * 8); +} + +char* FullFilterBitsBuilder::ReserveSpace(const int num_entry, + uint32_t* total_bits, uint32_t* num_lines) { + assert(bits_per_key_); + char* data = nullptr; + if (num_entry != 0) { + uint32_t total_bits_tmp = num_entry * bits_per_key_; + + *total_bits = GetTotalBitsForLocality(total_bits_tmp); + *num_lines = *total_bits / (CACHE_LINE_SIZE * 8); + assert(*total_bits > 0 && *total_bits % 8 == 0); + } else { + // filter is empty, just leave space for metadata + *total_bits = 0; + *num_lines = 0; } + // Reserve space for Filter + uint32_t sz = *total_bits / 8; + sz += 5; // 4 bytes for num_lines, 1 byte for num_probes + + data = new char[sz]; + memset(data, 0, sz); + return data; +} + +inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data, + uint32_t num_lines, uint32_t total_bits) { + assert(num_lines > 0 && total_bits > 0); + + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + uint32_t b = (h % num_lines) * (CACHE_LINE_SIZE * 8); + + for (uint32_t i = 0; i < num_probes_; ++i) { + // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized + // to a simple operation by compiler. 
+ const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8)); + data[bitpos / 8] |= (1 << (bitpos % 8)); + + h += delta; + } +} + +class FullFilterBitsReader : public FilterBitsReader { public: - explicit BloomFilterPolicy(int bits_per_key, - uint32_t (*hash_func)(const Slice& key)) - : bits_per_key_(bits_per_key), hash_func_(hash_func) { - initialize(); + explicit FullFilterBitsReader(const Slice& contents) + : data_(const_cast(contents.data())), + data_len_(contents.size()), + num_probes_(0), num_lines_(0) { + assert(data_); + GetFilterMeta(contents, &num_probes_, &num_lines_); + // Sanitize broken parameter + if (num_lines_ != 0 && (data_len_-5) % num_lines_ != 0) { + num_lines_ = 0; + num_probes_ = 0; + } + } + + ~FullFilterBitsReader() {} + + virtual bool MayMatch(const Slice& entry) override { + if (data_len_ <= 5) { // remain same with original filter + return false; + } + // Other Error params, including a broken filter, regarded as match + if (num_probes_ == 0 || num_lines_ == 0) return true; + uint32_t hash = BloomHash(entry); + return HashMayMatch(hash, Slice(data_, data_len_), + num_probes_, num_lines_); } - explicit BloomFilterPolicy(int bits_per_key) - : bits_per_key_(bits_per_key) { - hash_func_ = BloomHash; + + private: + // Filter meta data + char* data_; + uint32_t data_len_; + size_t num_probes_; + uint32_t num_lines_; + + // Get num_probes, and num_lines from filter + // If filter format broken, set both to 0. + void GetFilterMeta(const Slice& filter, size_t* num_probes, + uint32_t* num_lines); + + // "filter" contains the data appended by a preceding call to + // CreateFilterFromHash() on this class. This method must return true if + // the key was in the list of keys passed to CreateFilter(). + // This method may return true or false if the key was not on the + // list, but it should aim to return false with a high probability. 
+ // + // hash: target to be checked + // filter: the whole filter, including meta data bytes + // num_probes: number of probes, read before hand + // num_lines: filter metadata, read before hand + // Before calling this function, need to ensure the input meta data + // is valid. + bool HashMayMatch(const uint32_t& hash, const Slice& filter, + const size_t& num_probes, const uint32_t& num_lines); + + // No Copy allowed + FullFilterBitsReader(const FullFilterBitsReader&); + void operator=(const FullFilterBitsReader&); +}; + +void FullFilterBitsReader::GetFilterMeta(const Slice& filter, + size_t* num_probes, uint32_t* num_lines) { + uint32_t len = filter.size(); + if (len <= 5) { + // filter is empty or broken + *num_probes = 0; + *num_lines = 0; + return; + } + + *num_probes = filter.data()[len - 5]; + *num_lines = DecodeFixed32(filter.data() + len - 4); +} + +bool FullFilterBitsReader::HashMayMatch(const uint32_t& hash, + const Slice& filter, const size_t& num_probes, + const uint32_t& num_lines) { + uint32_t len = filter.size(); + if (len <= 5) return false; // remain the same with original filter + + // It is ensured the params are valid before calling it + assert(num_probes != 0); + assert(num_lines != 0 && (len - 5) % num_lines == 0); + uint32_t cache_line_size = (len - 5) / num_lines; + const char* data = filter.data(); + + uint32_t h = hash; + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + uint32_t b = (h % num_lines) * (cache_line_size * 8); + + for (uint32_t i = 0; i < num_probes; ++i) { + // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized + // to a simple and operation by compiler. 
+ const uint32_t bitpos = b + (h % (cache_line_size * 8)); + if (((data[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { + return false; + } + + h += delta; + } + + return true; +} + +// An implementation of filter policy +class BloomFilterPolicy : public FilterPolicy { + public: + explicit BloomFilterPolicy(int bits_per_key, bool use_block_based_builder) + : bits_per_key_(bits_per_key), hash_func_(BloomHash), + use_block_based_builder_(use_block_based_builder) { initialize(); } - virtual const char* Name() const { + ~BloomFilterPolicy() { + } + + virtual const char* Name() const override { return "rocksdb.BuiltinBloomFilter"; } - virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + virtual void CreateFilter(const Slice* keys, int n, + std::string* dst) const override { // Compute bloom filter size (in both bits and bytes) size_t bits = n * bits_per_key_; @@ -58,14 +282,14 @@ class BloomFilterPolicy : public FilterPolicy { const size_t init_size = dst->size(); dst->resize(init_size + bytes, 0); - dst->push_back(static_cast(k_)); // Remember # of probes in filter + dst->push_back(static_cast(num_probes_)); // Remember # of probes char* array = &(*dst)[init_size]; for (size_t i = 0; i < (size_t)n; i++) { // Use double-hashing to generate a sequence of hash values. // See analysis in [Kirsch,Mitzenmacher 2006]. 
uint32_t h = hash_func_(keys[i]); const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits - for (size_t j = 0; j < k_; j++) { + for (size_t j = 0; j < num_probes_; j++) { const uint32_t bitpos = h % bits; array[bitpos/8] |= (1 << (bitpos % 8)); h += delta; @@ -73,7 +297,8 @@ class BloomFilterPolicy : public FilterPolicy { } } - virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const { + virtual bool KeyMayMatch(const Slice& key, + const Slice& bloom_filter) const override { const size_t len = bloom_filter.size(); if (len < 2) return false; @@ -98,11 +323,43 @@ class BloomFilterPolicy : public FilterPolicy { } return true; } + + virtual FilterBitsBuilder* GetFilterBitsBuilder() const override { + if (use_block_based_builder_) { + return nullptr; + } + + return new FullFilterBitsBuilder(bits_per_key_, num_probes_); + } + + virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents) + const override { + return new FullFilterBitsReader(contents); + } + + // If choose to use block based builder + bool UseBlockBasedBuilder() { return use_block_based_builder_; } + + private: + size_t bits_per_key_; + size_t num_probes_; + uint32_t (*hash_func_)(const Slice& key); + + const bool use_block_based_builder_; + + void initialize() { + // We intentionally round down to reduce probing cost a little bit + num_probes_ = static_cast(bits_per_key_ * 0.69); // 0.69 =~ ln(2) + if (num_probes_ < 1) num_probes_ = 1; + if (num_probes_ > 30) num_probes_ = 30; + } }; -} -const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) { - return new BloomFilterPolicy(bits_per_key); +} // namespace + +const FilterPolicy* NewBloomFilterPolicy(int bits_per_key, + bool use_block_based_builder) { + return new BloomFilterPolicy(bits_per_key, use_block_based_builder); } } // namespace rocksdb diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 881e3b0f595..275592b70ae 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -16,12 +16,13 @@ int main() 
{ #else #include +#include #include "rocksdb/filter_policy.h" - #include "util/logging.h" #include "util/testharness.h" #include "util/testutil.h" +#include "util/arena.h" using GFLAGS::ParseCommandLineFlags; @@ -36,6 +37,19 @@ static Slice Key(int i, char* buffer) { return Slice(buffer, sizeof(i)); } +static int NextLength(int length) { + if (length < 10) { + length += 1; + } else if (length < 100) { + length += 10; + } else if (length < 1000) { + length += 100; + } else { + length += 1000; + } + return length; +} + class BloomTest { private: const FilterPolicy* policy_; @@ -43,7 +57,8 @@ class BloomTest { std::vector keys_; public: - BloomTest() : policy_(NewBloomFilterPolicy(FLAGS_bits_per_key)) { } + BloomTest() : policy_( + NewBloomFilterPolicy(FLAGS_bits_per_key)) {} ~BloomTest() { delete policy_; @@ -117,19 +132,6 @@ TEST(BloomTest, Small) { ASSERT_TRUE(! Matches("foo")); } -static int NextLength(int length) { - if (length < 10) { - length += 1; - } else if (length < 100) { - length += 10; - } else if (length < 1000) { - length += 100; - } else { - length += 1000; - } - return length; -} - TEST(BloomTest, VaryingLengths) { char buffer[sizeof(int)]; @@ -171,6 +173,121 @@ TEST(BloomTest, VaryingLengths) { // Different bits-per-byte +class FullBloomTest { + private: + const FilterPolicy* policy_; + std::unique_ptr bits_builder_; + std::unique_ptr bits_reader_; + std::unique_ptr buf_; + size_t filter_size_; + + public: + FullBloomTest() : + policy_(NewBloomFilterPolicy(FLAGS_bits_per_key, false)), + filter_size_(0) { + Reset(); + } + + ~FullBloomTest() { + delete policy_; + } + + void Reset() { + bits_builder_.reset(policy_->GetFilterBitsBuilder()); + bits_reader_.reset(nullptr); + buf_.reset(nullptr); + filter_size_ = 0; + } + + void Add(const Slice& s) { + bits_builder_->AddKey(s); + } + + void Build() { + Slice filter = bits_builder_->Finish(&buf_); + bits_reader_.reset(policy_->GetFilterBitsReader(filter)); + filter_size_ = filter.size(); + } + + size_t 
FilterSize() const { + return filter_size_; + } + + bool Matches(const Slice& s) { + if (bits_reader_ == nullptr) { + Build(); + } + return bits_reader_->MayMatch(s); + } + + double FalsePositiveRate() { + char buffer[sizeof(int)]; + int result = 0; + for (int i = 0; i < 10000; i++) { + if (Matches(Key(i + 1000000000, buffer))) { + result++; + } + } + return result / 10000.0; + } +}; + +TEST(FullBloomTest, FullEmptyFilter) { + // Empty filter is not match, at this level + ASSERT_TRUE(!Matches("hello")); + ASSERT_TRUE(!Matches("world")); +} + +TEST(FullBloomTest, FullSmall) { + Add("hello"); + Add("world"); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + ASSERT_TRUE(!Matches("x")); + ASSERT_TRUE(!Matches("foo")); +} + +TEST(FullBloomTest, FullVaryingLengths) { + char buffer[sizeof(int)]; + + // Count number of filters that significantly exceed the false positive rate + int mediocre_filters = 0; + int good_filters = 0; + + for (int length = 1; length <= 10000; length = NextLength(length)) { + Reset(); + for (int i = 0; i < length; i++) { + Add(Key(i, buffer)); + } + Build(); + + ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 128 + 5)) << length; + + // All added keys must match + for (int i = 0; i < length; i++) { + ASSERT_TRUE(Matches(Key(i, buffer))) + << "Length " << length << "; key " << i; + } + + // Check false positive rate + double rate = FalsePositiveRate(); + if (kVerbose >= 1) { + fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n", + rate*100.0, length, static_cast(FilterSize())); + } + ASSERT_LE(rate, 0.02); // Must not be over 2% + if (rate > 0.0125) + mediocre_filters++; // Allowed, but not too often + else + good_filters++; + } + if (kVerbose >= 1) { + fprintf(stderr, "Filters: %d good, %d mediocre\n", + good_filters, mediocre_filters); + } + ASSERT_LE(mediocre_filters, good_filters/5); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/util/cache_bench.cc b/util/cache_bench.cc 
// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// cache_bench: a multi-threaded micro-benchmark for rocksdb::Cache.
// Each thread performs a configurable mix of Insert / Lookup / Erase
// operations against a shared sharded LRU cache and the aggregate QPS
// is reported at the end.

#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif

#ifndef GFLAGS
#include <cstdio>
int main() {
  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
  return 1;
}
#else

#include <inttypes.h>
#include <sys/types.h>
#include <cstdio>
#include <gflags/gflags.h>

#include "rocksdb/db.h"
#include "rocksdb/cache.h"
#include "rocksdb/env.h"
#include "port/port.h"
#include "util/mutexlock.h"
#include "util/random.h"

using GFLAGS::ParseCommandLineFlags;

static const uint32_t KB = 1024;

DEFINE_int32(threads, 16, "Number of concurrent threads to run.");
DEFINE_int64(cache_size, 8 * KB * KB,
             "Number of bytes to use as a cache of uncompressed data.");
DEFINE_int32(num_shard_bits, 4, "shard_bits.");

DEFINE_int64(max_key, 1 * KB * KB * KB, "Max number of key to place in cache");
DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread.");

DEFINE_bool(populate_cache, false, "Populate cache before operations");
DEFINE_int32(insert_percent, 40,
             "Ratio of insert to total workload (expressed as a percentage)");
DEFINE_int32(lookup_percent, 50,
             "Ratio of lookup to total workload (expressed as a percentage)");
DEFINE_int32(erase_percent, 10,
             "Ratio of erase to total workload (expressed as a percentage)");

namespace rocksdb {

class CacheBench;
namespace {
// Deleter registered with every cache entry.  Values are allocated with
// new char[10], so they must be released with delete[] (the original code
// used plain delete, which is undefined behavior for array allocations).
void deleter(const Slice& key, void* value) {
  delete[] reinterpret_cast<char*>(value);
}

// State shared by all concurrent executions of the same benchmark.
// Guards the init/start/done handshake between the driver thread
// (CacheBench::Run) and the worker threads (CacheBench::ThreadBody).
class SharedState {
 public:
  explicit SharedState(CacheBench* cache_bench)
      : cv_(&mu_),
        num_threads_(FLAGS_threads),
        num_initialized_(0),
        start_(false),
        num_done_(0),
        cache_bench_(cache_bench) {
  }

  ~SharedState() {}

  port::Mutex* GetMutex() {
    return &mu_;
  }

  port::CondVar* GetCondVar() {
    return &cv_;
  }

  CacheBench* GetCacheBench() const {
    return cache_bench_;
  }

  void IncInitialized() {
    num_initialized_++;
  }

  void IncDone() {
    num_done_++;
  }

  bool AllInitialized() const {
    return num_initialized_ >= num_threads_;
  }

  bool AllDone() const {
    return num_done_ >= num_threads_;
  }

  void SetStart() {
    start_ = true;
  }

  bool Started() const {
    return start_;
  }

 private:
  port::Mutex mu_;
  port::CondVar cv_;

  const uint64_t num_threads_;
  uint64_t num_initialized_;  // protected by mu_
  bool start_;                // protected by mu_
  uint64_t num_done_;         // protected by mu_

  CacheBench* cache_bench_;
};

// Per-thread state for concurrent executions of the same benchmark.
struct ThreadState {
  uint32_t tid;
  Random rnd;  // per-thread RNG, seeded by thread index for reproducibility
  SharedState* shared;

  ThreadState(uint32_t index, SharedState* shared)
      : tid(index),
        rnd(1000 + index),
        shared(shared) {}
};
}  // namespace

class CacheBench {
 public:
  CacheBench()
      : cache_(NewLRUCache(FLAGS_cache_size, FLAGS_num_shard_bits)),
        num_threads_(FLAGS_threads) {}

  ~CacheBench() {}

  // Pre-fills the cache with random 8-byte keys (one charge unit each).
  void PopulateCache() {
    Random rnd(1);
    for (int64_t i = 0; i < FLAGS_cache_size; i++) {
      uint64_t rand_key = rnd.Next() % FLAGS_max_key;
      // Cast uint64* to be char*, data would be copied to cache
      Slice key(reinterpret_cast<char*>(&rand_key), 8);
      // do insert
      auto handle = cache_->Insert(key, new char[10], 1, &deleter);
      cache_->Release(handle);
    }
  }

  // Spawns the worker threads, times the run, prints QPS.
  // Returns true on completion.
  bool Run() {
    rocksdb::Env* env = rocksdb::Env::Default();

    PrintEnv();
    SharedState shared(this);
    std::vector<ThreadState*> threads(num_threads_);
    for (uint32_t i = 0; i < num_threads_; i++) {
      threads[i] = new ThreadState(i, &shared);
      env->StartThread(ThreadBody, threads[i]);
    }
    {
      MutexLock l(shared.GetMutex());
      while (!shared.AllInitialized()) {
        shared.GetCondVar()->Wait();
      }
      // Record start time
      uint64_t start_time = env->NowMicros();

      // Start all threads
      shared.SetStart();
      shared.GetCondVar()->SignalAll();

      // Wait threads to complete
      while (!shared.AllDone()) {
        shared.GetCondVar()->Wait();
      }

      // Record end time
      uint64_t end_time = env->NowMicros();
      double elapsed = static_cast<double>(end_time - start_time) * 1e-6;
      uint32_t qps = static_cast<uint32_t>(
          static_cast<double>(FLAGS_threads * FLAGS_ops_per_thread) / elapsed);
      fprintf(stdout, "Complete in %.3f s; QPS = %u\n", elapsed, qps);
    }
    // All workers have signalled done; reclaim their state (the original
    // code leaked every ThreadState).
    for (ThreadState* thread : threads) {
      delete thread;
    }
    return true;
  }

 private:
  std::shared_ptr<Cache> cache_;
  uint32_t num_threads_;

  // Thread entry point: handshake with the driver, run the op mix,
  // then report completion.
  static void ThreadBody(void* v) {
    ThreadState* thread = reinterpret_cast<ThreadState*>(v);
    SharedState* shared = thread->shared;

    {
      MutexLock l(shared->GetMutex());
      shared->IncInitialized();
      if (shared->AllInitialized()) {
        shared->GetCondVar()->SignalAll();
      }
      while (!shared->Started()) {
        shared->GetCondVar()->Wait();
      }
    }
    thread->shared->GetCacheBench()->OperateCache(thread);

    {
      MutexLock l(shared->GetMutex());
      shared->IncDone();
      if (shared->AllDone()) {
        shared->GetCondVar()->SignalAll();
      }
    }
  }

  // Executes FLAGS_ops_per_thread operations, choosing insert/lookup/erase
  // according to the configured percentages.
  //
  // BUG FIX: the original conditions read
  //     prob_op -= FLAGS_insert_percent && prob_op < FLAGS_lookup_percent
  // but && binds tighter than -=, so prob_op was decremented by the boolean
  // result (0 or 1) and the branch taken on the truthiness of the
  // subtraction.  The subtraction must be parenthesized so each branch
  // tests prob_op against its own percentage band.
  void OperateCache(ThreadState* thread) {
    for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
      uint64_t rand_key = thread->rnd.Next() % FLAGS_max_key;
      // Cast uint64* to be char*, data would be copied to cache
      Slice key(reinterpret_cast<char*>(&rand_key), 8);
      int32_t prob_op = thread->rnd.Uniform(100);
      if (prob_op >= 0 && prob_op < FLAGS_insert_percent) {
        // do insert
        auto handle = cache_->Insert(key, new char[10], 1, &deleter);
        cache_->Release(handle);
      } else if ((prob_op -= FLAGS_insert_percent) >= 0 &&
                 prob_op < FLAGS_lookup_percent) {
        // do lookup
        auto handle = cache_->Lookup(key);
        if (handle) {
          cache_->Release(handle);
        }
      } else if ((prob_op -= FLAGS_lookup_percent) >= 0 &&
                 prob_op < FLAGS_erase_percent) {
        // do erase
        cache_->Erase(key);
      }
    }
  }

  void PrintEnv() const {
    printf("RocksDB version     : %d.%d\n", kMajorVersion, kMinorVersion);
    printf("Number of threads   : %d\n", FLAGS_threads);
    printf("Ops per thread      : %" PRIu64 "\n", FLAGS_ops_per_thread);
    // cache_size and max_key are DEFINE_int64 (signed) — print with PRId64,
    // not PRIu64 as the original did.
    printf("Cache size          : %" PRId64 "\n", FLAGS_cache_size);
    printf("Num shard bits      : %d\n", FLAGS_num_shard_bits);
    printf("Max key             : %" PRId64 "\n", FLAGS_max_key);
    printf("Populate cache      : %d\n", FLAGS_populate_cache);
    printf("Insert percentage   : %d%%\n", FLAGS_insert_percent);
    printf("Lookup percentage   : %d%%\n", FLAGS_lookup_percent);
    printf("Erase percentage    : %d%%\n", FLAGS_erase_percent);
    printf("----------------------------\n");
  }
};
}  // namespace rocksdb

int main(int argc, char** argv) {
  ParseCommandLineFlags(&argc, &argv, true);

  if (FLAGS_threads <= 0) {
    fprintf(stderr, "threads number <= 0\n");
    exit(1);
  }

  rocksdb::CacheBench bench;
  if (FLAGS_populate_cache) {
    bench.PopulateCache();
  }
  if (bench.Run()) {
    return 0;
  } else {
    return 1;
  }
}

#endif  // GFLAGS
- uint64_t origValue = (bits < 64)?(value & (((uint64_t)1 << bits) - 1)):value; - uint32_t origBits = bits; -#endif - - while (bits > 0) { - size_t bitsToGet = std::min(bits, 8 - bitOffset); - unsigned char mask = ((1 << bitsToGet) - 1); - - ptr[byteOffset] = (ptr[byteOffset] & ~(mask << bitOffset)) + - ((value & mask) << bitOffset); - - value >>= bitsToGet; - byteOffset += 1; - bitOffset = 0; - bits -= bitsToGet; - } - - assert(origValue == BitStreamGetInt(dst, dstlen, offset, origBits)); -} - -uint64_t BitStreamGetInt(const char* src, size_t srclen, size_t offset, - uint32_t bits) { - assert((offset + bits + 7)/8 <= srclen); - assert(bits <= 64); - - const unsigned char* ptr = reinterpret_cast(src); - - uint64_t result = 0; - - size_t byteOffset = offset / 8; - size_t bitOffset = offset % 8; - size_t shift = 0; - - while (bits > 0) { - size_t bitsToGet = std::min(bits, 8 - bitOffset); - unsigned char mask = ((1 << bitsToGet) - 1); - - result += (uint64_t)((ptr[byteOffset] >> bitOffset) & mask) << shift; - - shift += bitsToGet; - byteOffset += 1; - bitOffset = 0; - bits -= bitsToGet; - } - - return result; -} - -void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits, - uint64_t value) { - assert((offset + bits + 7)/8 <= dst->size()); - - const size_t kTmpBufLen = sizeof(value) + 1; - char tmpBuf[kTmpBufLen]; - - // Number of bytes of tmpBuf being used - const size_t kUsedBytes = (offset%8 + bits)/8; - - // Copy relevant parts of dst to tmpBuf - for (size_t idx = 0; idx <= kUsedBytes; ++idx) { - tmpBuf[idx] = (*dst)[offset/8 + idx]; - } - - BitStreamPutInt(tmpBuf, kTmpBufLen, offset%8, bits, value); - - // Copy tmpBuf back to dst - for (size_t idx = 0; idx <= kUsedBytes; ++idx) { - (*dst)[offset/8 + idx] = tmpBuf[idx]; - } - - // Do the check here too as we are working with a buffer. 
- assert(((bits < 64)?(value & (((uint64_t)1 << bits) - 1)):value) == - BitStreamGetInt(dst, offset, bits)); -} - } // namespace rocksdb diff --git a/util/coding.h b/util/coding.h index 6ad2077d44c..fa665266826 100644 --- a/util/coding.h +++ b/util/coding.h @@ -115,32 +115,6 @@ inline const char* GetVarint32Ptr(const char* p, return GetVarint32PtrFallback(p, limit, value); } -// Writes an unsigned integer with bits number of bits with its least -// significant bit at offset. -// Bits are numbered from 0 to 7 in the first byte, 8 to 15 in the second and -// so on. -// value is truncated to the bits number of least significant bits. -// REQUIRES: (offset+bits+7)/8 <= dstlen -// REQUIRES: bits <= 64 -extern void BitStreamPutInt(char* dst, size_t dstlen, size_t offset, - uint32_t bits, uint64_t value); - -// Reads an unsigned integer with bits number of bits with its least -// significant bit at offset. -// Bits are numbered in the same way as ByteStreamPutInt(). -// REQUIRES: (offset+bits+7)/8 <= srclen -// REQUIRES: bits <= 64 -extern uint64_t BitStreamGetInt(const char* src, size_t srclen, size_t offset, - uint32_t bits); - -// Convenience functions -extern void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits, - uint64_t value); -extern uint64_t BitStreamGetInt(const std::string* src, size_t offset, - uint32_t bits); -extern uint64_t BitStreamGetInt(const Slice* src, size_t offset, - uint32_t bits); - // -- Implementation of the functions declared above inline void EncodeFixed32(char* buf, uint32_t value) { #if __BYTE_ORDER == __LITTLE_ENDIAN @@ -291,14 +265,4 @@ inline Slice GetSliceUntil(Slice* slice, char delimiter) { return ret; } -inline uint64_t BitStreamGetInt(const std::string* src, size_t offset, - uint32_t bits) { - return BitStreamGetInt(src->data(), src->size(), offset, bits); -} - -inline uint64_t BitStreamGetInt(const Slice* src, size_t offset, - uint32_t bits) { - return BitStreamGetInt(src->data(), src->size(), offset, bits); -} - } // 
namespace rocksdb diff --git a/util/coding_test.cc b/util/coding_test.cc index ed542d6bf87..3dbe7befe9f 100644 --- a/util/coding_test.cc +++ b/util/coding_test.cc @@ -196,99 +196,6 @@ TEST(Coding, Strings) { ASSERT_EQ("", input.ToString()); } -TEST(Coding, BitStream) { - const int kNumBytes = 10; - char bytes[kNumBytes+1]; - for (int i = 0; i < kNumBytes + 1; ++i) { - bytes[i] = '\0'; - } - - // Simple byte aligned test. - for (int i = 0; i < kNumBytes; ++i) { - BitStreamPutInt(bytes, kNumBytes, i*8, 8, 255-i); - - ASSERT_EQ((unsigned char)bytes[i], (unsigned char)(255-i)); - } - for (int i = 0; i < kNumBytes; ++i) { - ASSERT_EQ(BitStreamGetInt(bytes, kNumBytes, i*8, 8), (uint32_t)(255-i)); - } - ASSERT_EQ(bytes[kNumBytes], '\0'); - - // Write and read back at strange offsets - for (int i = 0; i < kNumBytes + 1; ++i) { - bytes[i] = '\0'; - } - for (int i = 0; i < kNumBytes; ++i) { - BitStreamPutInt(bytes, kNumBytes, i*5+1, 4, (i * 7) % (1 << 4)); - } - for (int i = 0; i < kNumBytes; ++i) { - ASSERT_EQ(BitStreamGetInt(bytes, kNumBytes, i*5+1, 4), - (uint32_t)((i * 7) % (1 << 4))); - } - ASSERT_EQ(bytes[kNumBytes], '\0'); - - // Create 11011011 as a bit pattern - for (int i = 0; i < kNumBytes + 1; ++i) { - bytes[i] = '\0'; - } - for (int i = 0; i < kNumBytes; ++i) { - BitStreamPutInt(bytes, kNumBytes, i*8, 2, 3); - BitStreamPutInt(bytes, kNumBytes, i*8+3, 2, 3); - BitStreamPutInt(bytes, kNumBytes, i*8+6, 2, 3); - - ASSERT_EQ((unsigned char)bytes[i], - (unsigned char)(3 + (3 << 3) + (3 << 6))); - } - ASSERT_EQ(bytes[kNumBytes], '\0'); - - - // Test large values - for (int i = 0; i < kNumBytes + 1; ++i) { - bytes[i] = '\0'; - } - BitStreamPutInt(bytes, kNumBytes, 0, 64, (uint64_t)(-1)); - for (int i = 0; i < 64/8; ++i) { - ASSERT_EQ((unsigned char)bytes[i], - (unsigned char)(255)); - } - ASSERT_EQ(bytes[64/8], '\0'); - - -} - -TEST(Coding, BitStreamConvenienceFuncs) { - std::string bytes(1, '\0'); - - // Check that independent changes to byte are preserved. 
- BitStreamPutInt(&bytes, 0, 2, 3); - BitStreamPutInt(&bytes, 3, 2, 3); - BitStreamPutInt(&bytes, 6, 2, 3); - ASSERT_EQ((unsigned char)bytes[0], (unsigned char)(3 + (3 << 3) + (3 << 6))); - ASSERT_EQ(BitStreamGetInt(&bytes, 0, 2), 3u); - ASSERT_EQ(BitStreamGetInt(&bytes, 3, 2), 3u); - ASSERT_EQ(BitStreamGetInt(&bytes, 6, 2), 3u); - Slice slice(bytes); - ASSERT_EQ(BitStreamGetInt(&slice, 0, 2), 3u); - ASSERT_EQ(BitStreamGetInt(&slice, 3, 2), 3u); - ASSERT_EQ(BitStreamGetInt(&slice, 6, 2), 3u); - - // Test overlapping crossing over byte boundaries - bytes = std::string(2, '\0'); - BitStreamPutInt(&bytes, 6, 4, 15); - ASSERT_EQ((unsigned char)bytes[0], 3 << 6); - ASSERT_EQ((unsigned char)bytes[1], 3); - ASSERT_EQ(BitStreamGetInt(&bytes, 6, 4), 15u); - slice = Slice(bytes); - ASSERT_EQ(BitStreamGetInt(&slice, 6, 4), 15u); - - // Test 64-bit number - bytes = std::string(64/8, '\0'); - BitStreamPutInt(&bytes, 0, 64, (uint64_t)(-1)); - ASSERT_EQ(BitStreamGetInt(&bytes, 0, 64), (uint64_t)(-1)); - slice = Slice(bytes); - ASSERT_EQ(BitStreamGetInt(&slice, 0, 64), (uint64_t)(-1)); -} - } // namespace rocksdb int main(int argc, char** argv) { diff --git a/util/db_info_dummper.cc b/util/db_info_dummper.cc index d5dd97ad2c1..2e0d3448166 100644 --- a/util/db_info_dummper.cc +++ b/util/db_info_dummper.cc @@ -6,7 +6,10 @@ // Must not be included from any .h files to avoid polluting the namespace // with macros. 
+#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index 3e55488f223..6d228e81df7 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -11,7 +11,10 @@ int main() { } #else +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include diff --git a/util/env_posix.cc b/util/env_posix.cc index f9bfc697dfd..cf917e8747d 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -231,7 +231,7 @@ class PosixRandomAccessFile: public RandomAccessFile { PosixRandomAccessFile(const std::string& fname, int fd, const EnvOptions& options) : filename_(fname), fd_(fd), use_os_buffer_(options.use_os_buffer) { - assert(!options.use_mmap_reads); + assert(!options.use_mmap_reads || sizeof(void*) < 8); } virtual ~PosixRandomAccessFile() { close(fd_); } @@ -239,11 +239,23 @@ class PosixRandomAccessFile: public RandomAccessFile { char* scratch) const { Status s; ssize_t r = -1; - do { - r = pread(fd_, scratch, n, static_cast(offset)); - } while (r < 0 && errno == EINTR); - IOSTATS_ADD_IF_POSITIVE(bytes_read, r); - *result = Slice(scratch, (r < 0) ? 0 : r); + size_t left = n; + char* ptr = scratch; + while (left > 0) { + r = pread(fd_, ptr, left, static_cast(offset)); + if (r <= 0) { + if (errno == EINTR) { + continue; + } + break; + } + ptr += r; + offset += r; + left -= r; + } + + IOSTATS_ADD_IF_POSITIVE(bytes_read, n - left); + *result = Slice(scratch, (r < 0) ? 0 : n - left); if (r < 0) { // An error: return a non-ok status s = IOError(filename_, errno); @@ -907,9 +919,23 @@ class PosixRandomRWFile : public RandomRWFile { virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { Status s; - ssize_t r = pread(fd_, scratch, n, static_cast(offset)); - IOSTATS_ADD_IF_POSITIVE(bytes_read, r); - *result = Slice(scratch, (r < 0) ? 
0 : r); + ssize_t r = -1; + size_t left = n; + char* ptr = scratch; + while (left > 0) { + r = pread(fd_, ptr, left, static_cast(offset)); + if (r <= 0) { + if (errno == EINTR) { + continue; + } + break; + } + ptr += r; + offset += r; + left -= r; + } + IOSTATS_ADD_IF_POSITIVE(bytes_read, n - left); + *result = Slice(scratch, (r < 0) ? 0 : n - left); if (r < 0) { s = IOError(filename_, errno); } @@ -1018,15 +1044,12 @@ class PosixFileLock : public FileLock { std::string filename; }; - -namespace { void PthreadCall(const char* label, int result) { if (result != 0) { fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); exit(1); } } -} class PosixEnv : public Env { public: @@ -1724,12 +1747,11 @@ unsigned int PosixEnv::GetThreadPoolQueueLen(Priority pri) const { return thread_pools_[pri].GetQueueLen(); } -namespace { struct StartThreadState { void (*user_function)(void*); void* arg; }; -} + static void* StartThreadWrapper(void* arg) { StartThreadState* state = reinterpret_cast(arg); state->user_function(state->arg); diff --git a/util/env_test.cc b/util/env_test.cc index c0d00ce94dc..3e811a98dd2 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -17,6 +17,11 @@ #include #endif +#ifdef ROCKSDB_FALLOCATE_PRESENT +#include +#include +#endif + #include "rocksdb/env.h" #include "port/port.h" #include "util/coding.h" @@ -392,6 +397,9 @@ TEST(EnvPosixTest, DecreaseNumBgThreads) { } #ifdef OS_LINUX +// Travis doesn't support fallocate or getting unique ID from files for whatever +// reason. +#ifndef TRAVIS // To make sure the Env::GetUniqueId() related tests work correctly, The files // should be stored in regular storage like "hard disk" or "flash device". // Otherwise we cannot get the correct id. 
@@ -475,6 +483,31 @@ TEST(EnvPosixTest, RandomAccessUniqueID) { #ifdef ROCKSDB_FALLOCATE_PRESENT TEST(EnvPosixTest, AllocateTest) { std::string fname = GetOnDiskTestDir() + "/preallocate_testfile"; + + // Try fallocate in a file to see whether the target file system supports it. + // Skip the test if fallocate is not supported. + std::string fname_test_fallocate = + GetOnDiskTestDir() + "/preallocate_testfile_2"; + int fd = -1; + do { + fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + } while (fd < 0 && errno == EINTR); + ASSERT_GT(fd, 0); + + int alloc_status = fallocate(fd, 0, 0, 1); + + int err_number = 0; + if (alloc_status != 0) { + err_number = errno; + fprintf(stderr, "Warning: fallocate() fails, %s\n", strerror(err_number)); + } + close(fd); + ASSERT_OK(env_->DeleteFile(fname_test_fallocate)); + if (alloc_status != 0 && err_number == EOPNOTSUPP) { + // The filesystem containing the file does not support fallocate + return; + } + EnvOptions soptions; soptions.use_mmap_writes = false; unique_ptr wfile; @@ -507,7 +540,7 @@ TEST(EnvPosixTest, AllocateTest) { // verify that preallocated blocks were deallocated on file close ASSERT_GT(st_blocks, f_stat.st_blocks); } -#endif +#endif // ROCKSDB_FALLOCATE_PRESENT // Returns true if any of the strings in ss are the prefix of another string. 
bool HasPrefix(const std::unordered_set& ss) { @@ -638,7 +671,8 @@ TEST(EnvPosixTest, InvalidateCache) { // Delete the file ASSERT_OK(env_->DeleteFile(fname)); } -#endif +#endif // not TRAVIS +#endif // OS_LINUX TEST(EnvPosixTest, PosixRandomRWFileTest) { EnvOptions soptions; diff --git a/util/hash.cc b/util/hash.cc index e38c186c3bd..37eaa4057fd 100644 --- a/util/hash.cc +++ b/util/hash.cc @@ -31,14 +31,26 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) { // Pick up remaining bytes switch (limit - data) { + // Note: It would be better if this was cast to unsigned char, but that + // would be a disk format change since we previously didn't have any cast + // at all (so gcc used signed char). + // To understand the difference between shifting unsigned and signed chars, + // let's use 250 as an example. unsigned char will be 250, while signed char + // will be -6. Bit-wise, they are equivalent: 11111010. However, when + // converting negative number (signed char) to int, it will be converted + // into negative int (of equivalent value, which is -6), while converting + // positive number (unsigned char) will be converted to 250. 
Bitwise, + // this looks like this: + // signed char 11111010 -> int 11111111111111111111111111111010 + // unsigned char 11111010 -> int 00000000000000000000000011111010 case 3: - h += data[2] << 16; - // fall through + h += static_cast(static_cast(data[2]) << 16); + // fall through case 2: - h += data[1] << 8; - // fall through + h += static_cast(static_cast(data[1]) << 8); + // fall through case 1: - h += data[0]; + h += static_cast(static_cast(data[0])); h *= m; h ^= (h >> r); break; diff --git a/util/hash_cuckoo_rep.cc b/util/hash_cuckoo_rep.cc index a9a79a27428..2ee05faac51 100644 --- a/util/hash_cuckoo_rep.cc +++ b/util/hash_cuckoo_rep.cc @@ -70,7 +70,7 @@ class HashCuckooRep : public MemTableRep { } cuckoo_path_ = reinterpret_cast( - arena_->Allocate(sizeof(int*) * (cuckoo_path_max_depth_ + 1))); + arena_->Allocate(sizeof(int) * (cuckoo_path_max_depth_ + 1))); is_nearly_full_ = false; } diff --git a/util/histogram.cc b/util/histogram.cc index 968769cef59..0dbfba7d629 100644 --- a/util/histogram.cc +++ b/util/histogram.cc @@ -53,7 +53,7 @@ HistogramBucketMapper::HistogramBucketMapper() } } -const size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { +size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { if (value >= maxBucketValue_) { return bucketValues_.size() - 1; } else if ( value >= minBucketValue_ ) { diff --git a/util/histogram.h b/util/histogram.h index d95588dc2d5..af3a019d803 100644 --- a/util/histogram.h +++ b/util/histogram.h @@ -23,10 +23,10 @@ class HistogramBucketMapper { HistogramBucketMapper(); // converts a value to the bucket index. - const size_t IndexForValue(const uint64_t value) const; + size_t IndexForValue(const uint64_t value) const; // number of buckets required. 
- const size_t BucketCount() const { + size_t BucketCount() const { return bucketValues_.size(); } @@ -65,6 +65,8 @@ class HistogramImpl { virtual double StandardDeviation() const; virtual void Data(HistogramData * const data) const; + virtual ~HistogramImpl() {} + private: // To be able to use HistogramImpl as thread local variable, its constructor // has to be static. That's why we're using manually values from BucketMapper diff --git a/util/iostats_context.cc b/util/iostats_context.cc index 61083177923..090813abc4b 100644 --- a/util/iostats_context.cc +++ b/util/iostats_context.cc @@ -9,7 +9,9 @@ namespace rocksdb { +#ifndef IOS_CROSS_COMPILE __thread IOStatsContext iostats_context; +#endif // IOS_CROSS_COMPILE void IOStatsContext::Reset() { thread_pool_id = Env::Priority::TOTAL; diff --git a/util/iostats_context_imp.h b/util/iostats_context_imp.h index ed34037d339..b271ddf1430 100644 --- a/util/iostats_context_imp.h +++ b/util/iostats_context_imp.h @@ -6,6 +6,8 @@ #pragma once #include "rocksdb/iostats_context.h" +#ifndef IOS_CROSS_COMPILE + // increment a specific counter by the specified value #define IOSTATS_ADD(metric, value) \ (iostats_context.metric += value) @@ -30,3 +32,15 @@ #define IOSTATS(metric) \ (iostats_context.metric) + +#else // IOS_CROSS_COMPILE + +#define IOSTATS_ADD(metric, value) +#define IOSTATS_ADD_IF_POSITIVE(metric, value) +#define IOSTATS_RESET(metric) +#define IOSTATS_RESET_ALL() +#define IOSTATS_SET_THREAD_POOL_ID(value) +#define IOSTATS_THREAD_POOL_ID() +#define IOSTATS(metric) 0 + +#endif // IOS_CROSS_COMPILE diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index b6e108ccc32..9f00757b8da 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -14,6 +14,7 @@ #include "rocksdb/write_batch.h" #include "rocksdb/cache.h" #include "util/coding.h" +#include "util/scoped_arena_iterator.h" #include "utilities/ttl/db_ttl_impl.h" #include @@ -219,10 +220,11 @@ Options LDBCommand::PrepareOptionsForOpenDB() { map::const_iterator itr; + 
BlockBasedTableOptions table_options; int bits; if (ParseIntOption(option_map_, ARG_BLOOM_BITS, bits, exec_state_)) { if (bits > 0) { - opt.filter_policy = NewBloomFilterPolicy(bits); + table_options.filter_policy.reset(NewBloomFilterPolicy(bits)); } else { exec_state_ = LDBCommandExecuteResult::FAILED(ARG_BLOOM_BITS + " must be > 0."); @@ -232,7 +234,8 @@ Options LDBCommand::PrepareOptionsForOpenDB() { int block_size; if (ParseIntOption(option_map_, ARG_BLOCK_SIZE, block_size, exec_state_)) { if (block_size > 0) { - opt.block_size = block_size; + table_options.block_size = block_size; + opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); } else { exec_state_ = LDBCommandExecuteResult::FAILED(ARG_BLOCK_SIZE + " must be > 0."); @@ -538,6 +541,7 @@ void ManifestDumpCommand::DoCommand() { } else { exec_state_ = LDBCommandExecuteResult::FAILED( "Multiple MANIFEST files found; use --path to select one"); + closedir(d); return; } } @@ -560,8 +564,9 @@ void ManifestDumpCommand::DoCommand() { // if VersionSet::DumpManifest() depends on any option done by // SanitizeOptions(), we need to initialize it manually. 
options.db_paths.emplace_back("dummy", 0); - VersionSet* versions = new VersionSet(dbname, &options, sopt, tc.get()); - Status s = versions->DumpManifest(options, file, verbose_, is_key_hex_); + WriteController wc; + VersionSet versions(dbname, &options, sopt, tc.get(), &wc); + Status s = versions.DumpManifest(options, file, verbose_, is_key_hex_); if (!s.ok()) { printf("Error in processing file %s %s\n", manifestfile.c_str(), s.ToString().c_str()); @@ -737,7 +742,8 @@ void InternalDumpCommand::DoCommand() { uint64_t c=0; uint64_t s1=0,s2=0; // Setup internal key iterator - auto iter = unique_ptr(idb->TEST_NewInternalIterator()); + Arena arena; + ScopedArenaIterator iter(idb->TEST_NewInternalIterator(&arena)); Status st = iter->status(); if (!st.ok()) { exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error:" @@ -1084,7 +1090,8 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, NewLRUCache(opt.max_open_files - 10, opt.table_cache_numshardbits, opt.table_cache_remove_scan_count_limit)); const InternalKeyComparator cmp(opt.comparator); - VersionSet versions(db_path_, &opt, soptions, tc.get()); + WriteController wc; + VersionSet versions(db_path_, &opt, soptions, tc.get(), &wc); std::vector dummy; ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName, ColumnFamilyOptions(opt)); diff --git a/util/logging.cc b/util/logging.cc index 1b5549d7319..98d96b82bb6 100644 --- a/util/logging.cc +++ b/util/logging.cc @@ -9,7 +9,10 @@ #include "util/logging.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include @@ -42,7 +45,7 @@ int AppendHumanBytes(uint64_t bytes, char* output, int len) { void AppendNumberTo(std::string* str, uint64_t num) { char buf[30]; - snprintf(buf, sizeof(buf), "%llu", (unsigned long long) num); + snprintf(buf, sizeof(buf), "%" PRIu64, num); str->append(buf); } diff --git a/util/logging.h b/util/logging.h index ce02697268d..7ca8ae0a30d 100644 --- a/util/logging.h +++ b/util/logging.h 
@@ -19,7 +19,6 @@ namespace rocksdb { class Slice; -class WritableFile; // Append a human-readable size in bytes int AppendHumanBytes(uint64_t bytes, char* output, int len); diff --git a/util/mutable_cf_options.h b/util/mutable_cf_options.h new file mode 100644 index 00000000000..39ebe2d8522 --- /dev/null +++ b/util/mutable_cf_options.h @@ -0,0 +1,41 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include "rocksdb/options.h" + +namespace rocksdb { + +struct MutableCFOptions { + explicit MutableCFOptions(const Options& options) + : write_buffer_size(options.write_buffer_size), + arena_block_size(options.arena_block_size), + memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits), + memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes), + memtable_prefix_bloom_huge_page_tlb_size( + options.memtable_prefix_bloom_huge_page_tlb_size), + max_successive_merges(options.max_successive_merges), + filter_deletes(options.filter_deletes) { + } + MutableCFOptions() + : write_buffer_size(0), + arena_block_size(0), + memtable_prefix_bloom_bits(0), + memtable_prefix_bloom_probes(0), + memtable_prefix_bloom_huge_page_tlb_size(0), + max_successive_merges(0), + filter_deletes(false) {} + + size_t write_buffer_size; + size_t arena_block_size; + uint32_t memtable_prefix_bloom_bits; + uint32_t memtable_prefix_bloom_probes; + size_t memtable_prefix_bloom_huge_page_tlb_size; + size_t max_successive_merges; + bool filter_deletes; +}; + +} // namespace rocksdb diff --git a/util/options.cc b/util/options.cc index b76c6603756..28120659bbd 100644 --- a/util/options.cc +++ b/util/options.cc @@ -8,8 +8,12 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include @@ -17,7 +21,6 @@ #include "rocksdb/compaction_filter.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" -#include "rocksdb/filter_policy.h" #include "rocksdb/memtablerep.h" #include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" @@ -29,6 +32,36 @@ namespace rocksdb { +ImmutableCFOptions::ImmutableCFOptions(const Options& options) + : compaction_style(options.compaction_style), + compaction_options_universal(options.compaction_options_universal), + prefix_extractor(options.prefix_extractor.get()), + comparator(options.comparator), + merge_operator(options.merge_operator.get()), + compaction_filter(options.compaction_filter), + compaction_filter_factory(options.compaction_filter_factory.get()), + compaction_filter_factory_v2(options.compaction_filter_factory_v2.get()), + info_log(options.info_log.get()), + statistics(options.statistics.get()), + env(options.env), + allow_mmap_reads(options.allow_mmap_reads), + allow_mmap_writes(options.allow_mmap_writes), + db_paths(options.db_paths), + memtable_factory(options.memtable_factory.get()), + table_factory(options.table_factory.get()), + table_properties_collector_factories( + options.table_properties_collector_factories), + advise_random_on_open(options.advise_random_on_open), + bloom_locality(options.bloom_locality), + purge_redundant_kvs_while_flush(options.purge_redundant_kvs_while_flush), + min_partial_merge_operands(options.min_partial_merge_operands), + disable_data_sync(options.disableDataSync), + use_fsync(options.use_fsync), + compression(options.compression), + compression_per_level(options.compression_per_level), + compression_opts(options.compression_opts), + access_hint_on_compaction_start(options.access_hint_on_compaction_start) {} + ColumnFamilyOptions::ColumnFamilyOptions() : comparator(BytewiseComparator()), merge_operator(nullptr), 
@@ -39,14 +72,8 @@ ColumnFamilyOptions::ColumnFamilyOptions() write_buffer_size(4 << 20), max_write_buffer_number(2), min_write_buffer_number_to_merge(1), - block_cache(nullptr), - block_cache_compressed(nullptr), - block_size(4096), - block_restart_interval(16), compression(kSnappyCompression), - filter_policy(nullptr), prefix_extractor(nullptr), - whole_key_filtering(true), num_levels(7), level0_file_num_compaction_trigger(4), level0_slowdown_writes_trigger(20), @@ -60,15 +87,12 @@ ColumnFamilyOptions::ColumnFamilyOptions() expanded_compaction_factor(25), source_compaction_factor(1), max_grandparent_overlap_factor(10), - disable_seek_compaction(true), soft_rate_limit(0.0), hard_rate_limit(0.0), rate_limit_delay_max_milliseconds(1000), - no_block_cache(false), arena_block_size(0), disable_auto_compactions(false), purge_redundant_kvs_while_flush(true), - block_size_deviation(10), compaction_style(kCompactionStyleLevel), verify_checksums_in_compaction(true), filter_deletes(false), @@ -98,16 +122,10 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options) max_write_buffer_number(options.max_write_buffer_number), min_write_buffer_number_to_merge( options.min_write_buffer_number_to_merge), - block_cache(options.block_cache), - block_cache_compressed(options.block_cache_compressed), - block_size(options.block_size), - block_restart_interval(options.block_restart_interval), compression(options.compression), compression_per_level(options.compression_per_level), compression_opts(options.compression_opts), - filter_policy(options.filter_policy), prefix_extractor(options.prefix_extractor), - whole_key_filtering(options.whole_key_filtering), num_levels(options.num_levels), level0_file_num_compaction_trigger( options.level0_file_num_compaction_trigger), @@ -123,16 +141,13 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options) expanded_compaction_factor(options.expanded_compaction_factor), source_compaction_factor(options.source_compaction_factor), 
max_grandparent_overlap_factor(options.max_grandparent_overlap_factor), - disable_seek_compaction(options.disable_seek_compaction), soft_rate_limit(options.soft_rate_limit), hard_rate_limit(options.hard_rate_limit), rate_limit_delay_max_milliseconds( options.rate_limit_delay_max_milliseconds), - no_block_cache(options.no_block_cache), arena_block_size(options.arena_block_size), disable_auto_compactions(options.disable_auto_compactions), purge_redundant_kvs_while_flush(options.purge_redundant_kvs_while_flush), - block_size_deviation(options.block_size_deviation), compaction_style(options.compaction_style), verify_checksums_in_compaction(options.verify_checksums_in_compaction), compaction_options_universal(options.compaction_options_universal), @@ -175,7 +190,6 @@ DBOptions::DBOptions() statistics(nullptr), disableDataSync(false), use_fsync(false), - db_stats_log_interval(1800), db_log_dir(""), wal_dir(""), delete_obsolete_files_period_micros(6 * 60 * 60 * 1000000UL), @@ -216,7 +230,6 @@ DBOptions::DBOptions(const Options& options) statistics(options.statistics), disableDataSync(options.disableDataSync), use_fsync(options.use_fsync), - db_stats_log_interval(options.db_stats_log_interval), db_paths(options.db_paths), db_log_dir(options.db_log_dir), wal_dir(options.wal_dir), @@ -261,8 +274,8 @@ void DBOptions::Dump(Logger* log) const { Log(log, " Options.disableDataSync: %d", disableDataSync); Log(log, " Options.use_fsync: %d", use_fsync); Log(log, " Options.max_log_file_size: %zu", max_log_file_size); - Log(log, "Options.max_manifest_file_size: %lu", - (unsigned long)max_manifest_file_size); + Log(log, "Options.max_manifest_file_size: %" PRIu64, + max_manifest_file_size); Log(log, " Options.log_file_time_to_roll: %zu", log_file_time_to_roll); Log(log, " Options.keep_log_file_num: %zu", keep_log_file_num); Log(log, " Options.allow_os_buffer: %d", allow_os_buffer); @@ -278,16 +291,16 @@ void DBOptions::Dump(Logger* log) const { table_cache_numshardbits); Log(log, " 
Options.table_cache_remove_scan_count_limit: %d", table_cache_remove_scan_count_limit); - Log(log, " Options.delete_obsolete_files_period_micros: %lu", - (unsigned long)delete_obsolete_files_period_micros); + Log(log, " Options.delete_obsolete_files_period_micros: %" PRIu64, + delete_obsolete_files_period_micros); Log(log, " Options.max_background_compactions: %d", max_background_compactions); Log(log, " Options.max_background_flushes: %d", max_background_flushes); - Log(log, " Options.WAL_ttl_seconds: %lu", - (unsigned long)WAL_ttl_seconds); - Log(log, " Options.WAL_size_limit_MB: %lu", - (unsigned long)WAL_size_limit_MB); + Log(log, " Options.WAL_ttl_seconds: %" PRIu64, + WAL_ttl_seconds); + Log(log, " Options.WAL_size_limit_MB: %" PRIu64, + WAL_size_limit_MB); Log(log, " Options.manifest_preallocation_size: %zu", manifest_preallocation_size); Log(log, " Options.allow_os_buffer: %d", @@ -310,8 +323,8 @@ void DBOptions::Dump(Logger* log) const { use_adaptive_mutex); Log(log, " Options.rate_limiter: %p", rate_limiter.get()); - Log(log, " Options.bytes_per_sync: %lu", - (unsigned long)bytes_per_sync); + Log(log, " Options.bytes_per_sync: %" PRIu64, + bytes_per_sync); } // DBOptions::Dump void ColumnFamilyOptions::Dump(Logger* log) const { @@ -326,21 +339,10 @@ void ColumnFamilyOptions::Dump(Logger* log) const { compaction_filter_factory_v2->Name()); Log(log, " Options.memtable_factory: %s", memtable_factory->Name()); Log(log, " Options.table_factory: %s", table_factory->Name()); + Log(log, " table_factory options: %s", + table_factory->GetPrintableTableOptions().c_str()); Log(log, " Options.write_buffer_size: %zd", write_buffer_size); Log(log, " Options.max_write_buffer_number: %d", max_write_buffer_number); - Log(log," Options.block_cache: %p", block_cache.get()); - Log(log," Options.block_cache_compressed: %p", - block_cache_compressed.get()); - if (block_cache) { - Log(log," Options.block_cache_size: %zd", - block_cache->GetCapacity()); - } - if 
(block_cache_compressed) { - Log(log,"Options.block_cache_compressed_size: %zd", - block_cache_compressed->GetCapacity()); - } - Log(log," Options.block_size: %zd", block_size); - Log(log," Options.block_restart_interval: %d", block_restart_interval); if (!compression_per_level.empty()) { for (unsigned int i = 0; i < compression_per_level.size(); i++) { Log(log," Options.compression[%d]: %d", @@ -349,11 +351,8 @@ void ColumnFamilyOptions::Dump(Logger* log) const { } else { Log(log," Options.compression: %d", compression); } - Log(log," Options.filter_policy: %s", - filter_policy == nullptr ? "nullptr" : filter_policy->Name()); Log(log," Options.prefix_extractor: %s", prefix_extractor == nullptr ? "nullptr" : prefix_extractor->Name()); - Log(log," Options.whole_key_filtering: %d", whole_key_filtering); Log(log," Options.num_levels: %d", num_levels); Log(log," Options.min_write_buffer_number_to_merge: %d", min_write_buffer_number_to_merge); @@ -373,28 +372,26 @@ void ColumnFamilyOptions::Dump(Logger* log) const { level0_stop_writes_trigger); Log(log," Options.max_mem_compaction_level: %d", max_mem_compaction_level); - Log(log," Options.target_file_size_base: %d", + Log(log," Options.target_file_size_base: %" PRIu64, target_file_size_base); Log(log," Options.target_file_size_multiplier: %d", target_file_size_multiplier); - Log(log," Options.max_bytes_for_level_base: %lu", - (unsigned long)max_bytes_for_level_base); + Log(log," Options.max_bytes_for_level_base: %" PRIu64, + max_bytes_for_level_base); Log(log," Options.max_bytes_for_level_multiplier: %d", max_bytes_for_level_multiplier); for (int i = 0; i < num_levels; i++) { Log(log,"Options.max_bytes_for_level_multiplier_addtl[%d]: %d", i, max_bytes_for_level_multiplier_additional[i]); } - Log(log," Options.max_sequential_skip_in_iterations: %lu", - (unsigned long)max_sequential_skip_in_iterations); + Log(log," Options.max_sequential_skip_in_iterations: %" PRIu64, + max_sequential_skip_in_iterations); Log(log," 
Options.expanded_compaction_factor: %d", expanded_compaction_factor); Log(log," Options.source_compaction_factor: %d", source_compaction_factor); Log(log," Options.max_grandparent_overlap_factor: %d", max_grandparent_overlap_factor); - Log(log," Options.no_block_cache: %d", - no_block_cache); Log(log," Options.arena_block_size: %zu", arena_block_size); Log(log," Options.soft_rate_limit: %.2f", @@ -407,8 +404,6 @@ void ColumnFamilyOptions::Dump(Logger* log) const { disable_auto_compactions); Log(log," Options.purge_redundant_kvs_while_flush: %d", purge_redundant_kvs_while_flush); - Log(log," Options.block_size_deviation: %d", - block_size_deviation); Log(log," Options.filter_deletes: %d", filter_deletes); Log(log, " Options.verify_checksums_in_compaction: %d", @@ -425,7 +420,7 @@ void ColumnFamilyOptions::Dump(Logger* log) const { "max_size_amplification_percent: %u", compaction_options_universal.max_size_amplification_percent); Log(log, - "Options.compaction_options_universal.compression_size_percent: %u", + "Options.compaction_options_universal.compression_size_percent: %d", compaction_options_universal.compression_size_percent); Log(log, "Options.compaction_options_fifo.max_table_files_size: %" PRIu64, compaction_options_fifo.max_table_files_size); @@ -497,10 +492,14 @@ Options::PrepareForBulkLoad() } // Optimization functions -ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup() { +ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup( + uint64_t block_cache_size_mb) { prefix_extractor.reset(NewNoopTransform()); BlockBasedTableOptions block_based_options; - block_based_options.index_type = BlockBasedTableOptions::kBinarySearch; + block_based_options.index_type = BlockBasedTableOptions::kHashSearch; + block_based_options.filter_policy.reset(NewBloomFilterPolicy(10)); + block_based_options.block_cache = + NewLRUCache(block_cache_size_mb * 1024 * 1024); table_factory.reset(new BlockBasedTableFactory(block_based_options)); #ifndef 
ROCKSDB_LITE memtable_factory.reset(NewHashLinkListRepFactory()); diff --git a/util/options_helper.cc b/util/options_helper.cc new file mode 100644 index 00000000000..d552a2b9e7b --- /dev/null +++ b/util/options_helper.cc @@ -0,0 +1,318 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include +#include "rocksdb/options.h" +#include "util/options_helper.h" + +namespace rocksdb { + +namespace { +CompressionType ParseCompressionType(const std::string& type) { + if (type == "kNoCompression") { + return kNoCompression; + } else if (type == "kSnappyCompression") { + return kSnappyCompression; + } else if (type == "kZlibCompression") { + return kZlibCompression; + } else if (type == "kBZip2Compression") { + return kBZip2Compression; + } else if (type == "kLZ4Compression") { + return kLZ4Compression; + } else if (type == "kLZ4HCCompression") { + return kLZ4HCCompression; + } else { + throw "unknown compression type: " + type; + } + return kNoCompression; +} + +bool ParseBoolean(const std::string& type, const std::string& value) { + if (value == "true" || value == "1") { + return true; + } else if (value == "false" || value == "0") { + return false; + } else { + throw type; + } +} +uint32_t ParseInt(const std::string& value) { + return std::stoi(value); +} + +uint32_t ParseUint32(const std::string& value) { + return std::stoul(value); +} + +uint64_t ParseUint64(const std::string& value) { + return std::stoull(value); +} + +int64_t ParseInt64(const std::string& value) { + return std::stol(value); +} + +double ParseDouble(const std::string& value) { + return std::stod(value); +} + +CompactionStyle ParseCompactionStyle(const std::string& type) { + if (type == "kCompactionStyleLevel") { + return kCompactionStyleLevel; + } else if 
(type == "kCompactionStyleUniversal") { + return kCompactionStyleUniversal; + } else if (type == "kCompactionStyleFIFO") { + return kCompactionStyleFIFO; + } else { + throw "unknown compaction style: " + type; + } + return kCompactionStyleLevel; +} +} // anonymous namespace + +template +bool ParseMemtableOption(const std::string& name, const std::string& value, + OptionsType* new_options) { + if (name == "write_buffer_size") { + new_options->write_buffer_size = ParseInt64(value); + } else if (name == "arena_block_size") { + new_options->arena_block_size = ParseInt64(value); + } else if (name == "memtable_prefix_bloom_bits") { + new_options->memtable_prefix_bloom_bits = stoul(value); + } else if (name == "memtable_prefix_bloom_probes") { + new_options->memtable_prefix_bloom_probes = stoul(value); + } else if (name == "memtable_prefix_bloom_huge_page_tlb_size") { + new_options->memtable_prefix_bloom_huge_page_tlb_size = + ParseInt64(value); + } else if (name == "max_successive_merges") { + new_options->max_successive_merges = ParseInt64(value); + } else if (name == "filter_deletes") { + new_options->filter_deletes = ParseBoolean(name, value); + } else { + return false; + } + return true; +} + +bool GetMutableOptionsFromStrings( + const MutableCFOptions& base_options, + const std::unordered_map& options_map, + MutableCFOptions* new_options) { + assert(new_options); + *new_options = base_options; + try { + for (const auto& o : options_map) { + if (ParseMemtableOption(o.first, o.second, new_options)) { + } else { + return false; + } + } + } catch (std::exception) { + return false; + } + return true; +} + +bool GetOptionsFromStrings( + const Options& base_options, + const std::unordered_map& options_map, + Options* new_options) { + assert(new_options); + *new_options = base_options; + for (const auto& o : options_map) { + try { + if (ParseMemtableOption(o.first, o.second, new_options)) { + } else if (o.first == "max_write_buffer_number") { + 
new_options->max_write_buffer_number = ParseInt(o.second); + } else if (o.first == "min_write_buffer_number_to_merge") { + new_options->min_write_buffer_number_to_merge = ParseInt(o.second); + } else if (o.first == "compression") { + new_options->compression = ParseCompressionType(o.second); + } else if (o.first == "compression_per_level") { + new_options->compression_per_level.clear(); + size_t start = 0; + while (true) { + size_t end = o.second.find_first_of(':', start); + if (end == std::string::npos) { + new_options->compression_per_level.push_back( + ParseCompressionType(o.second.substr(start))); + break; + } else { + new_options->compression_per_level.push_back( + ParseCompressionType(o.second.substr(start, end - start))); + start = end + 1; + } + } + } else if (o.first == "compression_opts") { + size_t start = 0; + size_t end = o.second.find_first_of(':'); + if (end == std::string::npos) { + throw o.first; + } + new_options->compression_opts.window_bits = + ParseInt(o.second.substr(start, end - start)); + start = end + 1; + end = o.second.find_first_of(':', start); + if (end == std::string::npos) { + throw o.first; + } + new_options->compression_opts.level = + ParseInt(o.second.substr(start, end - start)); + start = end + 1; + if (start >= o.second.size()) { + throw o.first; + } + new_options->compression_opts.strategy = + ParseInt(o.second.substr(start, o.second.size() - start)); + } else if (o.first == "num_levels") { + new_options->num_levels = ParseInt(o.second); + } else if (o.first == "level0_file_num_compaction_trigger") { + new_options->level0_file_num_compaction_trigger = ParseInt(o.second); + } else if (o.first == "level0_slowdown_writes_trigger") { + new_options->level0_slowdown_writes_trigger = ParseInt(o.second); + } else if (o.first == "level0_stop_writes_trigger") { + new_options->level0_stop_writes_trigger = ParseInt(o.second); + } else if (o.first == "max_mem_compaction_level") { + new_options->max_mem_compaction_level = ParseInt(o.second); 
+ } else if (o.first == "target_file_size_base") { + new_options->target_file_size_base = ParseUint64(o.second); + } else if (o.first == "target_file_size_multiplier") { + new_options->target_file_size_multiplier = ParseInt(o.second); + } else if (o.first == "max_bytes_for_level_base") { + new_options->max_bytes_for_level_base = ParseUint64(o.second); + } else if (o.first == "max_bytes_for_level_multiplier") { + new_options->max_bytes_for_level_multiplier = ParseInt(o.second); + } else if (o.first == "max_bytes_for_level_multiplier_additional") { + new_options->max_bytes_for_level_multiplier_additional.clear(); + size_t start = 0; + while (true) { + size_t end = o.second.find_first_of(':', start); + if (end == std::string::npos) { + new_options->max_bytes_for_level_multiplier_additional.push_back( + ParseInt(o.second.substr(start))); + break; + } else { + new_options->max_bytes_for_level_multiplier_additional.push_back( + ParseInt(o.second.substr(start, end - start))); + start = end + 1; + } + } + } else if (o.first == "expanded_compaction_factor") { + new_options->expanded_compaction_factor = ParseInt(o.second); + } else if (o.first == "source_compaction_factor") { + new_options->source_compaction_factor = ParseInt(o.second); + } else if (o.first == "max_grandparent_overlap_factor") { + new_options->max_grandparent_overlap_factor = ParseInt(o.second); + } else if (o.first == "soft_rate_limit") { + new_options->soft_rate_limit = ParseDouble(o.second); + } else if (o.first == "hard_rate_limit") { + new_options->hard_rate_limit = ParseDouble(o.second); + } else if (o.first == "disable_auto_compactions") { + new_options->disable_auto_compactions = ParseBoolean(o.first, o.second); + } else if (o.first == "purge_redundant_kvs_while_flush") { + new_options->purge_redundant_kvs_while_flush = + ParseBoolean(o.first, o.second); + } else if (o.first == "compaction_style") { + new_options->compaction_style = ParseCompactionStyle(o.second); + } else if (o.first == 
"verify_checksums_in_compaction") { + new_options->verify_checksums_in_compaction = + ParseBoolean(o.first, o.second); + } else if (o.first == "compaction_options_universal") { + // TODO(ljin): add support + throw o.first; + } else if (o.first == "compaction_options_fifo") { + new_options->compaction_options_fifo.max_table_files_size + = ParseUint64(o.second); + } else if (o.first == "max_sequential_skip_in_iterations") { + new_options->max_sequential_skip_in_iterations = ParseUint64(o.second); + } else if (o.first == "inplace_update_support") { + new_options->inplace_update_support = ParseBoolean(o.first, o.second); + } else if (o.first == "inplace_update_num_locks") { + new_options->inplace_update_num_locks = ParseInt64(o.second); + } else if (o.first == "bloom_locality") { + new_options->bloom_locality = ParseUint32(o.second); + } else if (o.first == "min_partial_merge_operands") { + new_options->min_partial_merge_operands = ParseUint32(o.second); + } else if (o.first == "create_if_missing") { + new_options->create_if_missing = ParseBoolean(o.first, o.second); + } else if (o.first == "create_missing_column_families") { + new_options->create_missing_column_families = + ParseBoolean(o.first, o.second); + } else if (o.first == "error_if_exists") { + new_options->error_if_exists = ParseBoolean(o.first, o.second); + } else if (o.first == "paranoid_checks") { + new_options->paranoid_checks = ParseBoolean(o.first, o.second); + } else if (o.first == "max_open_files") { + new_options->max_open_files = ParseInt(o.second); + } else if (o.first == "max_total_wal_size") { + new_options->max_total_wal_size = ParseUint64(o.second); + } else if (o.first == "disable_data_sync") { + new_options->disableDataSync = ParseBoolean(o.first, o.second); + } else if (o.first == "use_fsync") { + new_options->use_fsync = ParseBoolean(o.first, o.second); + } else if (o.first == "db_paths") { + // TODO(ljin): add support + throw o.first; + } else if (o.first == "db_log_dir") { + 
new_options->db_log_dir = o.second; + } else if (o.first == "wal_dir") { + new_options->wal_dir = o.second; + } else if (o.first == "delete_obsolete_files_period_micros") { + new_options->delete_obsolete_files_period_micros = + ParseUint64(o.second); + } else if (o.first == "max_background_compactions") { + new_options->max_background_compactions = ParseInt(o.second); + } else if (o.first == "max_background_flushes") { + new_options->max_background_flushes = ParseInt(o.second); + } else if (o.first == "max_log_file_size") { + new_options->max_log_file_size = ParseInt64(o.second); + } else if (o.first == "log_file_time_to_roll") { + new_options->log_file_time_to_roll = ParseInt64(o.second); + } else if (o.first == "keep_log_file_num") { + new_options->keep_log_file_num = ParseInt64(o.second); + } else if (o.first == "max_manifest_file_size") { + new_options->max_manifest_file_size = ParseUint64(o.second); + } else if (o.first == "table_cache_numshardbits") { + new_options->table_cache_numshardbits = ParseInt(o.second); + } else if (o.first == "table_cache_remove_scan_count_limit") { + new_options->table_cache_remove_scan_count_limit = ParseInt(o.second); + } else if (o.first == "WAL_ttl_seconds") { + new_options->WAL_ttl_seconds = ParseUint64(o.second); + } else if (o.first == "WAL_size_limit_MB") { + new_options->WAL_size_limit_MB = ParseUint64(o.second); + } else if (o.first == "manifest_preallocation_size") { + new_options->manifest_preallocation_size = ParseInt64(o.second); + } else if (o.first == "allow_os_buffer") { + new_options->allow_os_buffer = ParseBoolean(o.first, o.second); + } else if (o.first == "allow_mmap_reads") { + new_options->allow_mmap_reads = ParseBoolean(o.first, o.second); + } else if (o.first == "allow_mmap_writes") { + new_options->allow_mmap_writes = ParseBoolean(o.first, o.second); + } else if (o.first == "is_fd_close_on_exec") { + new_options->is_fd_close_on_exec = ParseBoolean(o.first, o.second); + } else if (o.first == 
"skip_log_error_on_recovery") { + new_options->skip_log_error_on_recovery = + ParseBoolean(o.first, o.second); + } else if (o.first == "stats_dump_period_sec") { + new_options->stats_dump_period_sec = ParseUint32(o.second); + } else if (o.first == "advise_random_on_open") { + new_options->advise_random_on_open = ParseBoolean(o.first, o.second); + } else if (o.first == "use_adaptive_mutex") { + new_options->use_adaptive_mutex = ParseBoolean(o.first, o.second); + } else if (o.first == "allow_thread_local") { + new_options->allow_thread_local = ParseBoolean(o.first, o.second); + } else if (o.first == "bytes_per_sync") { + new_options->bytes_per_sync = ParseUint64(o.second); + } else { + return false; + } + } catch (std::exception) { + return false; + } + } + return true; +} + +} // namespace rocksdb diff --git a/util/options_helper.h b/util/options_helper.h new file mode 100644 index 00000000000..c04d2a5d7c7 --- /dev/null +++ b/util/options_helper.h @@ -0,0 +1,18 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include "util/mutable_cf_options.h" + +namespace rocksdb { + +bool GetMutableOptionsFromStrings( + const MutableCFOptions& base_options, + const std::unordered_map& options_map, + MutableCFOptions* new_options); + +} // namespace rocksdb diff --git a/util/options_test.cc b/util/options_test.cc index be07a83f529..eee285e2a5d 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -7,7 +7,11 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + +#include #include #include @@ -72,6 +76,180 @@ TEST(OptionsTest, LooseCondition) { // Both tight amplifications PrintAndGetOptions(128 * 1024 * 1024, 4, 8); } + +TEST(OptionsTest, GetOptionsFromStringsTest) { + std::unordered_map options_map = { + {"write_buffer_size", "1"}, + {"max_write_buffer_number", "2"}, + {"min_write_buffer_number_to_merge", "3"}, + {"compression", "kSnappyCompression"}, + {"compression_per_level", "kNoCompression:" + "kSnappyCompression:" + "kZlibCompression:" + "kBZip2Compression:" + "kLZ4Compression:" + "kLZ4HCCompression"}, + {"compression_opts", "4:5:6"}, + {"num_levels", "7"}, + {"level0_file_num_compaction_trigger", "8"}, + {"level0_slowdown_writes_trigger", "9"}, + {"level0_stop_writes_trigger", "10"}, + {"max_mem_compaction_level", "11"}, + {"target_file_size_base", "12"}, + {"target_file_size_multiplier", "13"}, + {"max_bytes_for_level_base", "14"}, + {"max_bytes_for_level_multiplier", "15"}, + {"max_bytes_for_level_multiplier_additional", "16:17:18"}, + {"expanded_compaction_factor", "19"}, + {"source_compaction_factor", "20"}, + {"max_grandparent_overlap_factor", "21"}, + {"soft_rate_limit", "1.1"}, + {"hard_rate_limit", "2.1"}, + {"arena_block_size", "22"}, + {"disable_auto_compactions", "true"}, + {"purge_redundant_kvs_while_flush", "1"}, + {"compaction_style", "kCompactionStyleLevel"}, + {"verify_checksums_in_compaction", "false"}, + {"compaction_options_fifo", "23"}, + {"filter_deletes", "0"}, + {"max_sequential_skip_in_iterations", "24"}, + {"inplace_update_support", "true"}, + {"inplace_update_num_locks", "25"}, + {"memtable_prefix_bloom_bits", "26"}, + {"memtable_prefix_bloom_probes", "27"}, + {"memtable_prefix_bloom_huge_page_tlb_size", "28"}, + {"bloom_locality", "29"}, + {"max_successive_merges", "30"}, + {"min_partial_merge_operands", "31"}, + {"create_if_missing", "false"}, + {"create_missing_column_families", "true"}, + {"error_if_exists", 
"false"}, + {"paranoid_checks", "true"}, + {"max_open_files", "32"}, + {"max_total_wal_size", "33"}, + {"disable_data_sync", "false"}, + {"use_fsync", "true"}, + {"db_log_dir", "/db_log_dir"}, + {"wal_dir", "/wal_dir"}, + {"delete_obsolete_files_period_micros", "34"}, + {"max_background_compactions", "35"}, + {"max_background_flushes", "36"}, + {"max_log_file_size", "37"}, + {"log_file_time_to_roll", "38"}, + {"keep_log_file_num", "39"}, + {"max_manifest_file_size", "40"}, + {"table_cache_numshardbits", "41"}, + {"table_cache_remove_scan_count_limit", "42"}, + {"WAL_ttl_seconds", "43"}, + {"WAL_size_limit_MB", "44"}, + {"manifest_preallocation_size", "45"}, + {"allow_os_buffer", "false"}, + {"allow_mmap_reads", "true"}, + {"allow_mmap_writes", "false"}, + {"is_fd_close_on_exec", "true"}, + {"skip_log_error_on_recovery", "false"}, + {"stats_dump_period_sec", "46"}, + {"advise_random_on_open", "true"}, + {"use_adaptive_mutex", "false"}, + {"allow_thread_local", "true"}, + {"bytes_per_sync", "47"}, + }; + + Options base_opt; + Options new_opt; + ASSERT_TRUE(GetOptionsFromStrings(base_opt, options_map, &new_opt)); + ASSERT_EQ(new_opt.write_buffer_size, 1U); + ASSERT_EQ(new_opt.max_write_buffer_number, 2); + ASSERT_EQ(new_opt.min_write_buffer_number_to_merge, 3); + ASSERT_EQ(new_opt.compression, kSnappyCompression); + ASSERT_EQ(new_opt.compression_per_level.size(), 6U); + ASSERT_EQ(new_opt.compression_per_level[0], kNoCompression); + ASSERT_EQ(new_opt.compression_per_level[1], kSnappyCompression); + ASSERT_EQ(new_opt.compression_per_level[2], kZlibCompression); + ASSERT_EQ(new_opt.compression_per_level[3], kBZip2Compression); + ASSERT_EQ(new_opt.compression_per_level[4], kLZ4Compression); + ASSERT_EQ(new_opt.compression_per_level[5], kLZ4HCCompression); + ASSERT_EQ(new_opt.compression_opts.window_bits, 4); + ASSERT_EQ(new_opt.compression_opts.level, 5); + ASSERT_EQ(new_opt.compression_opts.strategy, 6); + ASSERT_EQ(new_opt.num_levels, 7); + 
ASSERT_EQ(new_opt.level0_file_num_compaction_trigger, 8); + ASSERT_EQ(new_opt.level0_slowdown_writes_trigger, 9); + ASSERT_EQ(new_opt.level0_stop_writes_trigger, 10); + ASSERT_EQ(new_opt.max_mem_compaction_level, 11); + ASSERT_EQ(new_opt.target_file_size_base, static_cast(12)); + ASSERT_EQ(new_opt.target_file_size_multiplier, 13); + ASSERT_EQ(new_opt.max_bytes_for_level_base, 14U); + ASSERT_EQ(new_opt.max_bytes_for_level_multiplier, 15); + ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional.size(), 3U); + ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional[0], 16); + ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional[1], 17); + ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional[2], 18); + ASSERT_EQ(new_opt.expanded_compaction_factor, 19); + ASSERT_EQ(new_opt.source_compaction_factor, 20); + ASSERT_EQ(new_opt.max_grandparent_overlap_factor, 21); + ASSERT_EQ(new_opt.soft_rate_limit, 1.1); + ASSERT_EQ(new_opt.hard_rate_limit, 2.1); + ASSERT_EQ(new_opt.arena_block_size, 22U); + ASSERT_EQ(new_opt.disable_auto_compactions, true); + ASSERT_EQ(new_opt.purge_redundant_kvs_while_flush, true); + ASSERT_EQ(new_opt.compaction_style, kCompactionStyleLevel); + ASSERT_EQ(new_opt.verify_checksums_in_compaction, false); + ASSERT_EQ(new_opt.compaction_options_fifo.max_table_files_size, + static_cast(23)); + ASSERT_EQ(new_opt.filter_deletes, false); + ASSERT_EQ(new_opt.max_sequential_skip_in_iterations, + static_cast(24)); + ASSERT_EQ(new_opt.inplace_update_support, true); + ASSERT_EQ(new_opt.inplace_update_num_locks, 25U); + ASSERT_EQ(new_opt.memtable_prefix_bloom_bits, 26U); + ASSERT_EQ(new_opt.memtable_prefix_bloom_probes, 27U); + ASSERT_EQ(new_opt.memtable_prefix_bloom_huge_page_tlb_size, 28U); + ASSERT_EQ(new_opt.bloom_locality, 29U); + ASSERT_EQ(new_opt.max_successive_merges, 30U); + ASSERT_EQ(new_opt.min_partial_merge_operands, 31U); + ASSERT_EQ(new_opt.create_if_missing, false); + ASSERT_EQ(new_opt.create_missing_column_families, true); + 
ASSERT_EQ(new_opt.error_if_exists, false); + ASSERT_EQ(new_opt.paranoid_checks, true); + ASSERT_EQ(new_opt.max_open_files, 32); + ASSERT_EQ(new_opt.max_total_wal_size, static_cast(33)); + ASSERT_EQ(new_opt.disableDataSync, false); + ASSERT_EQ(new_opt.use_fsync, true); + ASSERT_EQ(new_opt.db_log_dir, "/db_log_dir"); + ASSERT_EQ(new_opt.wal_dir, "/wal_dir"); + ASSERT_EQ(new_opt.delete_obsolete_files_period_micros, + static_cast(34)); + ASSERT_EQ(new_opt.max_background_compactions, 35); + ASSERT_EQ(new_opt.max_background_flushes, 36); + ASSERT_EQ(new_opt.max_log_file_size, 37U); + ASSERT_EQ(new_opt.log_file_time_to_roll, 38U); + ASSERT_EQ(new_opt.keep_log_file_num, 39U); + ASSERT_EQ(new_opt.max_manifest_file_size, static_cast(40)); + ASSERT_EQ(new_opt.table_cache_numshardbits, 41); + ASSERT_EQ(new_opt.table_cache_remove_scan_count_limit, 42); + ASSERT_EQ(new_opt.WAL_ttl_seconds, static_cast(43)); + ASSERT_EQ(new_opt.WAL_size_limit_MB, static_cast(44)); + ASSERT_EQ(new_opt.manifest_preallocation_size, 45U); + ASSERT_EQ(new_opt.allow_os_buffer, false); + ASSERT_EQ(new_opt.allow_mmap_reads, true); + ASSERT_EQ(new_opt.allow_mmap_writes, false); + ASSERT_EQ(new_opt.is_fd_close_on_exec, true); + ASSERT_EQ(new_opt.skip_log_error_on_recovery, false); + ASSERT_EQ(new_opt.stats_dump_period_sec, 46U); + ASSERT_EQ(new_opt.advise_random_on_open, true); + ASSERT_EQ(new_opt.use_adaptive_mutex, false); + ASSERT_EQ(new_opt.allow_thread_local, true); + ASSERT_EQ(new_opt.bytes_per_sync, static_cast(47)); + + options_map["write_buffer_size"] = "hello"; + ASSERT_TRUE(!GetOptionsFromStrings(base_opt, options_map, &new_opt)); + options_map["write_buffer_size"] = "1"; + ASSERT_TRUE(GetOptionsFromStrings(base_opt, options_map, &new_opt)); + options_map["unknown_option"] = "1"; + ASSERT_TRUE(!GetOptionsFromStrings(base_opt, options_map, &new_opt)); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/util/perf_context_imp.h b/util/perf_context_imp.h index 
dc4ae95e5c0..e397901056f 100644 --- a/util/perf_context_imp.h +++ b/util/perf_context_imp.h @@ -11,11 +11,10 @@ namespace rocksdb { #if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE) -#define PERF_TIMER_DECLARE() -#define PERF_TIMER_START(metric) -#define PERF_TIMER_AUTO(metric) +#define PERF_TIMER_GUARD(metric) #define PERF_TIMER_MEASURE(metric) #define PERF_TIMER_STOP(metric) +#define PERF_TIMER_START(metric) #define PERF_COUNTER_ADD(metric, value) #else @@ -24,10 +23,15 @@ extern __thread PerfLevel perf_level; class PerfStepTimer { public: - PerfStepTimer() + PerfStepTimer(uint64_t* metric) : enabled_(perf_level >= PerfLevel::kEnableTime), env_(enabled_ ? Env::Default() : nullptr), - start_(0) { + start_(0), + metric_(metric) { + } + + ~PerfStepTimer() { + Stop(); } void Start() { @@ -36,17 +40,17 @@ class PerfStepTimer { } } - void Measure(uint64_t* metric) { + void Measure() { if (start_) { uint64_t now = env_->NowNanos(); - *metric += now - start_; + *metric_ += now - start_; start_ = now; } } - void Stop(uint64_t* metric) { + void Stop() { if (start_) { - *metric += env_->NowNanos() - start_; + *metric_ += env_->NowNanos() - start_; start_ = 0; } } @@ -55,29 +59,25 @@ class PerfStepTimer { const bool enabled_; Env* const env_; uint64_t start_; + uint64_t* metric_; }; -// Declare the local timer object to be used later on -#define PERF_TIMER_DECLARE() \ - PerfStepTimer perf_step_timer; +// Stop the timer and update the metric +#define PERF_TIMER_STOP(metric) \ + perf_step_timer_ ## metric.Stop(); -// Set start time of the timer #define PERF_TIMER_START(metric) \ - perf_step_timer.Start(); + perf_step_timer_ ## metric.Start(); // Declare and set start time of the timer -#define PERF_TIMER_AUTO(metric) \ - PerfStepTimer perf_step_timer; \ - perf_step_timer.Start(); +#define PERF_TIMER_GUARD(metric) \ + PerfStepTimer perf_step_timer_ ## metric(&(perf_context.metric)); \ + perf_step_timer_ ## metric.Start(); // Update metric with time elapsed since last 
START. start time is reset // to current timestamp. #define PERF_TIMER_MEASURE(metric) \ - perf_step_timer.Measure(&(perf_context.metric)); - -// Update metric with time elapsed since last START. But start time is not set. -#define PERF_TIMER_STOP(metric) \ - perf_step_timer.Stop(&(perf_context.metric)); + perf_step_timer_ ## metric.Measure(); // Increase metric value #define PERF_COUNTER_ADD(metric, value) \ diff --git a/util/rate_limiter.cc b/util/rate_limiter.cc index cde86f3c9e6..47f96de84bc 100644 --- a/util/rate_limiter.cc +++ b/util/rate_limiter.cc @@ -60,7 +60,7 @@ GenericRateLimiter::~GenericRateLimiter() { } void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri) { - assert(bytes < refill_bytes_per_period_); + assert(bytes <= refill_bytes_per_period_); MutexLock g(&request_mutex_); if (stop_) { diff --git a/util/rate_limiter_test.cc b/util/rate_limiter_test.cc index 1b72e4ed0b2..9d6cfb7e6a0 100644 --- a/util/rate_limiter_test.cc +++ b/util/rate_limiter_test.cc @@ -7,7 +7,10 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include "util/testharness.h" diff --git a/util/scoped_arena_iterator.h b/util/scoped_arena_iterator.h new file mode 100644 index 00000000000..2021d2dc226 --- /dev/null +++ b/util/scoped_arena_iterator.h @@ -0,0 +1,28 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#pragma once + +#include "rocksdb/iterator.h" + +namespace rocksdb { +class ScopedArenaIterator { + public: + explicit ScopedArenaIterator(Iterator* iter = nullptr) : iter_(iter) {} + + Iterator* operator->() { return iter_; } + + void set(Iterator* iter) { iter_ = iter; } + + Iterator* get() { return iter_; } + + ~ScopedArenaIterator() { iter_->~Iterator(); } + + private: + Iterator* iter_; +}; +} // namespace rocksdb diff --git a/util/statistics.cc b/util/statistics.cc index 24957c9b6fc..9d828a6feb5 100644 --- a/util/statistics.cc +++ b/util/statistics.cc @@ -5,7 +5,10 @@ // #include "util/statistics.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include "rocksdb/statistics.h" #include "port/likely.h" diff --git a/util/testutil.cc b/util/testutil.cc index 13e781e6468..20f22c2dc45 100644 --- a/util/testutil.cc +++ b/util/testutil.cc @@ -9,6 +9,7 @@ #include "util/testutil.h" +#include "port/port.h" #include "util/random.h" namespace rocksdb { @@ -22,6 +23,15 @@ Slice RandomString(Random* rnd, int len, std::string* dst) { return Slice(*dst); } +extern std::string RandomHumanReadableString(Random* rnd, int len) { + std::string ret; + ret.resize(len); + for (int i = 0; i < len; ++i) { + ret[i] = static_cast('a' + rnd->Uniform(26)); + } + return ret; +} + std::string RandomKey(Random* rnd, int len) { // Make sure to generate a wide variety of characters so we // test the boundary conditions for short-key optimizations. 
@@ -52,5 +62,50 @@ extern Slice CompressibleString(Random* rnd, double compressed_fraction, return Slice(*dst); } +namespace { +class Uint64ComparatorImpl : public Comparator { + public: + Uint64ComparatorImpl() { } + + virtual const char* Name() const override { + return "rocksdb.Uint64Comparator"; + } + + virtual int Compare(const Slice& a, const Slice& b) const override { + assert(a.size() == sizeof(uint64_t) && b.size() == sizeof(uint64_t)); + const uint64_t* left = reinterpret_cast(a.data()); + const uint64_t* right = reinterpret_cast(b.data()); + if (*left == *right) { + return 0; + } else if (*left < *right) { + return -1; + } else { + return 1; + } + } + + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + return; + } + + virtual void FindShortSuccessor(std::string* key) const override { + return; + } +}; +} // namespace + +static port::OnceType once = LEVELDB_ONCE_INIT; +static const Comparator* uint64comp; + +static void InitModule() { + uint64comp = new Uint64ComparatorImpl; +} + +const Comparator* Uint64Comparator() { + port::InitOnce(&once, InitModule); + return uint64comp; +} + } // namespace test } // namespace rocksdb diff --git a/util/testutil.h b/util/testutil.h index 4fc8c0f5b32..eff0d7e7d8d 100644 --- a/util/testutil.h +++ b/util/testutil.h @@ -21,6 +21,8 @@ namespace test { // references the generated data. extern Slice RandomString(Random* rnd, int len, std::string* dst); +extern std::string RandomHumanReadableString(Random* rnd, int len); + // Return a random key with the specified length that may contain interesting // characters (e.g. \x00, \xff, etc.). extern std::string RandomKey(Random* rnd, int len); @@ -76,5 +78,12 @@ class PlainInternalKeyComparator : public InternalKeyComparator { } }; +// Returns a user key comparator that can be used for comparing two uint64_t +// slices. Instead of comparing slices byte-wise, it compares all the 8 bytes +// at once. 
Assumes same endian-ness is used though the database's lifetime. +// Symantics of comparison would differ from Bytewise comparator in little +// endian machines. +extern const Comparator* Uint64Comparator(); + } // namespace test } // namespace rocksdb diff --git a/util/vectorrep.cc b/util/vectorrep.cc index 599076c304f..e61b8ad085d 100644 --- a/util/vectorrep.cc +++ b/util/vectorrep.cc @@ -106,7 +106,6 @@ class VectorRep : public MemTableRep { void VectorRep::Insert(KeyHandle handle) { auto* key = static_cast(handle); - assert(!Contains(key)); WriteLock l(&rwlock_); assert(!immutable_); bucket_->push_back(key); diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 436f4c2d685..20ec9db85e6 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -15,7 +15,9 @@ #include "util/crc32c.h" #include "rocksdb/transaction_log.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif #include #include @@ -70,6 +72,27 @@ class BackupRateLimiter { }; } // namespace +void BackupStatistics::IncrementNumberSuccessBackup() { + number_success_backup++; +} +void BackupStatistics::IncrementNumberFailBackup() { + number_fail_backup++; +} + +uint32_t BackupStatistics::GetNumberSuccessBackup() const { + return number_success_backup; +} +uint32_t BackupStatistics::GetNumberFailBackup() const { + return number_fail_backup; +} + +std::string BackupStatistics::ToString() const { + char result[50]; + snprintf(result, sizeof(result), "# success backup: %u, # fail backup: %u", + GetNumberSuccessBackup(), GetNumberFailBackup()); + return result; +} + void BackupableDBOptions::Dump(Logger* logger) const { Log(logger, " Options.backup_dir: %s", backup_dir.c_str()); Log(logger, " Options.backup_env: %p", backup_env); @@ -142,6 +165,9 @@ class BackupEngineImpl : public BackupEngine { uint64_t GetSize() const { return size_; } + uint32_t GetNumberFiles() { + return files_.size(); + } void 
SetSequenceNumber(uint64_t sequence_number) { sequence_number_ = sequence_number; } @@ -286,6 +312,7 @@ class BackupEngineImpl : public BackupEngine { static const size_t kDefaultCopyFileBufferSize = 5 * 1024 * 1024LL; // 5MB size_t copy_file_buffer_size_; bool read_only_; + BackupStatistics backup_statistics_; }; BackupEngine* BackupEngine::NewBackupEngine( @@ -441,6 +468,8 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { new_backup.RecordTimestamp(); new_backup.SetSequenceNumber(sequence_number); + auto start_backup = backup_env_-> NowMicros(); + Log(options_.info_log, "Started the backup process -- creating backup %u", new_backup_id); @@ -505,6 +534,8 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { GetAbsolutePath(GetPrivateFileRel(new_backup_id, false))); } + auto backup_time = backup_env_->NowMicros() - start_backup; + if (s.ok()) { // persist the backup metadata on the disk s = new_backup.StoreToFile(options_.sync); @@ -535,9 +566,15 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { } } + if (s.ok()) { + backup_statistics_.IncrementNumberSuccessBackup(); + } if (!s.ok()) { + backup_statistics_.IncrementNumberFailBackup(); // clean all the files we might have created Log(options_.info_log, "Backup failed -- %s", s.ToString().c_str()); + Log(options_.info_log, "Backup Statistics %s\n", + backup_statistics_.ToString().c_str()); backups_.erase(new_backup_id); GarbageCollection(true); return s; @@ -547,6 +584,17 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { // in the LATEST_BACKUP file latest_backup_id_ = new_backup_id; Log(options_.info_log, "Backup DONE. 
All is good"); + + // backup_speed is in byte/second + double backup_speed = new_backup.GetSize() / (1.048576 * backup_time); + Log(options_.info_log, "Backup number of files: %u", + new_backup.GetNumberFiles()); + Log(options_.info_log, "Backup size: %" PRIu64 " bytes", + new_backup.GetSize()); + Log(options_.info_log, "Backup time: %" PRIu64 " microseconds", backup_time); + Log(options_.info_log, "Backup speed: %.3f MB/s", backup_speed); + Log(options_.info_log, "Backup Statistics %s", + backup_statistics_.ToString().c_str()); return s; } @@ -582,8 +630,9 @@ void BackupEngineImpl::GetBackupInfo(std::vector* backup_info) { backup_info->reserve(backups_.size()); for (auto& backup : backups_) { if (!backup.second.Empty()) { - backup_info->push_back(BackupInfo( - backup.first, backup.second.GetTimestamp(), backup.second.GetSize())); + backup_info->push_back(BackupInfo( + backup.first, backup.second.GetTimestamp(), backup.second.GetSize(), + backup.second.GetNumberFiles())); } } } diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 1d876cd5010..a585d1a9cfb 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -916,7 +916,7 @@ TEST(BackupableDBTest, RateLimiting) { auto backup_time = env_->NowMicros() - start_backup; auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) / backupable_options_->backup_rate_limit; - ASSERT_GT(backup_time, 0.9 * rate_limited_backup_time); + ASSERT_GT(backup_time, 0.8 * rate_limited_backup_time); CloseBackupableDB(); @@ -927,7 +927,7 @@ TEST(BackupableDBTest, RateLimiting) { CloseRestoreDB(); auto rate_limited_restore_time = (bytes_written * kMicrosPerSec) / backupable_options_->restore_rate_limit; - ASSERT_GT(restore_time, 0.9 * rate_limited_restore_time); + ASSERT_GT(restore_time, 0.8 * rate_limited_restore_time); AssertBackupConsistency(0, 0, 100000, 100010); } diff --git a/utilities/document/document_db.cc 
b/utilities/document/document_db.cc index 1e333f12992..8e15a52ca8b 100644 --- a/utilities/document/document_db.cc +++ b/utilities/document/document_db.cc @@ -8,6 +8,7 @@ #include "rocksdb/utilities/document_db.h" #include "rocksdb/cache.h" +#include "rocksdb/table.h" #include "rocksdb/filter_policy.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" @@ -735,6 +736,7 @@ class DocumentDBImpl : public DocumentDB { CreateColumnFamily(ColumnFamilyOptions(rocksdb_options_), InternalSecondaryIndexName(index.name), &cf_handle); if (!s.ok()) { + delete index_obj; return s; } @@ -1100,7 +1102,9 @@ Options GetRocksDBOptionsFromOptions(const DocumentDBOptions& options) { rocksdb_options.max_background_flushes = 1; rocksdb_options.write_buffer_size = options.memtable_size; rocksdb_options.max_write_buffer_number = 6; - rocksdb_options.block_cache = NewLRUCache(options.cache_size); + BlockBasedTableOptions table_options; + table_options.block_cache = NewLRUCache(options.cache_size); + rocksdb_options.table_factory.reset(NewBlockBasedTableFactory(table_options)); return rocksdb_options; } } // namespace diff --git a/utilities/document/json_document.cc b/utilities/document/json_document.cc index 641f4ee09ed..4368b759d58 100644 --- a/utilities/document/json_document.cc +++ b/utilities/document/json_document.cc @@ -6,7 +6,10 @@ #include "rocksdb/utilities/json_document.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include diff --git a/utilities/geodb/geodb_impl.cc b/utilities/geodb/geodb_impl.cc index f63c91c3e59..6c13fd69167 100644 --- a/utilities/geodb/geodb_impl.cc +++ b/utilities/geodb/geodb_impl.cc @@ -7,7 +7,9 @@ #include "utilities/geodb/geodb_impl.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif #include #include diff --git a/utilities/spatialdb/spatial_db.cc b/utilities/spatialdb/spatial_db.cc index a0dff51200b..9c44027c824 100644 --- a/utilities/spatialdb/spatial_db.cc +++ 
b/utilities/spatialdb/spatial_db.cc @@ -7,7 +7,10 @@ #include "rocksdb/utilities/spatial_db.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include @@ -16,6 +19,11 @@ #include #include "rocksdb/cache.h" +#include "rocksdb/options.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" #include "rocksdb/db.h" #include "rocksdb/utilities/stackable_db.h" #include "util/coding.h" @@ -214,6 +222,7 @@ std::string FeatureSet::DebugString() const { switch (iter.second.type()) { case Variant::kNull: out.append("null"); + break; case Variant::kBool: if (iter.second.get_bool()) { out.append("true"); @@ -243,13 +252,76 @@ std::string FeatureSet::DebugString() const { return out + "}"; } +class ValueGetter { + public: + ValueGetter() {} + virtual ~ValueGetter() {} + + virtual bool Get(uint64_t id) = 0; + virtual const Slice value() const = 0; + + virtual Status status() const = 0; +}; + +class ValueGetterFromDB : public ValueGetter { + public: + ValueGetterFromDB(DB* db, ColumnFamilyHandle* cf) : db_(db), cf_(cf) {} + + virtual bool Get(uint64_t id) override { + std::string encoded_id; + PutFixed64BigEndian(&encoded_id, id); + status_ = db_->Get(ReadOptions(), cf_, encoded_id, &value_); + if (status_.IsNotFound()) { + status_ = Status::Corruption("Index inconsistency"); + return false; + } + + return true; + } + + virtual const Slice value() const override { return value_; } + + virtual Status status() const override { return status_; } + + private: + std::string value_; + DB* db_; + ColumnFamilyHandle* cf_; + Status status_; +}; + +class ValueGetterFromIterator : public ValueGetter { + public: + explicit ValueGetterFromIterator(Iterator* iterator) : iterator_(iterator) {} + + virtual bool Get(uint64_t id) override { + std::string encoded_id; + PutFixed64BigEndian(&encoded_id, id); + iterator_->Seek(encoded_id); + + if (!iterator_->Valid() || 
iterator_->key() != Slice(encoded_id)) { + status_ = Status::Corruption("Index inconsistency"); + return false; + } + + return true; + } + + virtual const Slice value() const override { return iterator_->value(); } + + virtual Status status() const override { return status_; } + + private: + std::unique_ptr iterator_; + Status status_; +}; + class SpatialIndexCursor : public Cursor { public: // tile_box is inclusive - SpatialIndexCursor(Iterator* spatial_iterator, Iterator* data_iterator, + SpatialIndexCursor(Iterator* spatial_iterator, ValueGetter* value_getter, const BoundingBox& tile_bbox, uint32_t tile_bits) - : data_iterator_(data_iterator), - valid_(true) { + : value_getter_(value_getter), valid_(true) { // calculate quad keys we'll need to query std::vector quad_keys; quad_keys.reserve((tile_bbox.max_x - tile_bbox.min_x + 1) * @@ -328,7 +400,7 @@ class SpatialIndexCursor : public Cursor { if (!status_.ok()) { return status_; } - return data_iterator_->status(); + return value_getter_->status(); } private: @@ -355,32 +427,23 @@ class SpatialIndexCursor : public Cursor { return true; } - // doesn't return anything, but sets valid_ and status_ on corruption void ExtractData() { assert(valid_); - std::string encoded_id; - PutFixed64BigEndian(&encoded_id, *primary_keys_iterator_); + valid_ = value_getter_->Get(*primary_keys_iterator_); - data_iterator_->Seek(encoded_id); - - if (!data_iterator_->Valid() || - data_iterator_->key() != Slice(encoded_id)) { - status_ = Status::Corruption("Index inconsistency"); - valid_ = false; - return; + if (valid_) { + Slice data = value_getter_->value(); + current_feature_set_.Clear(); + if (!GetLengthPrefixedSlice(&data, ¤t_blob_) || + !current_feature_set_.Deserialize(data)) { + status_ = Status::Corruption("Primary key column family corruption"); + valid_ = false; + } } - Slice data = data_iterator_->value(); - current_feature_set_.Clear(); - if (!GetLengthPrefixedSlice(&data, ¤t_blob_) || - 
!current_feature_set_.Deserialize(data)) { - status_ = Status::Corruption("Primary key column family corruption"); - valid_ = false; - return; - } } - unique_ptr data_iterator_; + unique_ptr value_getter_; bool valid_; Status status_; @@ -426,10 +489,11 @@ class SpatialDBImpl : public SpatialDB { DB* db, ColumnFamilyHandle* data_column_family, const std::vector>& spatial_indexes, - uint64_t next_id) + uint64_t next_id, bool read_only) : SpatialDB(db), data_column_family_(data_column_family), - next_id_(next_id) { + next_id_(next_id), + read_only_(read_only) { for (const auto& index : spatial_indexes) { name_to_index_.insert( {index.first.name, IndexColumnFamily(index.first, index.second)}); @@ -453,6 +517,7 @@ class SpatialDBImpl : public SpatialDB { return Status::InvalidArgument("Spatial indexes can't be empty"); } + const size_t kWriteOutEveryBytes = 1024 * 1024; // 1MB uint64_t id = next_id_.fetch_add(1); for (const auto& si : spatial_indexes) { @@ -474,6 +539,13 @@ class SpatialDBImpl : public SpatialDB { &key, GetQuadKeyFromTile(x, y, spatial_index.tile_bits)); PutFixed64BigEndian(&key, id); batch.Put(itr->second.column_family, key, Slice()); + if (batch.GetDataSize() >= kWriteOutEveryBytes) { + Status s = Write(write_options, &batch); + batch.Clear(); + if (!s.ok()) { + return s; + } + } } } } @@ -490,6 +562,7 @@ class SpatialDBImpl : public SpatialDB { } virtual Status Compact() override { + // TODO(icanadi) maybe do this in parallel? 
Status s, t; for (auto& iter : name_to_index_) { t = Flush(FlushOptions(), iter.second.column_family); @@ -520,17 +593,26 @@ class SpatialDBImpl : public SpatialDB { return new ErrorCursor(Status::InvalidArgument( "Spatial index " + spatial_index + " not found")); } + const auto& si = itr->second.index; + Iterator* spatial_iterator; + ValueGetter* value_getter; - std::vector iterators; - Status s = NewIterators(read_options, - {data_column_family_, itr->second.column_family}, - &iterators); - if (!s.ok()) { - return new ErrorCursor(s); - } + if (read_only_) { + spatial_iterator = NewIterator(read_options, itr->second.column_family); + value_getter = new ValueGetterFromDB(this, data_column_family_); + } else { + std::vector iterators; + Status s = NewIterators(read_options, + {data_column_family_, itr->second.column_family}, + &iterators); + if (!s.ok()) { + return new ErrorCursor(s); + } - const auto& si = itr->second.index; - return new SpatialIndexCursor(iterators[1], iterators[0], + spatial_iterator = iterators[1]; + value_getter = new ValueGetterFromIterator(iterators[0]); + } + return new SpatialIndexCursor(spatial_iterator, value_getter, GetTileBoundingBox(si, bbox), si.tile_bits); } @@ -547,19 +629,67 @@ class SpatialDBImpl : public SpatialDB { std::unordered_map name_to_index_; std::atomic next_id_; + bool read_only_; }; namespace { -Options GetRocksDBOptionsFromOptions(const SpatialDBOptions& options) { - Options rocksdb_options; - rocksdb_options.OptimizeLevelStyleCompaction(); - rocksdb_options.IncreaseParallelism(options.num_threads); - rocksdb_options.block_cache = NewLRUCache(options.cache_size); +DBOptions GetDBOptions(const SpatialDBOptions& options) { + DBOptions db_options; + db_options.max_open_files = 50000; + db_options.max_background_compactions = 3 * options.num_threads / 4; + db_options.max_background_flushes = + options.num_threads - db_options.max_background_compactions; + 
db_options.env->SetBackgroundThreads(db_options.max_background_compactions, + Env::LOW); + db_options.env->SetBackgroundThreads(db_options.max_background_flushes, + Env::HIGH); + db_options.statistics = CreateDBStatistics(); if (options.bulk_load) { - rocksdb_options.PrepareForBulkLoad(); + db_options.stats_dump_period_sec = 600; + db_options.disableDataSync = true; + } else { + db_options.stats_dump_period_sec = 1800; // 30min } - return rocksdb_options; + return db_options; } + +ColumnFamilyOptions GetColumnFamilyOptions(const SpatialDBOptions& options, + std::shared_ptr block_cache) { + ColumnFamilyOptions column_family_options; + column_family_options.write_buffer_size = 128 * 1024 * 1024; // 128MB + column_family_options.max_write_buffer_number = 4; + column_family_options.max_bytes_for_level_base = 256 * 1024 * 1024; // 256MB + column_family_options.target_file_size_base = 64 * 1024 * 1024; // 64MB + column_family_options.level0_file_num_compaction_trigger = 2; + column_family_options.level0_slowdown_writes_trigger = 16; + column_family_options.level0_slowdown_writes_trigger = 32; + // only compress levels >= 2 + column_family_options.compression_per_level.resize( + column_family_options.num_levels); + for (int i = 0; i < column_family_options.num_levels; ++i) { + if (i < 2) { + column_family_options.compression_per_level[i] = kNoCompression; + } else { + column_family_options.compression_per_level[i] = kLZ4Compression; + } + } + BlockBasedTableOptions table_options; + table_options.block_cache = block_cache; + column_family_options.table_factory.reset( + NewBlockBasedTableFactory(table_options)); + return column_family_options; +} + +ColumnFamilyOptions OptimizeOptionsForDataColumnFamily( + ColumnFamilyOptions options, std::shared_ptr block_cache) { + options.prefix_extractor.reset(NewNoopTransform()); + BlockBasedTableOptions block_based_options; + block_based_options.index_type = BlockBasedTableOptions::kHashSearch; + block_based_options.block_cache = 
block_cache; + options.table_factory.reset(NewBlockBasedTableFactory(block_based_options)); + return options; +} + } // namespace class MetadataStorage { @@ -605,26 +735,30 @@ class MetadataStorage { Status SpatialDB::Create( const SpatialDBOptions& options, const std::string& name, const std::vector& spatial_indexes) { - Options rocksdb_options = GetRocksDBOptionsFromOptions(options); - rocksdb_options.create_if_missing = true; - rocksdb_options.create_missing_column_families = true; - rocksdb_options.error_if_exists = true; + DBOptions db_options = GetDBOptions(options); + db_options.create_if_missing = true; + db_options.create_missing_column_families = true; + db_options.error_if_exists = true; + + auto block_cache = NewLRUCache(options.cache_size); + ColumnFamilyOptions column_family_options = + GetColumnFamilyOptions(options, block_cache); std::vector column_families; column_families.push_back(ColumnFamilyDescriptor( - kDefaultColumnFamilyName, ColumnFamilyOptions(rocksdb_options))); - column_families.push_back(ColumnFamilyDescriptor( - kMetadataColumnFamilyName, ColumnFamilyOptions(rocksdb_options))); + kDefaultColumnFamilyName, + OptimizeOptionsForDataColumnFamily(column_family_options, block_cache))); + column_families.push_back( + ColumnFamilyDescriptor(kMetadataColumnFamilyName, column_family_options)); for (const auto& index : spatial_indexes) { column_families.emplace_back(GetSpatialIndexColumnFamilyName(index.name), - ColumnFamilyOptions(rocksdb_options)); + column_family_options); } std::vector handles; DB* base_db; - Status s = DB::Open(DBOptions(rocksdb_options), name, column_families, - &handles, &base_db); + Status s = DB::Open(db_options, name, column_families, &handles, &base_db); if (!s.ok()) { return s; } @@ -646,13 +780,15 @@ Status SpatialDB::Create( Status SpatialDB::Open(const SpatialDBOptions& options, const std::string& name, SpatialDB** db, bool read_only) { - Options rocksdb_options = GetRocksDBOptionsFromOptions(options); + DBOptions 
db_options = GetDBOptions(options); + auto block_cache = NewLRUCache(options.cache_size); + ColumnFamilyOptions column_family_options = + GetColumnFamilyOptions(options, block_cache); Status s; std::vector existing_column_families; std::vector spatial_indexes; - s = DB::ListColumnFamilies(DBOptions(rocksdb_options), name, - &existing_column_families); + s = DB::ListColumnFamilies(db_options, name, &existing_column_families); if (!s.ok()) { return s; } @@ -665,22 +801,22 @@ Status SpatialDB::Open(const SpatialDBOptions& options, const std::string& name, std::vector column_families; column_families.push_back(ColumnFamilyDescriptor( - kDefaultColumnFamilyName, ColumnFamilyOptions(rocksdb_options))); - column_families.push_back(ColumnFamilyDescriptor( - kMetadataColumnFamilyName, ColumnFamilyOptions(rocksdb_options))); + kDefaultColumnFamilyName, + OptimizeOptionsForDataColumnFamily(column_family_options, block_cache))); + column_families.push_back( + ColumnFamilyDescriptor(kMetadataColumnFamilyName, column_family_options)); for (const auto& index : spatial_indexes) { column_families.emplace_back(GetSpatialIndexColumnFamilyName(index), - ColumnFamilyOptions(rocksdb_options)); + column_family_options); } std::vector handles; DB* base_db; if (read_only) { - s = DB::OpenForReadOnly(DBOptions(rocksdb_options), name, column_families, - &handles, &base_db); + s = DB::OpenForReadOnly(db_options, name, column_families, &handles, + &base_db); } else { - s = DB::Open(DBOptions(rocksdb_options), name, column_families, &handles, - &base_db); + s = DB::Open(db_options, name, column_families, &handles, &base_db); } if (!s.ok()) { return s; @@ -717,13 +853,13 @@ Status SpatialDB::Open(const SpatialDBOptions& options, const std::string& name, for (auto h : handles) { delete h; } - delete db; + delete base_db; return s; } // I don't need metadata column family any more, so delete it delete handles[1]; - *db = new SpatialDBImpl(base_db, handles[0], index_cf, next_id); + *db = new 
SpatialDBImpl(base_db, handles[0], index_cf, next_id, read_only); return Status::OK(); } diff --git a/utilities/spatialdb/spatial_db_test.cc b/utilities/spatialdb/spatial_db_test.cc index 4cd2c8eed96..166920b57d9 100644 --- a/utilities/spatialdb/spatial_db_test.cc +++ b/utilities/spatialdb/spatial_db_test.cc @@ -151,41 +151,53 @@ TEST(SpatialDBTest, FeatureSetTest) { } TEST(SpatialDBTest, SimpleTest) { - ASSERT_OK(SpatialDB::Create( - SpatialDBOptions(), dbname_, - {SpatialIndexOptions("index", BoundingBox(0, 0, 128, 128), 3)})); - ASSERT_OK(SpatialDB::Open(SpatialDBOptions(), dbname_, &db_)); - - ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(33, 17, 63, 79), - "one", FeatureSet(), {"index"})); - ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(65, 65, 111, 111), - "two", FeatureSet(), {"index"})); - ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(1, 49, 127, 63), - "three", FeatureSet(), {"index"})); - ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(20, 100, 21, 101), - "four", FeatureSet(), {"index"})); - ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(81, 33, 127, 63), - "five", FeatureSet(), {"index"})); - ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(1, 65, 47, 95), - "six", FeatureSet(), {"index"})); - - AssertCursorResults(BoundingBox(33, 17, 47, 31), "index", {"one"}); - AssertCursorResults(BoundingBox(17, 33, 79, 63), "index", - {"one", "three"}); - AssertCursorResults(BoundingBox(17, 81, 63, 111), "index", - {"four", "six"}); - AssertCursorResults(BoundingBox(85, 86, 85, 86), "index", {"two"}); - AssertCursorResults(BoundingBox(33, 1, 127, 111), "index", - {"one", "two", "three", "five", "six"}); - // even though the bounding box doesn't intersect, we got "four" back because - // it's in the same tile - AssertCursorResults(BoundingBox(18, 98, 19, 99), "index", {"four"}); - AssertCursorResults(BoundingBox(130, 130, 131, 131), "index", {}); - AssertCursorResults(BoundingBox(81, 17, 127, 31), "index", {}); - AssertCursorResults(BoundingBox(90, 
50, 91, 51), "index", - {"three", "five"}); + // iter 0 -- not read only + // iter 1 -- read only + for (int iter = 0; iter < 2; ++iter) { + DestroyDB(dbname_, Options()); + ASSERT_OK(SpatialDB::Create( + SpatialDBOptions(), dbname_, + {SpatialIndexOptions("index", BoundingBox(0, 0, 128, 128), + 3)})); + ASSERT_OK(SpatialDB::Open(SpatialDBOptions(), dbname_, &db_)); + + ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(33, 17, 63, 79), + "one", FeatureSet(), {"index"})); + ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(65, 65, 111, 111), + "two", FeatureSet(), {"index"})); + ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(1, 49, 127, 63), + "three", FeatureSet(), {"index"})); + ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(20, 100, 21, 101), + "four", FeatureSet(), {"index"})); + ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(81, 33, 127, 63), + "five", FeatureSet(), {"index"})); + ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox(1, 65, 47, 95), + "six", FeatureSet(), {"index"})); + + if (iter == 1) { + delete db_; + ASSERT_OK(SpatialDB::Open(SpatialDBOptions(), dbname_, &db_, true)); + } - delete db_; + AssertCursorResults(BoundingBox(33, 17, 47, 31), "index", {"one"}); + AssertCursorResults(BoundingBox(17, 33, 79, 63), "index", + {"one", "three"}); + AssertCursorResults(BoundingBox(17, 81, 63, 111), "index", + {"four", "six"}); + AssertCursorResults(BoundingBox(85, 86, 85, 86), "index", {"two"}); + AssertCursorResults(BoundingBox(33, 1, 127, 111), "index", + {"one", "two", "three", "five", "six"}); + // even though the bounding box doesn't intersect, we got "four" back + // because + // it's in the same tile + AssertCursorResults(BoundingBox(18, 98, 19, 99), "index", {"four"}); + AssertCursorResults(BoundingBox(130, 130, 131, 131), "index", {}); + AssertCursorResults(BoundingBox(81, 17, 127, 31), "index", {}); + AssertCursorResults(BoundingBox(90, 50, 91, 51), "index", + {"three", "five"}); + + delete db_; + } } namespace { diff --git 
a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc new file mode 100644 index 00000000000..2caa2e4ccc6 --- /dev/null +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -0,0 +1,320 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "rocksdb/utilities/write_batch_with_index.h" +#include "rocksdb/comparator.h" +#include "db/column_family.h" +#include "db/skiplist.h" +#include "util/arena.h" + +namespace rocksdb { +namespace { +class ReadableWriteBatch : public WriteBatch { + public: + explicit ReadableWriteBatch(size_t reserved_bytes = 0) + : WriteBatch(reserved_bytes) {} + // Retrieve some information from a write entry in the write batch, given + // the start offset of the write entry. + Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key, + Slice* value, Slice* blob) const; +}; + +// Key used by skip list, as the binary searchable index of WriteBatchWithIndex. +struct WriteBatchIndexEntry { + WriteBatchIndexEntry(size_t o, uint32_t c) + : offset(o), column_family(c), search_key(nullptr) {} + WriteBatchIndexEntry(const Slice* sk, uint32_t c) + : offset(0), column_family(c), search_key(sk) {} + + size_t offset; // offset of an entry in write batch's string buffer. + uint32_t column_family; // column family of the entry + const Slice* search_key; // if not null, instead of reading keys from + // write batch, use it to compare. This is used + // for lookup key. 
+}; + +class WriteBatchEntryComparator { + public: + WriteBatchEntryComparator(const Comparator* default_comparator, + const ReadableWriteBatch* write_batch) + : default_comparator_(default_comparator), write_batch_(write_batch) {} + // Compare a and b. Return a negative value if a is less than b, 0 if they + // are equal, and a positive value if a is greater than b + int operator()(const WriteBatchIndexEntry* entry1, + const WriteBatchIndexEntry* entry2) const; + + void SetComparatorForCF(uint32_t column_family_id, + const Comparator* comparator) { + cf_comparator_map_[column_family_id] = comparator; + } + + private: + const Comparator* default_comparator_; + std::unordered_map cf_comparator_map_; + const ReadableWriteBatch* write_batch_; +}; + +typedef SkipList + WriteBatchEntrySkipList; + +class WBWIIteratorImpl : public WBWIIterator { + public: + WBWIIteratorImpl(uint32_t column_family_id, + WriteBatchEntrySkipList* skip_list, + const ReadableWriteBatch* write_batch) + : column_family_id_(column_family_id), + skip_list_iter_(skip_list), + write_batch_(write_batch), + valid_(false) {} + + virtual ~WBWIIteratorImpl() {} + + virtual bool Valid() const override { return valid_; } + + virtual void Seek(const Slice& key) override { + valid_ = true; + WriteBatchIndexEntry search_entry(&key, column_family_id_); + skip_list_iter_.Seek(&search_entry); + ReadEntry(); + } + + virtual void Next() override { + skip_list_iter_.Next(); + ReadEntry(); + } + + virtual const WriteEntry& Entry() const override { return current_; } + + virtual Status status() const override { return status_; } + + private: + uint32_t column_family_id_; + WriteBatchEntrySkipList::Iterator skip_list_iter_; + const ReadableWriteBatch* write_batch_; + Status status_; + bool valid_; + WriteEntry current_; + + void ReadEntry() { + if (!status_.ok() || !skip_list_iter_.Valid()) { + valid_ = false; + return; + } + const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key(); + if (iter_entry == nullptr 
|| + iter_entry->column_family != column_family_id_) { + valid_ = false; + return; + } + Slice blob; + status_ = write_batch_->GetEntryFromDataOffset( + iter_entry->offset, ¤t_.type, ¤t_.key, ¤t_.value, + &blob); + if (!status_.ok()) { + valid_ = false; + } else if (current_.type != kPutRecord && current_.type != kDeleteRecord && + current_.type != kMergeRecord) { + valid_ = false; + status_ = Status::Corruption("write batch index is corrupted"); + } + } +}; +} // namespace + +struct WriteBatchWithIndex::Rep { + Rep(const Comparator* index_comparator, size_t reserved_bytes = 0) + : write_batch(reserved_bytes), + comparator(index_comparator, &write_batch), + skip_list(comparator, &arena) {} + ReadableWriteBatch write_batch; + WriteBatchEntryComparator comparator; + Arena arena; + WriteBatchEntrySkipList skip_list; + + WriteBatchIndexEntry* GetEntry(ColumnFamilyHandle* column_family) { + uint32_t cf_id = GetColumnFamilyID(column_family); + const auto* cf_cmp = GetColumnFamilyUserComparator(column_family); + if (cf_cmp != nullptr) { + comparator.SetComparatorForCF(cf_id, cf_cmp); + } + + return GetEntryWithCfId(cf_id); + } + + WriteBatchIndexEntry* GetEntryWithCfId(uint32_t column_family_id) { + auto* mem = arena.Allocate(sizeof(WriteBatchIndexEntry)); + auto* index_entry = new (mem) + WriteBatchIndexEntry(write_batch.GetDataSize(), column_family_id); + return index_entry; + } +}; + +Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset, + WriteType* type, Slice* Key, + Slice* value, + Slice* blob) const { + if (type == nullptr || Key == nullptr || value == nullptr || + blob == nullptr) { + return Status::InvalidArgument("Output parameters cannot be null"); + } + + if (data_offset >= GetDataSize()) { + return Status::InvalidArgument("data offset exceed write batch size"); + } + Slice input = Slice(rep_.data() + data_offset, rep_.size() - data_offset); + char tag; + uint32_t column_family; + Status s = + ReadRecordFromWriteBatch(&input, &tag, 
&column_family, Key, value, blob); + + switch (tag) { + case kTypeColumnFamilyValue: + case kTypeValue: + *type = kPutRecord; + break; + case kTypeColumnFamilyDeletion: + case kTypeDeletion: + *type = kDeleteRecord; + break; + case kTypeColumnFamilyMerge: + case kTypeMerge: + *type = kMergeRecord; + break; + case kTypeLogData: + *type = kLogDataRecord; + break; + default: + return Status::Corruption("unknown WriteBatch tag"); + } + return Status::OK(); +} + +WriteBatchWithIndex::WriteBatchWithIndex( + const Comparator* default_index_comparator, size_t reserved_bytes) + : rep(new Rep(default_index_comparator, reserved_bytes)) {} + +WriteBatchWithIndex::~WriteBatchWithIndex() { delete rep; } + +WriteBatch* WriteBatchWithIndex::GetWriteBatch() { return &rep->write_batch; } + +WBWIIterator* WriteBatchWithIndex::NewIterator() { + return new WBWIIteratorImpl(0, &(rep->skip_list), &rep->write_batch); +} + +WBWIIterator* WriteBatchWithIndex::NewIterator( + ColumnFamilyHandle* column_family) { + return new WBWIIteratorImpl(GetColumnFamilyID(column_family), + &(rep->skip_list), &rep->write_batch); +} + +void WriteBatchWithIndex::Put(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + auto* index_entry = rep->GetEntry(column_family); + rep->write_batch.Put(column_family, key, value); + rep->skip_list.Insert(index_entry); +} + +void WriteBatchWithIndex::Put(const Slice& key, const Slice& value) { + auto* index_entry = rep->GetEntryWithCfId(0); + rep->write_batch.Put(key, value); + rep->skip_list.Insert(index_entry); +} + +void WriteBatchWithIndex::Merge(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + auto* index_entry = rep->GetEntry(column_family); + rep->write_batch.Merge(column_family, key, value); + rep->skip_list.Insert(index_entry); +} + +void WriteBatchWithIndex::Merge(const Slice& key, const Slice& value) { + auto* index_entry = rep->GetEntryWithCfId(0); + rep->write_batch.Merge(key, value); + 
rep->skip_list.Insert(index_entry); +} + +void WriteBatchWithIndex::PutLogData(const Slice& blob) { + rep->write_batch.PutLogData(blob); +} + +void WriteBatchWithIndex::Delete(ColumnFamilyHandle* column_family, + const Slice& key) { + auto* index_entry = rep->GetEntry(column_family); + rep->write_batch.Delete(column_family, key); + rep->skip_list.Insert(index_entry); +} + +void WriteBatchWithIndex::Delete(const Slice& key) { + auto* index_entry = rep->GetEntryWithCfId(0); + rep->write_batch.Delete(key); + rep->skip_list.Insert(index_entry); +} + +void WriteBatchWithIndex::Delete(ColumnFamilyHandle* column_family, + const SliceParts& key) { + auto* index_entry = rep->GetEntry(column_family); + rep->write_batch.Delete(column_family, key); + rep->skip_list.Insert(index_entry); +} + +void WriteBatchWithIndex::Delete(const SliceParts& key) { + auto* index_entry = rep->GetEntryWithCfId(0); + rep->write_batch.Delete(key); + rep->skip_list.Insert(index_entry); +} + +int WriteBatchEntryComparator::operator()( + const WriteBatchIndexEntry* entry1, + const WriteBatchIndexEntry* entry2) const { + if (entry1->column_family > entry2->column_family) { + return 1; + } else if (entry1->column_family < entry2->column_family) { + return -1; + } + + Status s; + Slice key1, key2; + if (entry1->search_key == nullptr) { + Slice value, blob; + WriteType write_type; + s = write_batch_->GetEntryFromDataOffset(entry1->offset, &write_type, &key1, + &value, &blob); + if (!s.ok()) { + return 1; + } + } else { + key1 = *(entry1->search_key); + } + if (entry2->search_key == nullptr) { + Slice value, blob; + WriteType write_type; + s = write_batch_->GetEntryFromDataOffset(entry2->offset, &write_type, &key2, + &value, &blob); + if (!s.ok()) { + return -1; + } + } else { + key2 = *(entry2->search_key); + } + + int cmp; + auto comparator_for_cf = cf_comparator_map_.find(entry1->column_family); + if (comparator_for_cf != cf_comparator_map_.end()) { + cmp = comparator_for_cf->second->Compare(key1, 
key2); + } else { + cmp = default_comparator_->Compare(key1, key2); + } + + if (cmp != 0) { + return cmp; + } else if (entry1->offset > entry2->offset) { + return 1; + } else if (entry1->offset < entry2->offset) { + return -1; + } + return 0; +} + +} // namespace rocksdb diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc new file mode 100644 index 00000000000..ad8c110c126 --- /dev/null +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -0,0 +1,341 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ + +#include +#include +#include "db/column_family.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "util/testharness.h" + +namespace rocksdb { + +namespace { +class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl { + public: + explicit ColumnFamilyHandleImplDummy(int id, const Comparator* comparator) + : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), + id_(id), + comparator_(comparator) {} + uint32_t GetID() const override { return id_; } + const Comparator* user_comparator() const override { return comparator_; } + + private: + uint32_t id_; + const Comparator* comparator_; +}; + +struct Entry { + std::string key; + std::string value; + WriteType type; +}; + +struct TestHandler : public WriteBatch::Handler { + std::map> seen; + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + Entry e; + e.key = key.ToString(); + e.value = value.ToString(); + e.type = kPutRecord; + seen[column_family_id].push_back(e); + return Status::OK(); + } + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + Entry e; + e.key = key.ToString(); + e.value = value.ToString(); + e.type = kMergeRecord; + seen[column_family_id].push_back(e); + return Status::OK(); + } + virtual void LogData(const Slice& blob) {} + virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) { + Entry e; + e.key = key.ToString(); + e.value = ""; + e.type = kDeleteRecord; + seen[column_family_id].push_back(e); + return Status::OK(); + } +}; +} // namespace anonymous + +class WriteBatchWithIndexTest {}; + +TEST(WriteBatchWithIndexTest, TestValueAsSecondaryIndex) { + Entry entries[] = {{"aaa", "0005", kPutRecord}, + {"b", "0002", kPutRecord}, + {"cdd", "0002", kMergeRecord}, + {"aab", "00001", kPutRecord}, + {"cc", "00005", kPutRecord}, + {"cdd", "0002", kPutRecord}, + {"aab", "0003", kPutRecord}, + {"cc", "00005", kDeleteRecord}, }; + + // In this test, we insert to column family `data`, and 
+ // to column family `index`. Then iterator them in order + // and seek them by key. + + // Sort entries by key + std::map> data_map; + // Sort entries by value + std::map> index_map; + for (auto& e : entries) { + data_map[e.key].push_back(&e); + index_map[e.value].push_back(&e); + } + + WriteBatchWithIndex batch(nullptr, 20); + ColumnFamilyHandleImplDummy data(6, BytewiseComparator()); + ColumnFamilyHandleImplDummy index(8, BytewiseComparator()); + for (auto& e : entries) { + if (e.type == kPutRecord) { + batch.Put(&data, e.key, e.value); + batch.Put(&index, e.value, e.key); + } else if (e.type == kMergeRecord) { + batch.Merge(&data, e.key, e.value); + batch.Put(&index, e.value, e.key); + } else { + assert(e.type == kDeleteRecord); + std::unique_ptr iter(batch.NewIterator(&data)); + iter->Seek(e.key); + ASSERT_OK(iter->status()); + auto& write_entry = iter->Entry(); + ASSERT_EQ(e.key, write_entry.key.ToString()); + ASSERT_EQ(e.value, write_entry.value.ToString()); + batch.Delete(&data, e.key); + batch.Put(&index, e.value, ""); + } + } + + // Iterator all keys + { + std::unique_ptr iter(batch.NewIterator(&data)); + iter->Seek(""); + for (auto pair : data_map) { + for (auto v : pair.second) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + auto& write_entry = iter->Entry(); + ASSERT_EQ(pair.first, write_entry.key.ToString()); + ASSERT_EQ(v->type, write_entry.type); + if (write_entry.type != kDeleteRecord) { + ASSERT_EQ(v->value, write_entry.value.ToString()); + } + iter->Next(); + } + } + ASSERT_TRUE(!iter->Valid()); + } + + // Iterator all indexes + { + std::unique_ptr iter(batch.NewIterator(&index)); + iter->Seek(""); + for (auto pair : index_map) { + for (auto v : pair.second) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + auto& write_entry = iter->Entry(); + ASSERT_EQ(pair.first, write_entry.key.ToString()); + if (v->type != kDeleteRecord) { + ASSERT_EQ(v->key, write_entry.value.ToString()); + ASSERT_EQ(v->value, 
write_entry.key.ToString()); + } + iter->Next(); + } + } + ASSERT_TRUE(!iter->Valid()); + } + + // Seek to every key + { + std::unique_ptr iter(batch.NewIterator(&data)); + + // Seek the keys one by one in reverse order + for (auto pair = data_map.rbegin(); pair != data_map.rend(); ++pair) { + iter->Seek(pair->first); + ASSERT_OK(iter->status()); + for (auto v : pair->second) { + ASSERT_TRUE(iter->Valid()); + auto& write_entry = iter->Entry(); + ASSERT_EQ(pair->first, write_entry.key.ToString()); + ASSERT_EQ(v->type, write_entry.type); + if (write_entry.type != kDeleteRecord) { + ASSERT_EQ(v->value, write_entry.value.ToString()); + } + iter->Next(); + ASSERT_OK(iter->status()); + } + } + } + + // Seek to every index + { + std::unique_ptr iter(batch.NewIterator(&index)); + + // Seek the keys one by one in reverse order + for (auto pair = index_map.rbegin(); pair != index_map.rend(); ++pair) { + iter->Seek(pair->first); + ASSERT_OK(iter->status()); + for (auto v : pair->second) { + ASSERT_TRUE(iter->Valid()); + auto& write_entry = iter->Entry(); + ASSERT_EQ(pair->first, write_entry.key.ToString()); + ASSERT_EQ(v->value, write_entry.key.ToString()); + if (v->type != kDeleteRecord) { + ASSERT_EQ(v->key, write_entry.value.ToString()); + } + iter->Next(); + ASSERT_OK(iter->status()); + } + } + } + + // Verify WriteBatch can be iterated + TestHandler handler; + batch.GetWriteBatch()->Iterate(&handler); + + // Verify data column family + { + ASSERT_EQ(sizeof(entries) / sizeof(Entry), + handler.seen[data.GetID()].size()); + size_t i = 0; + for (auto e : handler.seen[data.GetID()]) { + auto write_entry = entries[i++]; + ASSERT_EQ(e.type, write_entry.type); + ASSERT_EQ(e.key, write_entry.key); + if (e.type != kDeleteRecord) { + ASSERT_EQ(e.value, write_entry.value); + } + } + } + + // Verify index column family + { + ASSERT_EQ(sizeof(entries) / sizeof(Entry), + handler.seen[index.GetID()].size()); + size_t i = 0; + for (auto e : handler.seen[index.GetID()]) { + auto 
write_entry = entries[i++]; + ASSERT_EQ(e.key, write_entry.value); + if (write_entry.type != kDeleteRecord) { + ASSERT_EQ(e.value, write_entry.key); + } + } + } +} + +class ReverseComparator : public Comparator { + public: + ReverseComparator() {} + + virtual const char* Name() const override { + return "rocksdb.ReverseComparator"; + } + + virtual int Compare(const Slice& a, const Slice& b) const override { + return 0 - BytewiseComparator()->Compare(a, b); + } + + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const {} + virtual void FindShortSuccessor(std::string* key) const {} +}; + +TEST(WriteBatchWithIndexTest, TestComparatorForCF) { + ReverseComparator reverse_cmp; + ColumnFamilyHandleImplDummy cf1(6, nullptr); + ColumnFamilyHandleImplDummy reverse_cf(66, &reverse_cmp); + ColumnFamilyHandleImplDummy cf2(88, BytewiseComparator()); + WriteBatchWithIndex batch(BytewiseComparator(), 20); + + batch.Put(&cf1, "ddd", ""); + batch.Put(&cf2, "aaa", ""); + batch.Put(&cf2, "eee", ""); + batch.Put(&cf1, "ccc", ""); + batch.Put(&reverse_cf, "a11", ""); + batch.Put(&cf1, "bbb", ""); + batch.Put(&reverse_cf, "a33", ""); + batch.Put(&reverse_cf, "a22", ""); + + { + std::unique_ptr iter(batch.NewIterator(&cf1)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bbb", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("ccc", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("ddd", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + { + std::unique_ptr iter(batch.NewIterator(&cf2)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("aaa", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("eee", 
iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + { + std::unique_ptr iter(batch.NewIterator(&reverse_cf)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("z"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a33", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a22", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a11", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("a22"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a22", iter->Entry().key.ToString()); + + iter->Seek("a13"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a11", iter->Entry().key.ToString()); + } +} + +} // namespace + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }