diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 66726b4fe..b0ce8702b 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -172,7 +172,7 @@ jobs: - name: test a streaming index if: success() || failure() run: | - ${{ env.diskann_built_tests }}/test_streaming_scenario --data_type int8 --dist_fn l2 --data_path rand_int8_10D_10K_norm50.0.bin --index_path_prefix index_stream -R 64 -L 600 --alpha 1.2 --insert_threads 4 --consolidate_threads 4 --max_points_to_insert 10000 --active_window 4000 --consolidate_interval 2000 --start_point_norm 200 + ${{ env.diskann_built_tests }}/test_streaming_scenario --data_type int8 --dist_fn l2 --data_path ./rand_int8_10D_10K_norm50.0.bin --index_path_prefix index_stream -R 64 -L 600 --alpha 1.2 --insert_threads 4 --consolidate_threads 4 --max_points_to_insert 10000 --active_window 4000 --consolidate_interval 2000 --start_point_norm 200 ${{ env.diskann_built_utils }}/compute_groundtruth --data_type int8 --dist_fn l2 --base_file index_stream.after-streaming-act4000-cons2000-max10000.data --query_file rand_int8_10D_1K_norm50.0.bin --K 100 --gt_file gt100_base-act4000-cons2000-max10000 --tags_file index_stream.after-streaming-act4000-cons2000-max10000.tags ${{ env.diskann_built_tests }}/search_memory_index --data_type int8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix index_stream.after-streaming-act4000-cons2000-max10000 --result_path res_stream --query_file ./rand_int8_10D_1K_norm50.0.bin --gt_file gt100_base-act4000-cons2000-max10000 -K 10 -L 20 40 60 80 100 -T 64 --dynamic true --tags 1 diff --git a/include/abstract_data_store.h b/include/abstract_data_store.h new file mode 100644 index 000000000..71ce319fc --- /dev/null +++ b/include/abstract_data_store.h @@ -0,0 +1,110 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
+ +#pragma once + +#include +#include + +#include "types.h" +#include "windows_customizations.h" + +namespace diskann +{ + +template class AbstractDataStore +{ + public: + AbstractDataStore(const location_t capacity, const size_t dim); + + // Return number of points returned + virtual location_t load(const std::string &filename) = 0; + + // Why does store take num_pts? Since store only has capacity, but we allow + // resizing we can end up in a situation where the store has spare capacity. + // To optimize disk utilization, we pass the number of points that are "true" + // points, so that the store can discard the empty locations before saving. + virtual size_t save(const std::string &filename, const location_t num_pts) = 0; + + DISKANN_DLLEXPORT virtual location_t capacity() const; + + DISKANN_DLLEXPORT virtual size_t get_dims() const; + + // Implementers can choose to return _dim if they are not + // concerned about memory alignment. + // Some distance metrics (like l2) need data vectors to be aligned, so we + // align the dimension by padding zeros. + virtual size_t get_aligned_dim() const = 0; + + // populate the store with vectors (either from a pointer or bin file), + // potentially after pre-processing the vectors if the metric deems so + // e.g., normalizing vectors for cosine distance over floating-point vectors + // useful for bulk or static index building. + virtual void populate_data(const data_t *vectors, const location_t num_pts) = 0; + virtual void populate_data(const std::string &filename, const size_t offset) = 0; + + // save the first num_pts many vectors back to bin file + // note: cannot undo the pre-processing done in populate data + virtual void extract_data_to_bin(const std::string &filename, const location_t num_pts) = 0; + + // Returns the updated capacity of the datastore. Clients should check + // if resize actually changed the capacity to new_num_points before + // proceeding with operations. 
See the code below: + // auto new_capacity = data_store->resize(new_num_points); + // if ( new_capacity >= new_num_points) { + // //PROCEED + // } else + // //ERROR. + virtual location_t resize(const location_t new_num_points); + + // operations on vectors + // like populate_data function, but over one vector at a time useful for + // streaming setting + virtual void get_vector(const location_t i, data_t *dest) const = 0; + virtual void set_vector(const location_t i, const data_t *const vector) = 0; + virtual void prefetch_vector(const location_t loc) = 0; + + // internal shuffle operations to move around vectors + // will bulk-move all the vectors in [old_start_loc, old_start_loc + + // num_points) to [new_start_loc, new_start_loc + num_points) and set the old + // positions to zero vectors. + virtual void move_vectors(const location_t old_start_loc, const location_t new_start_loc, + const location_t num_points) = 0; + + // same as above, without resetting the vectors in [from_loc, from_loc + + // num_points) to zero + virtual void copy_vectors(const location_t from_loc, const location_t to_loc, const location_t num_points) = 0; + + // metric specific operations + + virtual float get_distance(const data_t *query, const location_t loc) const = 0; + virtual void get_distance(const data_t *query, const location_t *locations, const uint32_t location_count, + float *distances) const = 0; + virtual float get_distance(const location_t loc1, const location_t loc2) const = 0; + + // stats of the data stored in store + // Returns the point in the dataset that is closest to the mean of all points + // in the dataset + virtual location_t calculate_medoid() const = 0; + + // search helpers + // if the base data is aligned per the request of the metric, this will tell + // how to align the query vector in a consistent manner + virtual size_t get_alignment_factor() const = 0; + + protected: + // Expand the datastore to new_num_points.
Returns the new capacity created, + // which should be == new_num_points in the normal case. Implementers can also + // return _capacity to indicate that there are not implementing this method. + virtual location_t expand(const location_t new_num_points) = 0; + + // Shrink the datastore to new_num_points. It is NOT an error if shrink + // doesn't reduce the capacity so callers need to check this correctly. See + // also for "default" implementation + virtual location_t shrink(const location_t new_num_points) = 0; + + location_t _capacity; + size_t _dim; +}; + +} // namespace diskann diff --git a/include/abstract_graph_store.h b/include/abstract_graph_store.h new file mode 100644 index 000000000..f7735b79a --- /dev/null +++ b/include/abstract_graph_store.h @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include + +#include "types.h" + +namespace diskann +{ + +class AbstractGraphStore +{ + public: + AbstractGraphStore(const size_t max_pts) : _capacity(max_pts) + { + } + + virtual int load(const std::string &index_path_prefix) = 0; + virtual int store(const std::string &index_path_prefix) = 0; + + virtual void get_adj_list(const location_t i, std::vector &neighbors) = 0; + virtual void set_adj_list(const location_t i, std::vector &neighbors) = 0; + + private: + size_t _capacity; +}; + +} // namespace diskann diff --git a/include/distance.h b/include/distance.h index e04be7ee2..8b20e586b 100644 --- a/include/distance.h +++ b/include/distance.h @@ -1,5 +1,6 @@ #pragma once #include "windows_customizations.h" +#include namespace diskann { @@ -14,21 +15,77 @@ enum Metric template class Distance { public: - virtual float compare(const T *a, const T *b, uint32_t length) const = 0; - virtual ~Distance() + DISKANN_DLLEXPORT Distance(diskann::Metric dist_metric) : _distance_metric(dist_metric) { } + + // distance comparison function + DISKANN_DLLEXPORT virtual float compare(const T 
*a, const T *b, uint32_t length) const = 0; + + // Needed only for COSINE-BYTE and INNER_PRODUCT-BYTE + DISKANN_DLLEXPORT virtual float compare(const T *a, const T *b, const float normA, const float normB, + uint32_t length) const; + + // For MIPS, normalization adds an extra dimension to the vectors. + // This function lets callers know if the normalization process + // changes the dimension. + DISKANN_DLLEXPORT virtual uint32_t post_normalization_dimension(uint32_t orig_dimension) const; + + DISKANN_DLLEXPORT virtual diskann::Metric get_metric() const; + + // This is for efficiency. If no normalization is required, the callers + // can simply ignore the normalize_data_for_build() function. + DISKANN_DLLEXPORT virtual bool preprocessing_required() const; + + // Check the preprocessing_required() function before calling this. + // Clients can call the function like this: + // + // if (metric->preprocessing_required()){ + // T* normalized_data_batch; + // Split data into batches of batch_size and for each, call: + // metric->preprocess_base_points(data_batch, batch_size); + // + // TODO: This does not take into account the case for SSD inner product + // where the dimensions change after normalization. + DISKANN_DLLEXPORT virtual void preprocess_base_points(T *original_data, const size_t orig_dim, + const size_t num_points); + + // Invokes normalization for a single vector during search. The scratch space + // has to be created by the caller keeping track of the fact that + // normalization might change the dimension of the query vector. + DISKANN_DLLEXPORT virtual void preprocess_query(const T *query_vec, const size_t query_dim, T *scratch_query); + + // If an algorithm has a requirement that some data be aligned to a certain + // boundary it can use this function to indicate that requirement. Currently, + // we are setting it to 8 because that works well for AVX2. 
If we have AVX512 + // implementations of distance algos, they might have to set this to 16 + // (depending on how they are implemented) + DISKANN_DLLEXPORT virtual size_t get_required_alignment() const; + + // Providing a default implementation for the virtual destructor because we + // don't expect most metric implementations to need it. + DISKANN_DLLEXPORT virtual ~Distance(); + + protected: + diskann::Metric _distance_metric; + size_t _alignment_factor = 8; }; class DistanceCosineInt8 : public Distance { public: + DistanceCosineInt8() : Distance(diskann::Metric::COSINE) + { + } DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b, uint32_t length) const; }; class DistanceL2Int8 : public Distance { public: + DistanceL2Int8() : Distance(diskann::Metric::L2) + { + } DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b, uint32_t size) const; }; @@ -36,18 +93,28 @@ class DistanceL2Int8 : public Distance class AVXDistanceL2Int8 : public Distance { public: + AVXDistanceL2Int8() : Distance(diskann::Metric::L2) + { + } DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b, uint32_t length) const; }; class DistanceCosineFloat : public Distance { public: + DistanceCosineFloat() : Distance(diskann::Metric::COSINE) + { + } DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const; }; class DistanceL2Float : public Distance { public: + DistanceL2Float() : Distance(diskann::Metric::L2) + { + } + #ifdef _WINDOWS DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t size) const; #else @@ -58,46 +125,49 @@ class DistanceL2Float : public Distance class AVXDistanceL2Float : public Distance { public: + AVXDistanceL2Float() : Distance(diskann::Metric::L2) + { + } DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const; }; -class SlowDistanceL2Float : public Distance +template class SlowDistanceL2 : public Distance { public: 
- DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const; + SlowDistanceL2() : Distance(diskann::Metric::L2) + { + } + DISKANN_DLLEXPORT virtual float compare(const T *a, const T *b, uint32_t length) const; }; class SlowDistanceCosineUInt8 : public Distance { public: + SlowDistanceCosineUInt8() : Distance(diskann::Metric::COSINE) + { + } DISKANN_DLLEXPORT virtual float compare(const uint8_t *a, const uint8_t *b, uint32_t length) const; }; class DistanceL2UInt8 : public Distance { public: + DistanceL2UInt8() : Distance(diskann::Metric::L2) + { + } DISKANN_DLLEXPORT virtual float compare(const uint8_t *a, const uint8_t *b, uint32_t size) const; }; -// Simple implementations for non-AVX machines. Compiler can optimize. -template class SlowDistanceL2Int : public Distance +template class DistanceInnerProduct : public Distance { public: - // Implementing here because this is a template function - DISKANN_DLLEXPORT virtual float compare(const T *a, const T *b, uint32_t length) const + DistanceInnerProduct() : Distance(diskann::Metric::INNER_PRODUCT) { - uint32_t result = 0; - for (uint32_t i = 0; i < length; i++) - { - result += ((int32_t)((int16_t)a[i] - (int16_t)b[i])) * ((int32_t)((int16_t)a[i] - (int16_t)b[i])); - } - return (float)result; } -}; -template class DistanceInnerProduct : public Distance -{ - public: + DistanceInnerProduct(diskann::Metric metric) : Distance(metric) + { + } inline float inner_product(const T *a, const T *b, unsigned size) const; inline float compare(const T *a, const T *b, unsigned size) const @@ -115,6 +185,9 @@ template class DistanceFastL2 : public DistanceInnerProduct // currently defined only for float. // templated for future use. 
public: + DistanceFastL2() : DistanceInnerProduct(diskann::Metric::FAST_L2) + { + } float norm(const T *a, unsigned size) const; float compare(const T *a, const T *b, float norm, unsigned size) const; }; @@ -122,6 +195,9 @@ template class DistanceFastL2 : public DistanceInnerProduct class AVXDistanceInnerProductFloat : public Distance { public: + AVXDistanceInnerProductFloat() : Distance(diskann::Metric::INNER_PRODUCT) + { + } DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const; }; @@ -130,13 +206,28 @@ class AVXNormalizedCosineDistanceFloat : public Distance private: AVXDistanceInnerProductFloat _innerProduct; + protected: + void normalize_and_copy(const float *a, uint32_t length, float *a_norm) const; + public: + AVXNormalizedCosineDistanceFloat() : Distance(diskann::Metric::COSINE) + { + } DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const { // Inner product returns negative values to indicate distance. // This will ensure that cosine is between -1 and 1. return 1.0f + _innerProduct.compare(a, b, length); } + DISKANN_DLLEXPORT virtual uint32_t post_normalization_dimension(uint32_t orig_dimension) const override; + + DISKANN_DLLEXPORT virtual bool preprocessing_required() const; + + DISKANN_DLLEXPORT virtual void preprocess_base_points(float *original_data, const size_t orig_dim, + const size_t num_points) override; + + DISKANN_DLLEXPORT virtual void preprocess_query(const float *query_vec, const size_t query_dim, + float *scratch_query_vector) override; }; template Distance *get_distance_function(Metric m); diff --git a/include/in_mem_data_store.h b/include/in_mem_data_store.h new file mode 100644 index 000000000..70d1fa28f --- /dev/null +++ b/include/in_mem_data_store.h @@ -0,0 +1,81 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
+ +#include +#include + +#include "tsl/robin_map.h" +#include "tsl/robin_set.h" +#include "tsl/sparse_map.h" +// #include "boost/dynamic_bitset.hpp" + +#include "abstract_data_store.h" + +#include "distance.h" +#include "natural_number_map.h" +#include "natural_number_set.h" + +namespace diskann +{ +template class InMemDataStore : public AbstractDataStore +{ + public: + InMemDataStore(const location_t capacity, const size_t dim, std::shared_ptr> distance_fn); + virtual ~InMemDataStore(); + + virtual location_t load(const std::string &filename) override; + virtual size_t save(const std::string &filename, const location_t num_points) override; + + virtual size_t get_aligned_dim() const override; + + // Populate internal data from unaligned data while doing alignment and any + // normalization that is required. + virtual void populate_data(const data_t *vectors, const location_t num_pts) override; + virtual void populate_data(const std::string &filename, const size_t offset) override; + + virtual void extract_data_to_bin(const std::string &filename, const location_t num_pts) override; + + virtual void get_vector(const location_t i, data_t *target) const override; + virtual void set_vector(const location_t i, const data_t *const vector) override; + virtual void prefetch_vector(const location_t loc) override; + + virtual void move_vectors(const location_t old_location_start, const location_t new_location_start, + const location_t num_points) override; + virtual void copy_vectors(const location_t from_loc, const location_t to_loc, const location_t num_points) override; + + virtual float get_distance(const data_t *query, const location_t loc) const override; + virtual float get_distance(const location_t loc1, const location_t loc2) const override; + virtual void get_distance(const data_t *query, const location_t *locations, const uint32_t location_count, + float *distances) const override; + + virtual location_t calculate_medoid() const override; + + virtual Distance 
*get_dist_fn(); + + virtual size_t get_alignment_factor() const override; + + protected: + virtual location_t expand(const location_t new_size) override; + virtual location_t shrink(const location_t new_size) override; + + virtual location_t load_impl(const std::string &filename); +#ifdef EXEC_ENV_OLS + virtual location_t load_impl(AlignedFileReader &reader); +#endif + + private: + data_t *_data = nullptr; + + size_t _aligned_dim; + + // It may seem weird to put distance metric along with the data store class, + // but this gives us perf benefits as the datastore can do distance + // computations during search and compute norms of vectors internally without + // having to copy data back and forth. + std::shared_ptr> _distance_fn; + + // in case we need to save vector norms for optimization + std::shared_ptr _pre_computed_norms; +}; + +} // namespace diskann \ No newline at end of file diff --git a/include/in_mem_graph_store.h b/include/in_mem_graph_store.h new file mode 100644 index 000000000..98a9e4dc5 --- /dev/null +++ b/include/in_mem_graph_store.h @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license.
+ +#pragma once + +#include "abstract_graph_store.h" + +namespace diskann +{ + +class InMemGraphStore : public AbstractGraphStore +{ + public: + InMemGraphStore(const size_t max_pts); + + int load(const std::string &index_path_prefix); + int store(const std::string &index_path_prefix); + + void get_adj_list(const location_t i, std::vector &neighbors); + void set_adj_list(const location_t i, std::vector &neighbors); +}; + +} // namespace diskann diff --git a/include/index.h b/include/index.h index b49059192..9090580f0 100644 --- a/include/index.h +++ b/include/index.h @@ -18,6 +18,7 @@ #include "utils.h" #include "windows_customizations.h" #include "scratch.h" +#include "in_mem_data_store.h" #define OVERHEAD_FACTOR 1.1 #define EXPAND_IF_FULL 0 @@ -311,10 +312,10 @@ template clas private: // Distance functions Metric _dist_metric = diskann::L2; - Distance *_distance = nullptr; + std::shared_ptr> _distance; // Data - T *_data = nullptr; + std::unique_ptr> _data_store; char *_opt_graph = nullptr; // Graph related data structures @@ -322,13 +323,14 @@ template clas // Dimensions size_t _dim = 0; - size_t _aligned_dim = 0; size_t _nd = 0; // number of active points i.e. existing in the graph size_t _max_points = 0; // total number of points in given data set - // Number of points which are used as initial candidates when iterating to - // closest point(s). These are not visible externally and won't be returned - // by search. DiskANN forces at least 1 frozen point for dynamic index. - // The frozen points have consecutive locations. See also _start below. + + // _num_frozen_pts is the number of points which are used as initial + // candidates when iterating to closest point(s). These are not visible + // externally and won't be returned by search. At least 1 frozen point is + // needed for a dynamic index. The frozen points have consecutive locations. + // See also _start below. 
size_t _num_frozen_pts = 0; size_t _max_range_of_loaded_graph = 0; size_t _node_size; @@ -395,7 +397,7 @@ template clas std::unique_ptr> _delete_set; bool _data_compacted = true; // true if data has been compacted - bool _is_saved = false; // Gopal. Checking if the index is already saved. + bool _is_saved = false; // Checking if the index is already saved. bool _conc_consolidate = false; // use _lock while searching // Acquire locks in the order below when acquiring multiple locks diff --git a/include/natural_number_map.h b/include/natural_number_map.h index 3bbdcb640..820ac3fdf 100644 --- a/include/natural_number_map.h +++ b/include/natural_number_map.h @@ -7,7 +7,7 @@ #include #include -#include "boost_dynamic_bitset_fwd.h" +#include namespace diskann { diff --git a/include/scratch.h b/include/scratch.h index 4fa144e7d..4ee345aa9 100644 --- a/include/scratch.h +++ b/include/scratch.h @@ -27,6 +27,7 @@ namespace diskann { + // // Scratch space for in-memory index based search // @@ -34,8 +35,9 @@ template class InMemQueryScratch { public: ~InMemQueryScratch(); - InMemQueryScratch(uint32_t search_l, uint32_t indexing_l, uint32_t r, uint32_t maxc, size_t dim, - bool init_pq_scratch = false); + // REFACTOR TODO: move all parameters to a new class. + InMemQueryScratch(uint32_t search_l, uint32_t indexing_l, uint32_t r, uint32_t maxc, size_t dim, size_t aligned_dim, + size_t alignment_factor, bool init_pq_scratch = false); void resize_for_new_L(uint32_t new_search_l); void clear(); diff --git a/include/utils.h b/include/utils.h index d517050d2..b484c2aeb 100644 --- a/include/utils.h +++ b/include/utils.h @@ -1006,18 +1006,23 @@ inline bool validate_index_file_size(std::ifstream &in) return true; } -// This function is valid only for float data type. 
-template inline void normalize(T *arr, size_t dim) +template inline float get_norm(T *arr, const size_t dim) { float sum = 0.0f; for (uint32_t i = 0; i < dim; i++) { sum += arr[i] * arr[i]; } - sum = sqrt(sum); + return sqrt(sum); +} + +// This function is valid only for float data type. +template inline void normalize(T *arr, const size_t dim) +{ + float norm = get_norm(arr, dim); for (uint32_t i = 0; i < dim; i++) { - arr[i] = (T)(arr[i] / sum); + arr[i] = (T)(arr[i] / norm); } } diff --git a/python/src/diskann_bindings.cpp b/python/src/diskann_bindings.cpp index 6a977cac7..9b6b97bc0 100644 --- a/python/src/diskann_bindings.cpp +++ b/python/src/diskann_bindings.cpp @@ -161,8 +161,9 @@ template struct DynamicInMemIndex _index = new Index(m, dim, max_points, true, // dynamic_index index_parameters, // used for insert - initial_search_list_size, // used to prepare the scratch space for searching. can / may be - // expanded if the search asks for a larger L. + initial_search_list_size, // used to prepare the scratch space for + // searching. can / may be expanded if the + // search asks for a larger L. 
search_threads, // also used for the scratch space true, // enable_tags concurrent_consolidate, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f802da879..598ea1705 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,8 +8,10 @@ if(MSVC) add_subdirectory(dll) else() #file(GLOB CPP_SOURCES *.cpp) - set(CPP_SOURCES ann_exception.cpp disk_utils.cpp distance.cpp index.cpp + set(CPP_SOURCES abstract_data_store.cpp ann_exception.cpp disk_utils.cpp + distance.cpp index.cpp in_mem_graph_store.cpp in_mem_data_store.cpp linux_aligned_file_reader.cpp math_utils.cpp natural_number_map.cpp + in_mem_data_store.cpp in_mem_graph_store.cpp natural_number_set.cpp memory_mapper.cpp partition.cpp pq.cpp pq_flash_index.cpp scratch.cpp logger.cpp utils.cpp filter_utils.cpp) if (RESTAPI) diff --git a/src/abstract_data_store.cpp b/src/abstract_data_store.cpp new file mode 100644 index 000000000..a980bd545 --- /dev/null +++ b/src/abstract_data_store.cpp @@ -0,0 +1,46 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
+ +#include + +#include "abstract_data_store.h" + +namespace diskann +{ + +template +AbstractDataStore::AbstractDataStore(const location_t capacity, const size_t dim) + : _capacity(capacity), _dim(dim) +{ +} + +template location_t AbstractDataStore::capacity() const +{ + return _capacity; +} + +template size_t AbstractDataStore::get_dims() const +{ + return _dim; +} + +template location_t AbstractDataStore::resize(const location_t new_num_points) +{ + if (new_num_points > _capacity) + { + return expand(new_num_points); + } + else if (new_num_points < _capacity) + { + return shrink(new_num_points); + } + else + { + return _capacity; + } +} + +template DISKANN_DLLEXPORT class AbstractDataStore; +template DISKANN_DLLEXPORT class AbstractDataStore; +template DISKANN_DLLEXPORT class AbstractDataStore; +} // namespace diskann diff --git a/src/disk_utils.cpp b/src/disk_utils.cpp index 55a6efbc1..aadeb6dd1 100644 --- a/src/disk_utils.cpp +++ b/src/disk_utils.cpp @@ -1210,7 +1210,8 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const if (param_list.size() >= 9 && atoi(param_list[8].c_str()) <= MAX_PQ_CHUNKS && atoi(param_list[8].c_str()) > 0) { - std::cout << "Use quantized dimension (QD) to overwrite derived quantized dimension from search_DRAM_budget (B)" + std::cout << "Use quantized dimension (QD) to overwrite derived quantized " + "dimension from search_DRAM_budget (B)" << std::endl; num_pq_chunks = atoi(param_list[8].c_str()); } diff --git a/src/distance.cpp b/src/distance.cpp index 99676bb6f..31ab9d3ff 100644 --- a/src/distance.cpp +++ b/src/distance.cpp @@ -22,6 +22,49 @@ namespace diskann { +// +// Base Class Implementations +// +template +float Distance::compare(const T *a, const T *b, const float normA, const float normB, uint32_t length) const +{ + throw std::logic_error("This function is not implemented."); +} + +template uint32_t Distance::post_normalization_dimension(uint32_t orig_dimension) const +{ + return orig_dimension; +} +
+template diskann::Metric Distance::get_metric() const +{ + return _distance_metric; +} + +template bool Distance::preprocessing_required() const +{ + return false; +} + +template +void Distance::preprocess_base_points(T *original_data, const size_t orig_dim, const size_t num_points) +{ +} + +template void Distance::preprocess_query(const T *query_vec, const size_t query_dim, T *scratch_query) +{ + std::memcpy(scratch_query, query_vec, query_dim * sizeof(T)); +} + +template size_t Distance::get_required_alignment() const +{ + return _alignment_factor; +} + +template Distance::~Distance() +{ +} + // // Cosine distance functions. // @@ -181,12 +224,12 @@ float DistanceL2Float::compare(const float *a, const float *b, uint32_t size) co return result; } -float SlowDistanceL2Float::compare(const float *a, const float *b, uint32_t length) const +template float SlowDistanceL2::compare(const T *a, const T *b, uint32_t length) const { float result = 0.0f; for (uint32_t i = 0; i < length; i++) { - result += (a[i] - b[i]) * (a[i] - b[i]); + result += ((float)(a[i] - b[i])) * (a[i] - b[i]); } return result; } @@ -522,6 +565,40 @@ float AVXDistanceInnerProductFloat::compare(const float *a, const float *b, uint return -result; } +uint32_t AVXNormalizedCosineDistanceFloat::post_normalization_dimension(uint32_t orig_dimension) const +{ + return orig_dimension; +} +bool AVXNormalizedCosineDistanceFloat::preprocessing_required() const +{ + return true; +} +void AVXNormalizedCosineDistanceFloat::preprocess_base_points(float *original_data, const size_t orig_dim, + const size_t num_points) +{ + for (uint32_t i = 0; i < num_points; i++) + { + normalize((float *)(original_data + i * orig_dim), orig_dim); + } +} + +void AVXNormalizedCosineDistanceFloat::preprocess_query(const float *query_vec, const size_t query_dim, + float *query_scratch) +{ + normalize_and_copy(query_vec, (uint32_t)query_dim, query_scratch); +} + +void AVXNormalizedCosineDistanceFloat::normalize_and_copy(const float 
*query_vec, const uint32_t query_dim, + float *query_target) const +{ + float norm = get_norm(query_vec, query_dim); + + for (uint32_t i = 0; i < query_dim; i++) + { + query_target[i] = query_vec[i] / norm; + } +} + // Get the right distance function for the given metric. template <> diskann::Distance *get_distance_function(diskann::Metric m) { @@ -540,7 +617,7 @@ template <> diskann::Distance *get_distance_function(diskann::Metric m) else { diskann::cout << "L2: Older CPU. Using slow distance computation" << std::endl; - return new diskann::SlowDistanceL2Float(); + return new diskann::SlowDistanceL2(); } } else if (m == diskann::Metric::COSINE) @@ -592,7 +669,7 @@ template <> diskann::Distance *get_distance_function(diskann::Metric m) diskann::cout << "Older CPU. Using slow distance computation " "SlowDistanceL2Int." << std::endl; - return new diskann::SlowDistanceL2Int(); + return new diskann::SlowDistanceL2(); } } else if (m == diskann::Metric::COSINE) @@ -649,4 +726,8 @@ template DISKANN_DLLEXPORT class DistanceFastL2; template DISKANN_DLLEXPORT class DistanceFastL2; template DISKANN_DLLEXPORT class DistanceFastL2; +template DISKANN_DLLEXPORT class SlowDistanceL2; +template DISKANN_DLLEXPORT class SlowDistanceL2; +template DISKANN_DLLEXPORT class SlowDistanceL2; + } // namespace diskann diff --git a/src/dll/CMakeLists.txt b/src/dll/CMakeLists.txt index 1423d6c2c..e02996f32 100644 --- a/src/dll/CMakeLists.txt +++ b/src/dll/CMakeLists.txt @@ -1,8 +1,9 @@ #Copyright(c) Microsoft Corporation.All rights reserved. #Licensed under the MIT license. 
-add_library(${PROJECT_NAME} SHARED dllmain.cpp ../partition.cpp ../pq.cpp ../pq_flash_index.cpp ../logger.cpp ../utils.cpp - ../windows_aligned_file_reader.cpp ../distance.cpp ../memory_mapper.cpp ../index.cpp ../math_utils.cpp ../disk_utils.cpp ../filter_utils.cpp +add_library(${PROJECT_NAME} SHARED dllmain.cpp ../abstract_data_store.cpp ../partition.cpp ../pq.cpp ../pq_flash_index.cpp ../logger.cpp ../utils.cpp + ../windows_aligned_file_reader.cpp ../distance.cpp ../memory_mapper.cpp ../index.cpp + ../in_mem_data_store.cpp ../in_mem_graph_store.cpp ../math_utils.cpp ../disk_utils.cpp ../filter_utils.cpp ../ann_exception.cpp ../natural_number_set.cpp ../natural_number_map.cpp ../scratch.cpp) set(TARGET_DIR "$<$:${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}>$<$:${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}>") diff --git a/src/in_mem_data_store.cpp b/src/in_mem_data_store.cpp new file mode 100644 index 000000000..b2f708263 --- /dev/null +++ b/src/in_mem_data_store.cpp @@ -0,0 +1,369 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
+ +#include +#include "in_mem_data_store.h" + +#include "utils.h" + +namespace diskann +{ + +template +InMemDataStore::InMemDataStore(const location_t num_points, const size_t dim, + std::shared_ptr> distance_fn) + : AbstractDataStore(num_points, dim), _distance_fn(distance_fn) +{ + _aligned_dim = ROUND_UP(dim, _distance_fn->get_required_alignment()); + alloc_aligned(((void **)&_data), this->_capacity * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t)); + std::memset(_data, 0, this->_capacity * _aligned_dim * sizeof(data_t)); +} + +template InMemDataStore::~InMemDataStore() +{ + if (_data != nullptr) + { + aligned_free(this->_data); + } +} + +template size_t InMemDataStore::get_aligned_dim() const +{ + return _aligned_dim; +} + +template size_t InMemDataStore::get_alignment_factor() const +{ + return _distance_fn->get_required_alignment(); +} + +template location_t InMemDataStore::load(const std::string &filename) +{ + return load_impl(filename); +} + +#ifdef EXEC_ENV_OLS +template location_t Index::load_impl(AlignedFileReader &reader) +{ + size_t file_dim, file_num_points; + + diskann::get_bin_metadata(reader, file_num_points, file_dim); + + if (file_dim != _dim) + { + std::stringstream stream; + stream << "ERROR: Driver requests loading " << _dim << " dimension," + << "but file has " << file_dim << " dimension." << std::endl; + diskann::cerr << stream.str() << std::endl; + aligned_free(_data); + throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); + } + + if (file_num_points > _max_points + _num_frozen_pts) + { + resize(file_num_points - _num_frozen_pts); + } + + return file_num_points; +} +#endif + +template location_t InMemDataStore::load_impl(const std::string &filename) +{ + size_t file_dim, file_num_points; + if (!file_exists(filename)) + { + std::stringstream stream; + stream << "ERROR: data file " << filename << " does not exist." 
<< std::endl; + diskann::cerr << stream.str() << std::endl; + aligned_free(_data); + throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); + } + diskann::get_bin_metadata(filename, file_num_points, file_dim); + + if (file_dim != this->_dim) + { + std::stringstream stream; + stream << "ERROR: Driver requests loading " << this->_dim << " dimension," + << "but file has " << file_dim << " dimension." << std::endl; + diskann::cerr << stream.str() << std::endl; + aligned_free(_data); + throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); + } + + if (file_num_points > this->capacity()) + { + this->resize((location_t)file_num_points); + } + + copy_aligned_data_from_file(filename.c_str(), _data, file_num_points, file_dim, _aligned_dim); + + return (location_t)file_num_points; +} + +template size_t InMemDataStore::save(const std::string &filename, const location_t num_points) +{ + return save_data_in_base_dimensions(filename, _data, num_points, this->get_dims(), this->get_aligned_dim(), 0U); +} + +template void InMemDataStore::populate_data(const data_t *vectors, const location_t num_pts) +{ + memset(_data, 0, _aligned_dim * sizeof(data_t) * num_pts); + for (location_t i = 0; i < num_pts; i++) + { + std::memmove(_data + i * _aligned_dim, vectors + i * this->_dim, this->_dim * sizeof(data_t)); + } + + if (_distance_fn->preprocessing_required()) + { + _distance_fn->preprocess_base_points(_data, this->_aligned_dim, num_pts); + } +} + +template void InMemDataStore::populate_data(const std::string &filename, const size_t offset) +{ + size_t npts, ndim; + copy_aligned_data_from_file(filename.c_str(), _data, npts, ndim, _aligned_dim, offset); + + if ((location_t)npts > this->capacity()) + { + std::stringstream ss; + ss << "Number of points in the file: " << filename + << " is greater than the capacity of data store: " << this->capacity() + << ". 
Must invoke resize before calling populate_data()" << std::endl; + throw diskann::ANNException(ss.str(), -1); + } + + if ((location_t)ndim != this->get_dims()) + { + std::stringstream ss; + ss << "Number of dimensions of a point in the file: " << filename + << " is not equal to dimensions of data store: " << this->capacity() << "." << std::endl; + throw diskann::ANNException(ss.str(), -1); + } + + if (_distance_fn->preprocessing_required()) + { + _distance_fn->preprocess_base_points(_data, this->_aligned_dim, this->capacity()); + } +} + +template +void InMemDataStore::extract_data_to_bin(const std::string &filename, const location_t num_points) +{ + save_data_in_base_dimensions(filename, _data, num_points, this->get_dims(), this->get_aligned_dim(), 0U); +} + +template void InMemDataStore::get_vector(const location_t i, data_t *dest) const +{ + memcpy(dest, _data + i * _aligned_dim, this->_dim * sizeof(data_t)); +} + +template void InMemDataStore::set_vector(const location_t loc, const data_t *const vector) +{ + size_t offset_in_data = loc * _aligned_dim; + memset(_data + offset_in_data, 0, _aligned_dim * sizeof(data_t)); + memcpy(_data + offset_in_data, vector, this->_dim * sizeof(data_t)); + if (_distance_fn->preprocessing_required()) + { + _distance_fn->preprocess_base_points(_data + offset_in_data, _aligned_dim, 1); + } +} + +template void InMemDataStore::prefetch_vector(const location_t loc) +{ + diskann::prefetch_vector((const char *)_data + _aligned_dim * (size_t)loc, sizeof(data_t) * _aligned_dim); +} + +template float InMemDataStore::get_distance(const data_t *query, const location_t loc) const +{ + return _distance_fn->compare(query, _data + _aligned_dim * loc, (uint32_t)_aligned_dim); +} + +template +void InMemDataStore::get_distance(const data_t *query, const location_t *locations, + const uint32_t location_count, float *distances) const +{ + for (location_t i = 0; i < location_count; i++) + { + distances[i] = _distance_fn->compare(query, _data + 
locations[i] * _aligned_dim, (uint32_t)this->_aligned_dim); + } +} + +template +float InMemDataStore::get_distance(const location_t loc1, const location_t loc2) const +{ + return _distance_fn->compare(_data + loc1 * _aligned_dim, _data + loc2 * _aligned_dim, + (uint32_t)this->_aligned_dim); +} + +template location_t InMemDataStore::expand(const location_t new_size) +{ + if (new_size == this->capacity()) + { + return this->capacity(); + } + else if (new_size < this->capacity()) + { + std::stringstream ss; + ss << "Cannot 'expand' datastore when new capacity (" << new_size << ") < existing capacity(" + << this->capacity() << ")" << std::endl; + throw diskann::ANNException(ss.str(), -1); + } +#ifndef _WINDOWS + data_t *new_data; + alloc_aligned((void **)&new_data, new_size * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t)); + memcpy(new_data, _data, this->capacity() * _aligned_dim * sizeof(data_t)); + aligned_free(_data); + _data = new_data; +#else + realloc_aligned((void **)&_data, new_size * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t)); +#endif + this->_capacity = new_size; + return this->_capacity; +} + +template location_t InMemDataStore::shrink(const location_t new_size) +{ + if (new_size == this->capacity()) + { + return this->capacity(); + } + else if (new_size > this->capacity()) + { + std::stringstream ss; + ss << "Cannot 'shrink' datastore when new capacity (" << new_size << ") > existing capacity(" + << this->capacity() << ")" << std::endl; + throw diskann::ANNException(ss.str(), -1); + } +#ifndef _WINDOWS + data_t *new_data; + alloc_aligned((void **)&new_data, new_size * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t)); + memcpy(new_data, _data, new_size * _aligned_dim * sizeof(data_t)); + aligned_free(_data); + _data = new_data; +#else + realloc_aligned((void **)&_data, new_size * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t)); +#endif + this->_capacity = new_size; + return this->_capacity; +} + +template +void 
InMemDataStore::move_vectors(const location_t old_location_start, const location_t new_location_start, + const location_t num_locations) +{ + if (num_locations == 0 || old_location_start == new_location_start) + { + return; + } + + /* // Update pointers to the moved nodes. Note: the computation is correct + even + // when new_location_start < old_location_start given the C++ uint32_t + // integer arithmetic rules. + const uint32_t location_delta = new_location_start - old_location_start; + */ + // The [start, end) interval which will contain obsolete points to be + // cleared. + uint32_t mem_clear_loc_start = old_location_start; + uint32_t mem_clear_loc_end_limit = old_location_start + num_locations; + + if (new_location_start < old_location_start) + { + // If ranges are overlapping, make sure not to clear the newly copied + // data. + if (mem_clear_loc_start < new_location_start + num_locations) + { + // Clear only after the end of the new range. + mem_clear_loc_start = new_location_start + num_locations; + } + } + else + { + // If ranges are overlapping, make sure not to clear the newly copied + // data. + if (mem_clear_loc_end_limit > new_location_start) + { + // Clear only up to the beginning of the new range. + mem_clear_loc_end_limit = new_location_start; + } + } + + // Use memmove to handle overlapping ranges. 
+ copy_vectors(old_location_start, new_location_start, num_locations); + memset(_data + _aligned_dim * mem_clear_loc_start, 0, + sizeof(data_t) * _aligned_dim * (mem_clear_loc_end_limit - mem_clear_loc_start)); +} + +template +void InMemDataStore::copy_vectors(const location_t from_loc, const location_t to_loc, + const location_t num_points) +{ + assert(from_loc < this->_capacity); + assert(to_loc < this->_capacity); + assert(num_points < this->_capacity); + memmove(_data + _aligned_dim * to_loc, _data + _aligned_dim * from_loc, num_points * _aligned_dim * sizeof(data_t)); +} + +template location_t InMemDataStore::calculate_medoid() const +{ + // allocate and init centroid + float *center = new float[_aligned_dim]; + for (size_t j = 0; j < _aligned_dim; j++) + center[j] = 0; + + for (size_t i = 0; i < this->capacity(); i++) + for (size_t j = 0; j < _aligned_dim; j++) + center[j] += (float)_data[i * _aligned_dim + j]; + + for (size_t j = 0; j < _aligned_dim; j++) + center[j] /= (float)this->capacity(); + + // compute all to one distance + float *distances = new float[this->capacity()]; + + // TODO: REFACTOR. Removing pragma might make this slow. Must revisit. + // Problem is that we need to pass num_threads here, it is not clear + // if data store must be aware of threads! 
+ // #pragma omp parallel for schedule(static, 65536) + for (int64_t i = 0; i < (int64_t)this->capacity(); i++) + { + // extract point and distance reference + float &dist = distances[i]; + const data_t *cur_vec = _data + (i * (size_t)_aligned_dim); + dist = 0; + float diff = 0; + for (size_t j = 0; j < _aligned_dim; j++) + { + diff = (center[j] - (float)cur_vec[j]) * (center[j] - (float)cur_vec[j]); + dist += diff; + } + } + // find imin + uint32_t min_idx = 0; + float min_dist = distances[0]; + for (uint32_t i = 1; i < this->capacity(); i++) + { + if (distances[i] < min_dist) + { + min_idx = i; + min_dist = distances[i]; + } + } + + delete[] distances; + delete[] center; + return min_idx; +} + +template Distance *InMemDataStore::get_dist_fn() +{ + return this->_distance_fn.get(); +} + +template DISKANN_DLLEXPORT class InMemDataStore; +template DISKANN_DLLEXPORT class InMemDataStore; +template DISKANN_DLLEXPORT class InMemDataStore; + +} // namespace diskann \ No newline at end of file diff --git a/src/in_mem_graph_store.cpp b/src/in_mem_graph_store.cpp new file mode 100644 index 000000000..e9bfd4e9e --- /dev/null +++ b/src/in_mem_graph_store.cpp @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
+ +#include "in_mem_graph_store.h" +#include "utils.h" + +namespace diskann +{ + +InMemGraphStore::InMemGraphStore(const size_t max_pts) : AbstractGraphStore(max_pts) +{ +} + +int InMemGraphStore::load(const std::string &index_path_prefix) +{ + return 0; +} +int InMemGraphStore::store(const std::string &index_path_prefix) +{ + return 0; +} + +void InMemGraphStore::get_adj_list(const location_t i, std::vector &neighbors) +{ +} + +void InMemGraphStore::set_adj_list(const location_t i, std::vector &neighbors) +{ +} + +} // namespace diskann diff --git a/src/index.cpp b/src/index.cpp index ce8b4e1e8..ccb87939e 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -74,9 +74,6 @@ Index::Index(Metric m, const size_t dim, const size_t max_point -1, __FUNCSIG__, __FILE__, __LINE__); } - // data stored to _nd * aligned_dim matrix with necessary zero-padding - _aligned_dim = ROUND_UP(_dim, 8); - if (dynamic_index && _num_frozen_pts == 0) { _num_frozen_pts = 1; @@ -96,17 +93,16 @@ Index::Index(Metric m, const size_t dim, const size_t max_point alloc_aligned(((void **)&_pq_data), total_internal_points * _num_pq_chunks * sizeof(char), 8 * sizeof(char)); std::memset(_pq_data, 0, total_internal_points * _num_pq_chunks * sizeof(char)); } - alloc_aligned(((void **)&_data), total_internal_points * _aligned_dim * sizeof(T), 8 * sizeof(T)); - std::memset(_data, 0, total_internal_points * _aligned_dim * sizeof(T)); _start = (uint32_t)_max_points; _final_graph.resize(total_internal_points); + // This should come from a factory. if (m == diskann::Metric::COSINE && std::is_floating_point::value) { // This is safe because T is float inside the if block. - this->_distance = (Distance *)new AVXNormalizedCosineDistanceFloat(); + this->_distance.reset((Distance *)new AVXNormalizedCosineDistanceFloat()); this->_normalize_vecs = true; diskann::cout << "Normalizing vectors and using L2 for cosine " "AVXNormalizedCosineDistanceFloat()." 
@@ -114,8 +110,12 @@ Index::Index(Metric m, const size_t dim, const size_t max_point } else { - this->_distance = get_distance_function(m); + this->_distance.reset((Distance *)get_distance_function(m)); } + // REFACTOR: TODO This should move to a factory method. + + _data_store = + std::make_unique>((location_t)total_internal_points, _dim, this->_distance); _locks = std::vector(total_internal_points); @@ -139,16 +139,13 @@ template Index::~I LockGuard lg(lock); } - if (this->_distance != nullptr) - { - delete this->_distance; - this->_distance = nullptr; - } - if (this->_data != nullptr) - { - aligned_free(this->_data); - this->_data = nullptr; - } + // if (this->_distance != nullptr) + //{ + // delete this->_distance; + // this->_distance = nullptr; + // } + // REFACTOR + if (_opt_graph != nullptr) { delete[] _opt_graph; @@ -167,7 +164,8 @@ void Index::initialize_query_scratch(uint32_t num_threads, uint { for (uint32_t i = 0; i < num_threads; i++) { - auto scratch = new InMemQueryScratch(search_l, indexing_l, r, maxc, dim, _pq_dist); + auto scratch = new InMemQueryScratch(search_l, indexing_l, r, maxc, dim, _data_store->get_aligned_dim(), + _data_store->get_alignment_factor(), _pq_dist); _query_scratch.push(scratch); } } @@ -215,7 +213,7 @@ template size_t Indexsave(data_file, (location_t)(_nd + _num_frozen_pts)); } // save the graph index on a file as an adjacency list. For each point, @@ -441,7 +439,6 @@ size_t Index::load_data(std::string filename) std::stringstream stream; stream << "ERROR: data file " << filename << " does not exist." << std::endl; diskann::cerr << stream.str() << std::endl; - aligned_free(_data); throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } diskann::get_bin_metadata(filename, file_num_points, file_dim); @@ -456,7 +453,6 @@ size_t Index::load_data(std::string filename) stream << "ERROR: Driver requests loading " << _dim << " dimension," << "but file has " << file_dim << " dimension." 
<< std::endl; diskann::cerr << stream.str() << std::endl; - aligned_free(_data); throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } @@ -467,9 +463,12 @@ size_t Index::load_data(std::string filename) } #ifdef EXEC_ENV_OLS + + // REFACTOR TODO: Must figure out how to support aligned reader in a clean + // manner. copy_aligned_data_from_file(reader, _data, file_num_points, file_dim, _aligned_dim); #else - copy_aligned_data_from_file(filename.c_str(), _data, file_num_points, file_dim, _aligned_dim); + _data_store->load(filename); // offset == 0. #endif return file_num_points; } @@ -559,7 +558,6 @@ void Index::load(const char *filename, uint32_t num_threads, ui << graph_num_pts << " from graph, and " << tags_file_num_pts << " tags, with num_frozen_pts being set to " << _num_frozen_pts << " in constructor." << std::endl; diskann::cerr << stream.str() << std::endl; - aligned_free(_data); throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } @@ -711,7 +709,6 @@ size_t Index::load_graph(std::string filename, size_t expected_ << std::endl; } diskann::cerr << stream.str() << std::endl; - aligned_free(_data); throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } @@ -802,8 +799,9 @@ template int Index return -1; } - size_t location = _tag_to_location[tag]; - memcpy((void *)vec, (void *)(_data + location * _aligned_dim), (size_t)_dim * sizeof(T)); + location_t location = _tag_to_location[tag]; + _data_store->get_vector(location, vec); + return 0; } @@ -816,49 +814,9 @@ template uint32_t Indexcalculate_medoid(); } template std::vector Index::get_init_ids() @@ -892,12 +850,16 @@ std::pair Index::iterate_to_fixed_point( std::vector &id_scratch = scratch->id_scratch(); std::vector &dist_scratch = scratch->dist_scratch(); assert(id_scratch.size() == 0); + + // REFACTOR + // T *aligned_query = scratch->aligned_query(); + // memcpy(aligned_query, query, _dim * sizeof(T)); + // if (_normalize_vecs) + 
//{ + // normalize((float *)aligned_query, _dim); + // } + T *aligned_query = scratch->aligned_query(); - memcpy(aligned_query, query, _dim * sizeof(T)); - if (_normalize_vecs) - { - normalize((float *)aligned_query, _dim); - } float *query_float = nullptr; float *query_rotated = nullptr; @@ -999,9 +961,13 @@ std::pair Index::iterate_to_fixed_point( float distance; if (_pq_dist) + { pq_dist_lookup(pq_coord_scratch, 1, this->_num_pq_chunks, pq_dists, &distance); + } else - distance = _distance->compare(_data + _aligned_dim * (size_t)id, aligned_query, (uint32_t)_aligned_dim); + { + distance = _data_store->get_distance(aligned_query, id); + } Neighbor nn = Neighbor(id, distance); best_L_nodes.insert(nn); } @@ -1101,12 +1067,10 @@ std::pair Index::iterate_to_fixed_point( if (m + 1 < id_scratch.size()) { auto nextn = id_scratch[m + 1]; - diskann::prefetch_vector((const char *)_data + _aligned_dim * (size_t)nextn, - sizeof(T) * _aligned_dim); + _data_store->prefetch_vector(nextn); } - dist_scratch.push_back( - _distance->compare(aligned_query, _data + _aligned_dim * (size_t)id, (uint32_t)_aligned_dim)); + dist_scratch.push_back(_data_store->get_distance(aligned_query, id)); } } cmps += (uint32_t)id_scratch.size(); @@ -1131,16 +1095,18 @@ void Index::search_for_point_and_prune(int location, uint32_t L if (!use_filter) { - iterate_to_fixed_point(_data + _aligned_dim * location, Lindex, init_ids, scratch, false, unused_filter_label, - false); + _data_store->get_vector(location, scratch->aligned_query()); + iterate_to_fixed_point(scratch->aligned_query(), Lindex, init_ids, scratch, false, unused_filter_label, false); } else { std::vector filter_specific_start_nodes; for (auto &x : _pts_to_labels[location]) filter_specific_start_nodes.emplace_back(_label_to_medoid_id[x]); - iterate_to_fixed_point(_data + _aligned_dim * location, filteredLindex, filter_specific_start_nodes, scratch, - true, _pts_to_labels[location], false); + + _data_store->get_vector(location, 
scratch->aligned_query()); + iterate_to_fixed_point(scratch->aligned_query(), filteredLindex, filter_specific_start_nodes, scratch, true, + _pts_to_labels[location], false); } auto &pool = scratch->pool(); @@ -1236,8 +1202,7 @@ void Index::occlude_list(const uint32_t location, std::vectorcompare(_data + _aligned_dim * (size_t)iter2->id, - _data + _aligned_dim * (size_t)iter->id, (uint32_t)_aligned_dim); + float djk = _data_store->get_distance(iter2->id, iter->id); if (_dist_metric == diskann::Metric::L2 || _dist_metric == diskann::Metric::COSINE) { occlude_factor[t] = (djk == 0) ? std::numeric_limits::max() @@ -1284,8 +1249,7 @@ void Index::prune_neighbors(const uint32_t location, std::vecto if (_pq_dist) { for (auto &ngh : pool) - ngh.distance = _distance->compare(_data + _aligned_dim * (size_t)ngh.id, - _data + _aligned_dim * (size_t)location, (uint32_t)_aligned_dim); + ngh.distance = _data_store->get_distance(ngh.id, location); } // sort the pool based on distance to query and prune it with occlude_list @@ -1357,8 +1321,7 @@ void Index::inter_insert(uint32_t n, std::vector &pru { if (dummy_visited.find(cur_nbr) == dummy_visited.end() && cur_nbr != des) { - float dist = _distance->compare(_data + _aligned_dim * (size_t)des, - _data + _aligned_dim * (size_t)cur_nbr, (uint32_t)_aligned_dim); + float dist = _data_store->get_distance(des, cur_nbr); dummy_pool.emplace_back(Neighbor(cur_nbr, dist)); dummy_visited.insert(cur_nbr); } @@ -1479,8 +1442,7 @@ void Index::link(IndexWriteParameters ¶meters) { if (dummy_visited.find(cur_nbr) == dummy_visited.end() && cur_nbr != node) { - float dist = _distance->compare(_data + _aligned_dim * (size_t)node, - _data + _aligned_dim * (size_t)cur_nbr, (uint32_t)_aligned_dim); + float dist = _data_store->get_distance(node, cur_nbr); dummy_pool.emplace_back(Neighbor(cur_nbr, dist)); dummy_visited.insert(cur_nbr); } @@ -1504,6 +1466,7 @@ void Index::prune_all_neighbors(const uint32_t max_degree, cons { const uint32_t range = 
max_degree; const uint32_t maxc = max_occlusion_size; + _filtered_index = true; diskann::Timer timer; @@ -1525,8 +1488,7 @@ void Index::prune_all_neighbors(const uint32_t max_degree, cons { if (dummy_visited.find(cur_nbr) == dummy_visited.end() && cur_nbr != node) { - float dist = _distance->compare(_data + _aligned_dim * (size_t)node, - _data + _aligned_dim * (size_t)cur_nbr, (uint32_t)_aligned_dim); + float dist = _data_store->get_distance((location_t)node, (location_t)cur_nbr); dummy_pool.emplace_back(Neighbor(cur_nbr, dist)); dummy_visited.insert(cur_nbr); } @@ -1564,6 +1526,7 @@ void Index::prune_all_neighbors(const uint32_t max_degree, cons } } +// REFACTOR template void Index::set_start_points(const T *data, size_t data_count) { @@ -1572,10 +1535,15 @@ void Index::set_start_points(const T *data, size_t data_count) if (_nd > 0) throw ANNException("Can not set starting point for a non-empty index", -1, __FUNCSIG__, __FILE__, __LINE__); - if (data_count != _num_frozen_pts * _aligned_dim) + if (data_count != _num_frozen_pts * _dim) throw ANNException("Invalid number of points", -1, __FUNCSIG__, __FILE__, __LINE__); - memcpy(_data + _aligned_dim * _max_points, data, _aligned_dim * sizeof(T) * _num_frozen_pts); + // memcpy(_data + _aligned_dim * _max_points, data, _aligned_dim * + // sizeof(T) * _num_frozen_pts); + for (location_t i = 0; i < _num_frozen_pts; i++) + { + _data_store->set_vector((location_t)(i + _max_points), data + i * _dim); + } _has_built = true; diskann::cout << "Index start points set: #" << _num_frozen_pts << std::endl; } @@ -1587,8 +1555,8 @@ void Index::set_start_points_at_random(T radius, uint32_t rando std::normal_distribution<> d{0.0, 1.0}; std::vector points_data; - points_data.reserve(_aligned_dim * _num_frozen_pts); - std::vector real_vec(_aligned_dim); + points_data.reserve(_dim * _num_frozen_pts); + std::vector real_vec(_dim); for (size_t frozen_point = 0; frozen_point < _num_frozen_pts; frozen_point++) { @@ -1622,7 +1590,6 @@ void 
Index::build_with_data_populated(IndexWriteParameters &par stream << "ERROR: Driver requests loading " << _nd << " points from file," << "but tags vector is of size " << tags.size() << "." << std::endl; diskann::cerr << stream.str() << std::endl; - aligned_free(_data); throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } if (_enable_tags) @@ -1641,7 +1608,8 @@ void Index::build_with_data_populated(IndexWriteParameters &par if (_query_scratch.size() == 0) { - initialize_query_scratch(5 + num_threads_index, index_L, index_L, index_R, maxc, _aligned_dim); + initialize_query_scratch(5 + num_threads_index, index_L, index_L, index_R, maxc, + _data_store->get_aligned_dim()); } generate_frozen_point(); @@ -1684,15 +1652,17 @@ void Index::build(const T *data, const size_t num_points_to_loa std::unique_lock tl(_tag_lock); _nd = num_points_to_load; - memcpy((char *)_data, (char *)data, _aligned_dim * _nd * sizeof(T)); + _data_store->populate_data(data, (location_t)num_points_to_load); - if (_normalize_vecs) - { - for (size_t i = 0; i < num_points_to_load; i++) - { - normalize(_data + _aligned_dim * i, _aligned_dim); - } - } + // REFACTOR + // memcpy((char *)_data, (char *)data, _aligned_dim * _nd * sizeof(T)); + // if (_normalize_vecs) + //{ + // for (size_t i = 0; i < num_points_to_load; i++) + // { + // normalize(_data + _aligned_dim * i, _aligned_dim); + // } + // } } build_with_data_populated(parameters, tags); @@ -1730,8 +1700,6 @@ void Index::build(const char *filename, const size_t num_points if (_pq_dist) aligned_free(_pq_data); - else - aligned_free(_data); throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } @@ -1743,8 +1711,6 @@ void Index::build(const char *filename, const size_t num_points if (_pq_dist) aligned_free(_pq_data); - else - aligned_free(_data); throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } @@ -1757,8 +1723,6 @@ void Index::build(const char *filename, const 
size_t num_points if (_pq_dist) aligned_free(_pq_data); - else - aligned_free(_data); throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } @@ -1784,15 +1748,7 @@ void Index::build(const char *filename, const size_t num_points #endif } - copy_aligned_data_from_file(filename, _data, file_num_points, file_dim, _aligned_dim); - if (_normalize_vecs) - { - for (size_t i = 0; i < file_num_points; i++) - { - normalize(_data + _aligned_dim * i, _aligned_dim); - } - } - + _data_store->populate_data(filename, 0U); diskann::cout << "Using only first " << num_points_to_load << " from file.. " << std::endl; { @@ -1947,8 +1903,8 @@ void Index::build_filtered_index(const char *filename, const st _label_to_medoid_id.clear(); size_t num_points_labels = 0; parse_label_file(label_file, - num_points_labels); // determines medoid for each label and - // identifies the points to label mapping + num_points_labels); // determines medoid for each label and identifies + // the points to label mapping std::unordered_map> label_to_points; @@ -2035,7 +1991,9 @@ std::pair Index::search(const T *query, con std::shared_lock lock(_update_lock); - auto retval = iterate_to_fixed_point(query, L, init_ids, scratch, false, unused_filter_label, true); + _distance->preprocess_query(query, _data_store->get_dims(), scratch->aligned_query()); + auto retval = + iterate_to_fixed_point(scratch->aligned_query(), L, init_ids, scratch, false, unused_filter_label, true); NeighborPriorityQueue &best_L_nodes = scratch->best_l_nodes(); @@ -2109,10 +2067,11 @@ std::pair Index::search_with_filters(const } filter_vec.emplace_back(filter_label); - T *aligned_query = scratch->aligned_query(); - memcpy(aligned_query, query, _dim * sizeof(T)); - - auto retval = iterate_to_fixed_point(aligned_query, L, init_ids, scratch, true, filter_vec, true); + // REFACTOR + // T *aligned_query = scratch->aligned_query(); + // memcpy(aligned_query, query, _dim * sizeof(T)); + _distance->preprocess_query(query, 
_data_store->get_dims(), scratch->aligned_query()); + auto retval = iterate_to_fixed_point(scratch->aligned_query(), L, init_ids, scratch, true, filter_vec, true); auto best_L_nodes = scratch->best_l_nodes(); @@ -2171,7 +2130,9 @@ size_t Index::search_with_tags(const T *query, const uint64_t K const std::vector init_ids = get_init_ids(); const std::vector unused_filter_label; - iterate_to_fixed_point(query, L, init_ids, scratch, false, unused_filter_label, true); + _distance->preprocess_query(query, _data_store->get_dims(), scratch->aligned_query()); + iterate_to_fixed_point(scratch->aligned_query(), L, init_ids, scratch, false, unused_filter_label, true); + NeighborPriorityQueue &best_L_nodes = scratch->best_l_nodes(); assert(best_L_nodes.size() <= L); @@ -2189,7 +2150,7 @@ size_t Index::search_with_tags(const T *query, const uint64_t K if (res_vectors.size() > 0) { - memcpy(res_vectors[pos], _data + ((size_t)node.id) * _aligned_dim, _dim * sizeof(T)); + _data_store->get_vector(node.id, res_vectors[pos]); } if (distances != nullptr) @@ -2248,7 +2209,7 @@ template void Indexcopy_vectors((location_t)res, (location_t)_max_points, 1); } } @@ -2286,7 +2247,7 @@ inline void Index::process_delete(const tsl::robin_set &expanded_nghrs_vec = scratch->expanded_nodes_vec(); // If this condition were not true, deadlock could result - assert(old_delete_set.find(loc) == old_delete_set.end()); + assert(old_delete_set.find((uint32_t)loc) == old_delete_set.end()); std::vector adj_list; { @@ -2332,9 +2293,7 @@ inline void Index::process_delete(const tsl::robin_setcompare(_data + _aligned_dim * loc, _data + _aligned_dim * ngh, (uint32_t)_aligned_dim)); + expanded_nghrs_vec.emplace_back(ngh, _data_store->get_distance((location_t)loc, (location_t)ngh)); } std::sort(expanded_nghrs_vec.begin(), expanded_nghrs_vec.end()); std::vector &occlude_list_output = scratch->occlude_list_output(); @@ -2538,8 +2497,7 @@ template void Indexcopy_vectors(old, new_location[old], 1); } } else @@ -2570,7 
+2528,6 @@ template void Index::reposition_points(uint32_t old_location_start, uint mem_clear_loc_end_limit = new_location_start; } } - - // Use memmove to handle overlapping ranges. - memmove(_data + _aligned_dim * new_location_start, _data + _aligned_dim * old_location_start, - sizeof(T) * _aligned_dim * num_locations); - memset(_data + _aligned_dim * mem_clear_loc_start, 0, - sizeof(T) * _aligned_dim * (mem_clear_loc_end_limit - mem_clear_loc_start)); + _data_store->move_vectors(old_location_start, new_location_start, num_locations); } template void Index::reposition_frozen_point_to_end() @@ -2728,15 +2680,7 @@ template void Indexresize((location_t)new_internal_points); _final_graph.resize(new_internal_points); _locks = std::vector(new_internal_points); @@ -2829,15 +2773,7 @@ int Index::insert_point(const T *point, const TagT tag) } tl.unlock(); - // Copy the vector in to the data array - auto offset_data = _data + (size_t)_aligned_dim * location; - memset((void *)offset_data, 0, sizeof(T) * _aligned_dim); - memcpy((void *)offset_data, point, sizeof(T) * _dim); - - if (_normalize_vecs) - { - normalize((float *)offset_data, _dim); - } + _data_store->set_vector(location, point); // Find and add appropriate graph edges ScratchStoreManager> manager(_query_scratch); @@ -3005,6 +2941,13 @@ template void Index void +// Index::optimize_index_layout() +//{ // use after build or load +//} + +// REFACTOR: This should be an OptimizedDataStore class template void Index::optimize_index_layout() { // use after build or load if (_dynamic_index) @@ -3013,17 +2956,20 @@ template void Indexget_aligned_dim()]; + std::memset(cur_vec, 0, _data_store->get_aligned_dim() * sizeof(float)); + _data_len = (_data_store->get_aligned_dim() + 1) * sizeof(float); _neighbor_len = (_max_observed_degree + 1) * sizeof(uint32_t); _node_size = _data_len + _neighbor_len; _opt_graph = new char[_node_size * _nd]; - DistanceFastL2 *dist_fast = (DistanceFastL2 *)_distance; + DistanceFastL2 *dist_fast = 
(DistanceFastL2 *)_data_store->get_dist_fn(); for (uint32_t i = 0; i < _nd; i++) { char *cur_node_offset = _opt_graph + i * _node_size; - float cur_norm = dist_fast->norm(_data + i * _aligned_dim, (uint32_t)_aligned_dim); + _data_store->get_vector(i, (T *)cur_vec); + float cur_norm = dist_fast->norm((T *)cur_vec, (uint32_t)_data_store->get_aligned_dim()); std::memcpy(cur_node_offset, &cur_norm, sizeof(float)); - std::memcpy(cur_node_offset + sizeof(float), _data + i * _aligned_dim, _data_len - sizeof(float)); + std::memcpy(cur_node_offset + sizeof(float), cur_vec, _data_len - sizeof(float)); cur_node_offset += _data_len; uint32_t k = (uint32_t)_final_graph[i].size(); @@ -3033,12 +2979,21 @@ template void Index +// void Index::search_with_optimized_layout(const T *query, +// size_t K, size_t L, uint32_t *indices) +//{ +//} + template void Index::search_with_optimized_layout(const T *query, size_t K, size_t L, uint32_t *indices) { - DistanceFastL2 *dist_fast = (DistanceFastL2 *)_distance; + DistanceFastL2 *dist_fast = (DistanceFastL2 *)_data_store->get_dist_fn(); NeighborPriorityQueue retset(L); std::vector init_ids(L); @@ -3081,7 +3036,7 @@ void Index::search_with_optimized_layout(const T *query, size_t T *x = (T *)(_opt_graph + _node_size * id); float norm_x = *x; x++; - float dist = dist_fast->compare(x, query, norm_x, (uint32_t)_aligned_dim); + float dist = dist_fast->compare(x, query, norm_x, (uint32_t)_data_store->get_aligned_dim()); retset.insert(Neighbor(id, dist)); flags[id] = true; L++; @@ -3106,7 +3061,7 @@ void Index::search_with_optimized_layout(const T *query, size_t T *data = (T *)(_opt_graph + _node_size * id); float norm = *data; data++; - float dist = dist_fast->compare(query, data, norm, (uint32_t)_aligned_dim); + float dist = dist_fast->compare(query, data, norm, (uint32_t)_data_store->get_aligned_dim()); Neighbor nn(id, dist); retset.insert(nn); } diff --git a/src/scratch.cpp b/src/scratch.cpp index dbbf226eb..a12d697c4 100644 --- 
a/src/scratch.cpp +++ b/src/scratch.cpp @@ -13,7 +13,7 @@ namespace diskann // template InMemQueryScratch::InMemQueryScratch(uint32_t search_l, uint32_t indexing_l, uint32_t r, uint32_t maxc, size_t dim, - bool init_pq_scratch) + size_t aligned_dim, size_t alignment_factor, bool init_pq_scratch) : _L(0), _R(r), _maxc(maxc) { if (search_l == 0 || indexing_l == 0 || r == 0 || dim == 0) @@ -24,8 +24,7 @@ InMemQueryScratch::InMemQueryScratch(uint32_t search_l, uint32_t indexing_l, throw diskann::ANNException(ss.str(), -1); } - auto aligned_dim = ROUND_UP(dim, 8); - alloc_aligned(((void **)&_aligned_query), aligned_dim * sizeof(T), 8 * sizeof(T)); + alloc_aligned(((void **)&_aligned_query), aligned_dim * sizeof(T), alignment_factor * sizeof(T)); memset(_aligned_query, 0, aligned_dim * sizeof(T)); if (init_pq_scratch) diff --git a/src/windows_aligned_file_reader.cpp b/src/windows_aligned_file_reader.cpp index 0f1fd4ee3..9b1a024bd 100644 --- a/src/windows_aligned_file_reader.cpp +++ b/src/windows_aligned_file_reader.cpp @@ -11,7 +11,6 @@ void WindowsAlignedFileReader::open(const std::string &fname) { - #ifdef UNICODE m_filename = std::wstring(fname.begin(), fname.end()); #else diff --git a/tests/build_memory_index.cpp b/tests/build_memory_index.cpp index 8d68feb0e..3712350c3 100644 --- a/tests/build_memory_index.cpp +++ b/tests/build_memory_index.cpp @@ -76,7 +76,8 @@ int main(int argc, char **argv) { desc.add_options()("help,h", "Print information on arguments"); desc.add_options()("data_type", po::value(&data_type)->required(), "data type "); - desc.add_options()("dist_fn", po::value(&dist_fn)->required(), "distance function "); + desc.add_options()("dist_fn", po::value(&dist_fn)->required(), + "distance function "); desc.add_options()("data_path", po::value(&data_path)->required(), "Input data file in bin format"); desc.add_options()("index_path_prefix", po::value(&index_path_prefix)->required(), diff --git a/tests/search_disk_index.cpp b/tests/search_disk_index.cpp 
index a2528c92b..02d734c74 100644 --- a/tests/search_disk_index.cpp +++ b/tests/search_disk_index.cpp @@ -79,7 +79,9 @@ int search_disk_index(diskann::Metric &metric, const std::string &index_path_pre filtered_search = true; if (query_filters.size() != 1 && query_filters.size() != query_num) { - std::cout << "Error. Mismatch in number of queries and size of query filters file" << std::endl; + std::cout << "Error. Mismatch in number of queries and size of query " + "filters file" + << std::endl; return -1; // To return -1 or some other error handling? } } diff --git a/tests/search_memory_index.cpp b/tests/search_memory_index.cpp index 8fa17d4e8..02c96db24 100644 --- a/tests/search_memory_index.cpp +++ b/tests/search_memory_index.cpp @@ -59,7 +59,9 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path, filtered_search = true; if (query_filters.size() != 1 && query_filters.size() != query_num) { - std::cout << "Error. Mismatch in number of queries and size of query filters file" << std::endl; + std::cout << "Error. Mismatch in number of queries and size of query " + "filters file" + << std::endl; return -1; // To return -1 or some other error handling? 
} } diff --git a/tests/test_insert_deletes_consolidate.cpp b/tests/test_insert_deletes_consolidate.cpp index 40d2c6344..c42972402 100644 --- a/tests/test_insert_deletes_consolidate.cpp +++ b/tests/test_insert_deletes_consolidate.cpp @@ -142,7 +142,6 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con const std::string &save_path, size_t points_to_delete_from_beginning, size_t start_deletes_after, bool concurrent) { - diskann::IndexWriteParameters params = diskann::IndexWriteParametersBuilder(L, R) .with_max_occlusion_size(500) // C = 500 .with_alpha(alpha) diff --git a/tests/test_streaming_scenario.cpp b/tests/test_streaming_scenario.cpp index 4d308b75c..e1fe80c83 100644 --- a/tests/test_streaming_scenario.cpp +++ b/tests/test_streaming_scenario.cpp @@ -194,6 +194,8 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con size_t num_points; diskann::get_bin_metadata(data_path, num_points, dim); + diskann::cout << "metadata: file " << data_path << " has " << num_points << " points in " << dim << " dims" + << std::endl; aligned_dim = ROUND_UP(dim, 8); if (max_points_to_insert == 0) diff --git a/tests/utils/compute_groundtruth.cpp b/tests/utils/compute_groundtruth.cpp index ac448c09b..991f29ff6 100644 --- a/tests/utils/compute_groundtruth.cpp +++ b/tests/utils/compute_groundtruth.cpp @@ -505,8 +505,10 @@ int main(int argc, char **argv) desc.add_options()("query_file", po::value(&query_file)->required(), "File containing the query vectors in binary format"); desc.add_options()("gt_file", po::value(>_file)->required(), - "File name for the writing ground truth in binary format, please don' append .bin at end if " - "no filter_label or filter_label_file is provided it will save the file with '.bin' at end." + "File name for the writing ground truth in binary " + "format, please don' append .bin at end if " + "no filter_label or filter_label_file is provided it " + "will save the file with '.bin' at end." 
"else it will save the file as filename_label.bin"); desc.add_options()("K", po::value(&K)->required(), "Number of ground truth nearest neighbors to compute"); diff --git a/tests/utils/compute_groundtruth_for_filters.cpp b/tests/utils/compute_groundtruth_for_filters.cpp index d101a07ff..eb962257d 100644 --- a/tests/utils/compute_groundtruth_for_filters.cpp +++ b/tests/utils/compute_groundtruth_for_filters.cpp @@ -670,8 +670,10 @@ int main(int argc, char **argv) desc.add_options()("universal_label", po::value(&universal_label)->default_value(""), "Universal label, if using it, only in conjunction with label_file"); desc.add_options()("gt_file", po::value(>_file)->required(), - "File name for the writing ground truth in binary format, please don' append .bin at end if " - "no filter_label or filter_label_file is provided it will save the file with '.bin' at end." + "File name for the writing ground truth in binary " + "format, please don' append .bin at end if " + "no filter_label or filter_label_file is provided it " + "will save the file with '.bin' at end." "else it will save the file as filename_label.bin"); desc.add_options()("K", po::value(&K)->required(), "Number of ground truth nearest neighbors to compute"); diff --git a/tests/utils/generate_synthetic_labels.cpp b/tests/utils/generate_synthetic_labels.cpp index 9c659c4ca..3de2130fb 100644 --- a/tests/utils/generate_synthetic_labels.cpp +++ b/tests/utils/generate_synthetic_labels.cpp @@ -103,7 +103,8 @@ int main(int argc, char **argv) desc.add_options()("num_labels,L", po::value(&num_labels)->required(), "Number of unique labels, up to 5000"); desc.add_options()("distribution_type,DT", po::value(&distribution_type)->default_value("random"), - "Distribution function for labels defaults to random"); + "Distribution function for labels defaults " + "to random"); po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm);