diff --git a/include/index.h b/include/index.h index 228eee13e..962f758e3 100644 --- a/include/index.h +++ b/include/index.h @@ -75,7 +75,7 @@ template clas DISKANN_DLLEXPORT Index(Metric m, const size_t dim, const size_t max_points = 1, const bool dynamic_index = false, const bool enable_tags = false, const bool concurrent_consolidate = false, const bool pq_dist_build = false, const size_t num_pq_chunks = 0, - const bool use_opq = false); + const bool use_opq = false, const size_t num_frozen_pts = 0); // Constructor for incremental index DISKANN_DLLEXPORT Index(Metric m, const size_t dim, const size_t max_points, const bool dynamic_index, @@ -93,6 +93,9 @@ template clas #ifdef EXEC_ENV_OLS DISKANN_DLLEXPORT void load(AlignedFileReader &reader, uint32_t num_threads, uint32_t search_l); #else + // Reads the number of frozen points from graph's metadata file section. + DISKANN_DLLEXPORT static size_t get_graph_num_frozen_points(const std::string &graph_file); + DISKANN_DLLEXPORT void load(const char *index_file, uint32_t num_threads, uint32_t search_l); #endif @@ -122,10 +125,13 @@ template clas // Get converted integer label from string to int map (_label_map) DISKANN_DLLEXPORT LabelT get_converted_label(const std::string &raw_label); - // Set starting point of an index before inserting any points incrementally - DISKANN_DLLEXPORT void set_start_point(T *data); - // Set starting point to a random point on a sphere of certain radius - DISKANN_DLLEXPORT void set_start_point_at_random(T radius); + // Set starting point of an index before inserting any points incrementally. + // The data count should be equal to _num_frozen_pts * _aligned_dim. + DISKANN_DLLEXPORT void set_start_points(const T *data, size_t data_count); + // Set starting points to random points on a sphere of certain radius. + // A fixed random seed can be specified for scenarios where it's important + // to have higher consistency between index builds. + DISKANN_DLLEXPORT void set_start_points_at_random(T radius, unsigned int random_seed = 0); // For FastL2 search on a static index, we interleave the data with graph DISKANN_DLLEXPORT void optimize_index_layout(); @@ -176,7 +182,8 @@ template clas // repositions frozen points to the end of _data - if they have been moved // during deletion DISKANN_DLLEXPORT void reposition_frozen_point_to_end(); - DISKANN_DLLEXPORT void reposition_point(unsigned old_location, unsigned new_location); + DISKANN_DLLEXPORT void reposition_points(unsigned old_location_start, unsigned new_location_start, + unsigned num_locations); // DISKANN_DLLEXPORT void save_index_as_one_file(bool flag); @@ -210,7 +217,7 @@ template clas // generates 1 frozen point that will never be deleted from the graph // This is not visible to the user - int generate_frozen_point(); + void generate_frozen_point(); // determines navigating node of the graph by calculating medoid of datafopt unsigned calculate_entry_point(); @@ -219,11 +226,13 @@ template clas std::unordered_map load_label_map(const std::string &map_file); + // Returns the locations of start point and frozen points suitable for use with iterate_to_fixed_point. + std::vector get_init_ids(); + std::pair iterate_to_fixed_point(const T *node_coords, const unsigned Lindex, const std::vector &init_ids, InMemQueryScratch *scratch, bool use_filter, - const std::vector &filters, bool ret_frozen = true, - bool search_invocation = false); + const std::vector &filters, bool search_invocation); void search_for_point_and_prune(int location, _u32 Lindex, std::vector &pruned_list, InMemQueryScratch *scratch, bool use_filter = false, _u32 filteredLindex = 0); @@ -312,6 +321,10 @@ template clas size_t _aligned_dim = 0; size_t _nd = 0; // number of active points i.e. existing in the graph size_t _max_points = 0; // total number of points in given data set + // Number of points which are used as initial candidates when iterating to + // closest point(s). These are not visible externally and won't be returned + // by search. DiskANN forces at least 1 frozen point for dynamic index. + // The frozen points have consecutive locations. See also _start below. size_t _num_frozen_pts = 0; size_t _max_range_of_loaded_graph = 0; size_t _node_size; @@ -319,6 +332,9 @@ template clas size_t _neighbor_len; unsigned _max_observed_degree = 0; + // Start point of the search. When _num_frozen_pts is greater than zero, + // this is the location of the first frozen point. Otherwise, this is a + // location of one of the points in index. unsigned _start = 0; bool _has_built = false; @@ -346,7 +362,6 @@ template clas uint32_t _indexingRange; uint32_t _indexingMaxC; float _indexingAlpha; - uint32_t _search_queue_size; // Query scratch data structures ConcurrentQueue *> _query_scratch; diff --git a/include/parameters.h b/include/parameters.h index 81d5fae6a..e24681e7b 100644 --- a/include/parameters.h +++ b/include/parameters.h @@ -52,7 +52,7 @@ class Parameters } } - template inline ParamType Get(const std::string &name, const ParamType &default_value) + template inline ParamType Get(const std::string &name, const ParamType &default_value) const { try { diff --git a/src/index.cpp b/src/index.cpp index 49ac2ccc0..9127cc27b 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -31,7 +31,8 @@ Index::Index(Metric m, const size_t dim, const size_t max_point const Parameters &indexParams, const Parameters &searchParams, const bool enable_tags, const bool concurrent_consolidate, const bool pq_dist_build, const size_t num_pq_chunks, const bool use_opq) - : Index(m, dim, max_points, dynamic_index, enable_tags, concurrent_consolidate) + : Index(m, dim, max_points, dynamic_index, enable_tags, concurrent_consolidate, pq_dist_build, num_pq_chunks, + use_opq, indexParams.Get("num_frozen_pts", 0)) { _indexingQueueSize = indexParams.Get("L"); _indexingRange = indexParams.Get("R"); @@ -50,11 +51,11 @@ Index::Index(Metric m, const size_t dim, const size_t max_point template Index::Index(Metric m, const size_t dim, const size_t max_points, const bool dynamic_index, const bool enable_tags, const bool concurrent_consolidate, const bool pq_dist_build, - const size_t num_pq_chunks, const bool use_opq) - : _dist_metric(m), _dim(dim), _max_points(max_points), _dynamic_index(dynamic_index), _enable_tags(enable_tags), - _indexingMaxC(DEFAULT_MAXC), _query_scratch(nullptr), _conc_consolidate(concurrent_consolidate), - _delete_set(new tsl::robin_set), _pq_dist(pq_dist_build), _use_opq(use_opq), - _num_pq_chunks(num_pq_chunks) + const size_t num_pq_chunks, const bool use_opq, const size_t num_frozen_pts) + : _dist_metric(m), _dim(dim), _num_frozen_pts(num_frozen_pts), _max_points(max_points), + _dynamic_index(dynamic_index), _enable_tags(enable_tags), _indexingMaxC(DEFAULT_MAXC), _query_scratch(nullptr), + _conc_consolidate(concurrent_consolidate), _delete_set(new tsl::robin_set), _pq_dist(pq_dist_build), + _use_opq(use_opq), _num_pq_chunks(num_pq_chunks) { if (dynamic_index && !enable_tags) { @@ -76,7 +77,7 @@ Index::Index(Metric m, const size_t dim, const size_t max_point // data stored to _nd * aligned_dim matrix with necessary zero-padding _aligned_dim = ROUND_UP(_dim, 8); - if (dynamic_index) + if (dynamic_index && _num_frozen_pts == 0) { _num_frozen_pts = 1; } @@ -195,7 +196,7 @@ template _u64 Index 0) { - std::memset((char *)&tag_data[_start], 0, sizeof(TagT)); + std::memset((char *)&tag_data[_start], 0, sizeof(TagT) * _num_frozen_pts); } try { @@ -211,6 +212,8 @@ template _u64 Index _u64 Index::save_data(std::string data_file) { + // Note: at this point, either _nd == _max_points or any frozen points have been + // temporarily moved to _nd, so _nd + _num_frozen_points is the valid location limit. return save_data_in_base_dimensions(data_file, _data, _nd + _num_frozen_pts, _dim, _aligned_dim); } @@ -231,6 +234,8 @@ template _u64 Index::save(const char *filename, bool compact_before_save << std::endl; } + // If frozen points were temporarily compacted to _nd, move back to _max_points. reposition_frozen_point_to_end(); diskann::cout << "Time taken for save: " << timer.elapsed() / 1000000.0 << "s." << std::endl; @@ -398,7 +404,7 @@ size_t Index::load_tags(const std::string tag_filename) throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } - size_t num_data_points = _num_frozen_pts > 0 ? file_num_points - 1 : file_num_points; + const size_t num_data_points = file_num_points - _num_frozen_pts; _location_to_tag.reserve(num_data_points); _tag_to_location.reserve(num_data_points); for (_u32 i = 0; i < (_u32)num_data_points; i++) @@ -451,10 +457,10 @@ size_t Index::load_data(std::string filename) throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } - if (file_num_points > _max_points) + if (file_num_points > _max_points + _num_frozen_pts) { // update and tag lock acquired in load() before calling load_data - resize(file_num_points); + resize(file_num_points - _num_frozen_pts); } #ifdef EXEC_ENV_OLS @@ -615,7 +621,6 @@ void Index::load(const char *filename, uint32_t num_threads, ui << " size(_tag_to_location):" << _tag_to_location.size() << " Max points: " << _max_points << std::endl; - _search_queue_size = search_l; // For incremental index, _query_scratch is initialized in the constructor. // For the bulk index, the params required to initialize _query_scratch // are known only at load time, hence this check and the call to @@ -627,6 +632,27 @@ void Index::load(const char *filename, uint32_t num_threads, ui } } +#ifndef EXEC_ENV_OLS +template +size_t Index::get_graph_num_frozen_points(const std::string &graph_file) +{ + size_t expected_file_size; + unsigned max_observed_degree, start; + _u64 file_frozen_pts; + + std::ifstream in; + in.exceptions(std::ios::badbit | std::ios::failbit); + + in.open(graph_file, std::ios::binary); + in.read((char *)&expected_file_size, sizeof(_u64)); + in.read((char *)&max_observed_degree, sizeof(unsigned)); + in.read((char *)&start, sizeof(unsigned)); + in.read((char *)&file_frozen_pts, sizeof(_u64)); + + return file_frozen_pts; +} +#endif + #ifdef EXEC_ENV_OLS template size_t Index::load_graph(AlignedFileReader &reader, size_t expected_num_points) @@ -693,15 +719,17 @@ size_t Index::load_graph(std::string filename, size_t expected_ diskann::cout << "Loading vamana graph " << filename << "..." << std::flush; #endif + const size_t expected_max_points = expected_num_points - file_frozen_pts; + // If user provides more points than max_points // resize the _final_graph to the larger size. - if (_max_points < expected_num_points) + if (_max_points < expected_max_points) { - diskann::cout << "Number of points in data: " << expected_num_points + diskann::cout << "Number of points in data: " << expected_max_points << " is greater than max_points: " << _max_points - << " Setting max points to: " << expected_num_points << std::endl; - _final_graph.resize(expected_num_points + _num_frozen_pts); - _max_points = expected_num_points; + << " Setting max points to: " << expected_max_points << std::endl; + _final_graph.resize(expected_max_points + _num_frozen_pts); + _max_points = expected_max_points; } #ifdef EXEC_ENV_OLS _u32 nodes_read = 0; @@ -831,10 +859,28 @@ template unsigned Index std::vector Index::get_init_ids() +{ + std::vector init_ids; + init_ids.reserve(1 + _num_frozen_pts); + + init_ids.emplace_back(_start); + + for (unsigned frozen = _max_points; frozen < _max_points + _num_frozen_pts; frozen++) + { + if (frozen != _start) + { + init_ids.emplace_back(frozen); + } + } + + return init_ids; +} + template std::pair Index::iterate_to_fixed_point( const T *query, const unsigned Lsize, const std::vector &init_ids, InMemQueryScratch *scratch, - bool use_filter, const std::vector &filter_label, bool ret_frozen, bool search_invocation) + bool use_filter, const std::vector &filter_label, bool search_invocation) { std::vector &expanded_nodes = scratch->pool(); NeighborPriorityQueue &best_L_nodes = scratch->best_l_nodes(); @@ -968,7 +1014,7 @@ std::pair Index::iterate_to_fixed_point( auto n = nbr.id; // Add node to expanded nodes to create pool for prune later - if (!search_invocation && (n != _start || _num_frozen_pts == 0 || ret_frozen)) + if (!search_invocation) { if (!use_filter) { @@ -1077,14 +1123,13 @@ void Index::search_for_point_and_prune(int location, _u32 Linde InMemQueryScratch *scratch, bool use_filter, _u32 filteredLindex) { - std::vector init_ids; - init_ids.emplace_back(_start); - - std::vector dummy; + const std::vector init_ids = get_init_ids(); + const std::vector unused_filter_label; if (!use_filter) { - iterate_to_fixed_point(_data + _aligned_dim * location, Lindex, init_ids, scratch, false, dummy, true, false); + iterate_to_fixed_point(_data + _aligned_dim * location, Lindex, init_ids, scratch, false, unused_filter_label, + false); } else { @@ -1092,7 +1137,7 @@ void Index::search_for_point_and_prune(int location, _u32 Linde for (auto &x : _pts_to_labels[location]) filter_specific_start_nodes.emplace_back(_label_to_medoid_id[x]); iterate_to_fixed_point(_data + _aligned_dim * location, filteredLindex, filter_specific_start_nodes, scratch, - true, _pts_to_labels[location], true, false); + true, _pts_to_labels[location], false); } auto &pool = scratch->pool(); @@ -1358,8 +1403,11 @@ template void Index 0) - visit_order.emplace_back((unsigned)_max_points); + // If there are any frozen points, add them all. + for (unsigned frozen = _max_points; frozen < _max_points + _num_frozen_pts; frozen++) + { + visit_order.emplace_back(frozen); + } // if there are frozen points, the first such one is set to be the _start if (_num_frozen_pts > 0) @@ -1372,9 +1420,6 @@ template void Index init_ids; - init_ids.emplace_back(_start); - diskann::Timer link_timer; #pragma omp parallel for schedule(dynamic, 2048) @@ -1463,7 +1508,7 @@ void Index::prune_all_nbrs(const Parameters ¶meters) #pragma omp parallel for for (_s64 node = 0; node < (_s64)(_max_points + _num_frozen_pts); node++) { - if ((size_t)node < _nd || (size_t)node == _max_points) + if ((size_t)node < _nd || (size_t)node >= _max_points) { if (_final_graph[node].size() > range) { @@ -1495,14 +1540,17 @@ void Index::prune_all_nbrs(const Parameters ¶meters) diskann::cout << "Prune time : " << timer.elapsed() / 1000 << "ms" << std::endl; size_t max = 0, min = 1 << 30, total = 0, cnt = 0; - for (size_t i = 0; i < (_nd + _num_frozen_pts); i++) + for (size_t i = 0; i < _max_points + _num_frozen_pts; i++) { - std::vector pool = _final_graph[i]; - max = (std::max)(max, pool.size()); - min = (std::min)(min, pool.size()); - total += pool.size(); - if (pool.size() < 2) - cnt++; + if (i < _nd || i >= _max_points) + { + const std::vector &pool = _final_graph[i]; + max = (std::max)(max, pool.size()); + min = (std::min)(min, pool.size()); + total += pool.size(); + if (pool.size() < 2) + cnt++; + } } if (min > max) min = max; @@ -1514,38 +1562,48 @@ void Index::prune_all_nbrs(const Parameters ¶meters) } } -template void Index::set_start_point(T *data) +template +void Index::set_start_points(const T *data, size_t data_count) { std::unique_lock ul(_update_lock); std::unique_lock tl(_tag_lock); if (_nd > 0) throw ANNException("Can not set starting point for a non-empty index", -1, __FUNCSIG__, __FILE__, __LINE__); - memcpy(_data + _aligned_dim * _max_points, data, _aligned_dim * sizeof(T)); + if (data_count != _num_frozen_pts * _aligned_dim) + throw ANNException("Invalid number of points", -1, __FUNCSIG__, __FILE__, __LINE__); + + memcpy(_data + _aligned_dim * _max_points, data, _aligned_dim * sizeof(T) * _num_frozen_pts); _has_built = true; - diskann::cout << "Index start point set" << std::endl; + diskann::cout << "Index start points set: #" << _num_frozen_pts << std::endl; } -template void Index::set_start_point_at_random(T radius) +template +void Index::set_start_points_at_random(T radius, unsigned int random_seed) { - std::vector real_vec; - std::random_device rd{}; - std::mt19937 gen{rd()}; + std::mt19937 gen{random_seed}; std::normal_distribution<> d{0.0, 1.0}; - double norm_sq = 0.0; - for (size_t i = 0; i < _aligned_dim; ++i) + + std::vector points_data; + points_data.reserve(_aligned_dim * _num_frozen_pts); + std::vector real_vec(_aligned_dim); + + for (size_t frozen_point = 0; frozen_point < _num_frozen_pts; frozen_point++) { - auto r = d(gen); - real_vec.push_back(r); - norm_sq += r * r; - } + double norm_sq = 0.0; + for (size_t i = 0; i < _dim; ++i) + { + auto r = d(gen); + real_vec[i] = r; + norm_sq += r * r; + } - double norm = std::sqrt(norm_sq); - std::vector start_vec; - for (auto iter : real_vec) - start_vec.push_back(static_cast(iter * radius / norm)); + const double norm = std::sqrt(norm_sq); + for (auto iter : real_vec) + points_data.push_back(static_cast(iter * radius / norm)); + } - set_start_point(start_vec.data()); + set_start_points(points_data.data(), points_data.size()); } template @@ -1969,12 +2027,12 @@ std::pair Index::search(const T *query, con diskann::cout << "Resize completed. New scratch->L is " << scratch->get_L() << std::endl; } - std::vector dummy; - std::vector init_ids; - init_ids.push_back(_start); + const std::vector unused_filter_label; + const std::vector init_ids = get_init_ids(); + std::shared_lock lock(_update_lock); - auto retval = iterate_to_fixed_point(query, L, init_ids, scratch, false, dummy, true, true); + auto retval = iterate_to_fixed_point(query, L, init_ids, scratch, false, unused_filter_label, true); NeighborPriorityQueue &best_L_nodes = scratch->best_l_nodes(); @@ -2032,8 +2090,8 @@ std::pair Index::search_with_filters(const } std::vector filter_vec; - std::vector init_ids; - init_ids.push_back(_start); + std::vector init_ids = get_init_ids(); + std::shared_lock lock(_update_lock); if (_label_to_medoid_id.find(filter_label) != _label_to_medoid_id.end()) @@ -2051,7 +2109,7 @@ std::pair Index::search_with_filters(const T *aligned_query = scratch->aligned_query(); memcpy(aligned_query, query, _dim * sizeof(T)); - auto retval = iterate_to_fixed_point(aligned_query, L, init_ids, scratch, true, filter_vec, true, true); + auto retval = iterate_to_fixed_point(aligned_query, L, init_ids, scratch, true, filter_vec, true); auto best_L_nodes = scratch->best_l_nodes(); @@ -2107,10 +2165,10 @@ size_t Index::search_with_tags(const T *query, const uint64_t K std::shared_lock ul(_update_lock); - std::vector init_ids(1, _start); - std::vector dummy; + const std::vector init_ids = get_init_ids(); + const std::vector unused_filter_label; - iterate_to_fixed_point(query, L, init_ids, scratch, false, dummy, true, true); + iterate_to_fixed_point(query, L, init_ids, scratch, false, unused_filter_label, true); NeighborPriorityQueue &best_L_nodes = scratch->best_l_nodes(); assert(best_L_nodes.size() <= L); @@ -2161,10 +2219,16 @@ template size_t Index int Index::generate_frozen_point() +template void Index::generate_frozen_point() { if (_num_frozen_pts == 0) - return 0; + return; + + if (_num_frozen_pts > 1) + { + throw ANNException("More than one frozen point not supported in generate_frozen_point", -1, __FUNCSIG__, + __FILE__, __LINE__); + } if (_nd == 0) { @@ -2183,8 +2247,6 @@ template int Index { memcpy(_data + _max_points * _aligned_dim, _data + res * _aligned_dim, _aligned_dim * sizeof(T)); } - - return 0; } template int Index::enable_delete() @@ -2387,31 +2449,10 @@ consolidation_report Index::consolidate_deletes(const Parameter template void Index::compact_frozen_point() { - if (_nd < _max_points) + if (_nd < _max_points && _num_frozen_pts > 0) { - if (_num_frozen_pts == 1) - { - // set new _start to be frozen point - _start = (_u32)_nd; - if (!_final_graph[_max_points].empty()) - { - for (unsigned i = 0; i < _nd; i++) - for (unsigned j = 0; j < _final_graph[i].size(); j++) - if (_final_graph[i][j] == _max_points) - _final_graph[i][j] = (_u32)_nd; - - _final_graph[_nd].clear(); - _final_graph[_nd].swap(_final_graph[_max_points]); - - memcpy((void *)(_data + _aligned_dim * _nd), _data + (size_t)_aligned_dim * _max_points, - sizeof(T) * _dim); - memset((_data + (size_t)_aligned_dim * _max_points), 0, sizeof(T) * _aligned_dim); - } - } - else if (_num_frozen_pts > 1) - { - throw ANNException("Case not implemented.", -1, __FUNCSIG__, __FILE__, __LINE__); - } + reposition_points(_max_points, _nd, _num_frozen_pts); + _start = (_u32)_nd; } } @@ -2593,22 +2634,69 @@ size_t Index::release_locations(const tsl::robin_set } template -void Index::reposition_point(unsigned old_location, unsigned new_location) +void Index::reposition_points(unsigned old_location_start, unsigned new_location_start, + unsigned num_locations) { - for (unsigned i = 0; i < _nd; i++) - for (unsigned j = 0; j < _final_graph[i].size(); j++) - if (_final_graph[i][j] == old_location) - _final_graph[i][j] = (unsigned)new_location; + if (num_locations == 0 || old_location_start == new_location_start) + { + return; + } + + // Update pointers to the moved nodes. Note: the computation is correct even when + // new_location_start < old_location_start given the C++ unsigned integer arithmetic + // rules. + const unsigned location_delta = new_location_start - old_location_start; + + for (unsigned i = 0; i < _max_points + _num_frozen_pts; i++) + for (auto &loc : _final_graph[i]) + if (loc >= old_location_start && loc < old_location_start + num_locations) + loc += location_delta; - _final_graph[new_location].clear(); - for (unsigned k = 0; k < _final_graph[_nd].size(); k++) - _final_graph[new_location].emplace_back(_final_graph[old_location][k]); + // The [start, end) interval which will contain obsolete points to be cleared. + unsigned mem_clear_loc_start = old_location_start; + unsigned mem_clear_loc_end_limit = old_location_start + num_locations; - _final_graph[old_location].clear(); + // Move the adjacency lists. Make sure that overlapping ranges are handled correctly. + if (new_location_start < old_location_start) + { + // New location before the old location: copy the entries in order + // to avoid modifying locations that are yet to be copied. + for (unsigned loc_offset = 0; loc_offset < num_locations; loc_offset++) + { + assert(_final_graph[new_location_start + loc_offset].empty()); + _final_graph[new_location_start + loc_offset].swap(_final_graph[old_location_start + loc_offset]); + } + + // If ranges are overlapping, make sure not to clear the newly copied data. + if (mem_clear_loc_start < new_location_start + num_locations) + { + // Clear only after the end of the new range. + mem_clear_loc_start = new_location_start + num_locations; + } + } + else + { + // Old location after the new location: copy from the end of the range + // to avoid modifying locations that are yet to be copied. + for (unsigned loc_offset = num_locations; loc_offset > 0; loc_offset--) + { + assert(_final_graph[new_location_start + loc_offset - 1u].empty()); + _final_graph[new_location_start + loc_offset - 1u].swap(_final_graph[old_location_start + loc_offset - 1u]); + } + + // If ranges are overlapping, make sure not to clear the newly copied data. + if (mem_clear_loc_end_limit > new_location_start) + { + // Clear only up to the beginning of the new range. + mem_clear_loc_end_limit = new_location_start; + } + } - memcpy((void *)(_data + (size_t)_aligned_dim * new_location), _data + (size_t)_aligned_dim * old_location, - sizeof(T) * _aligned_dim); - memset((_data + (size_t)_aligned_dim * old_location), 0, sizeof(T) * _aligned_dim); + // Use memmove to handle overlapping ranges. + memmove(_data + _aligned_dim * new_location_start, _data + _aligned_dim * old_location_start, + sizeof(T) * _aligned_dim * num_locations); + memset(_data + _aligned_dim * mem_clear_loc_start, 0, + sizeof(T) * _aligned_dim * (mem_clear_loc_end_limit - mem_clear_loc_start)); } template void Index::reposition_frozen_point_to_end() @@ -2621,30 +2709,36 @@ template void Index void Index::resize(size_t new_max_points) { + const size_t new_internal_points = new_max_points + _num_frozen_pts; auto start = std::chrono::high_resolution_clock::now(); assert(_empty_slots.size() == 0); // should not resize if there are empty slots. + #ifndef _WINDOWS T *new_data; - alloc_aligned((void **)&new_data, (new_max_points + 1) * _aligned_dim * sizeof(T), 8 * sizeof(T)); - memcpy(new_data, _data, (_max_points + 1) * _aligned_dim * sizeof(T)); + alloc_aligned((void **)&new_data, new_internal_points * _aligned_dim * sizeof(T), 8 * sizeof(T)); + memcpy(new_data, _data, (_max_points + _num_frozen_pts) * _aligned_dim * sizeof(T)); aligned_free(_data); _data = new_data; #else - realloc_aligned((void **)&_data, (new_max_points + 1) * _aligned_dim * sizeof(T), 8 * sizeof(T)); + realloc_aligned((void **)&_data, new_internal_points * _aligned_dim * sizeof(T), 8 * sizeof(T)); #endif - _final_graph.resize(new_max_points + 1); - _locks = std::vector(new_max_points + 1); + _final_graph.resize(new_internal_points); + _locks = std::vector(new_internal_points); - reposition_point((_u32)_max_points, (_u32)new_max_points); - _max_points = new_max_points; - _start = (_u32)new_max_points; + if (_num_frozen_pts != 0) + { + reposition_points((_u32)_max_points, (_u32)new_max_points, (_u32)_num_frozen_pts); + _start = (_u32)new_max_points; + } + _max_points = new_max_points; _empty_slots.reserve(_max_points); for (auto i = _nd; i < _max_points; i++) { @@ -2868,19 +2962,17 @@ template void Index[MAX_BFS_LEVELS]; - if (_dynamic_index) + bfs_sets[0].insert(_start); + visited.set(_start); + + for (unsigned i = _max_points; i < _max_points + _num_frozen_pts; ++i) { - for (unsigned i = _max_points; i < _max_points + _num_frozen_pts; ++i) + if (i != _start) { bfs_sets[0].insert(i); visited.set(i); } } - else - { - bfs_sets[0].insert(_start); - visited.set(_start); - } for (size_t l = 0; l < MAX_BFS_LEVELS - 1; ++l) { diff --git a/tests/search_memory_index.cpp b/tests/search_memory_index.cpp index 971c48d76..4e093998d 100644 --- a/tests/search_memory_index.cpp +++ b/tests/search_memory_index.cpp @@ -60,7 +60,12 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path, } using TagT = uint32_t; - diskann::Index index(metric, query_dim, 0, dynamic, tags); + const bool concurrent = false, pq_dist_build = false, use_opq = false; + const size_t num_pq_chunks = 0; + using IndexType = diskann::Index; + const size_t num_frozen_pts = IndexType::get_graph_num_frozen_points(index_path); + IndexType index(metric, query_dim, 0, dynamic, tags, concurrent, pq_dist_build, num_pq_chunks, use_opq, + num_frozen_pts); std::cout << "Index class instantiated" << std::endl; index.load(index_path.c_str(), num_threads, *(std::max_element(Lvec.begin(), Lvec.end()))); std::cout << "Index loaded" << std::endl; diff --git a/tests/test_insert_deletes_consolidate.cpp b/tests/test_insert_deletes_consolidate.cpp index 9fc2d126c..ef598c659 100644 --- a/tests/test_insert_deletes_consolidate.cpp +++ b/tests/test_insert_deletes_consolidate.cpp @@ -137,9 +137,10 @@ void delete_from_beginning(diskann::Index &index, diskann::Parameters & template void build_incremental_index(const std::string &data_path, const unsigned L, const unsigned R, const float alpha, const unsigned thread_count, size_t points_to_skip, size_t max_points_to_insert, - size_t beginning_index_size, float start_point_norm, size_t points_per_checkpoint, - size_t checkpoints_per_snapshot, const std::string &save_path, - size_t points_to_delete_from_beginning, size_t start_deletes_after, bool concurrent) + size_t beginning_index_size, float start_point_norm, unsigned num_start_pts, + size_t points_per_checkpoint, size_t checkpoints_per_snapshot, + const std::string &save_path, size_t points_to_delete_from_beginning, + size_t start_deletes_after, bool concurrent) { const unsigned C = 500; const bool saturate_graph = false; @@ -154,6 +155,7 @@ void build_incremental_index(const std::string &data_path, const unsigned L, con params.Set("num_threads", thread_count); params.Set("Lf", 0); // TODO: get this from params and default to some // value to make it backward compatible. + params.Set("num_frozen_pts", num_start_pts); size_t dim, aligned_dim; size_t num_points; @@ -179,17 +181,8 @@ void build_incremental_index(const std::string &data_path, const unsigned L, con } using TagT = uint32_t; - unsigned num_frozen = 1; const bool enable_tags = true; - auto num_frozen_str = getenv("TTS_NUM_FROZEN"); - - if (num_frozen_str != nullptr) - { - num_frozen = std::atoi(num_frozen_str); - std::cout << "Overriding num_frozen to" << num_frozen << std::endl; - } - diskann::Index index(diskann::L2, dim, max_points_to_insert, true, params, params, enable_tags, concurrent); @@ -226,7 +219,7 @@ void build_incremental_index(const std::string &data_path, const unsigned L, con } else { - index.set_start_point_at_random(static_cast(start_point_norm)); + index.set_start_points_at_random(static_cast(start_point_norm)); index.enable_delete(); } @@ -337,7 +330,7 @@ void build_incremental_index(const std::string &data_path, const unsigned L, con int main(int argc, char **argv) { std::string data_type, dist_fn, data_path, index_path_prefix; - unsigned num_threads, R, L; + unsigned num_threads, R, L, num_start_pts; float alpha, start_point_norm; size_t points_to_skip, max_points_to_insert, beginning_index_size, points_per_checkpoint, checkpoints_per_snapshot, points_to_delete_from_beginning, start_deletes_after; @@ -379,6 +372,8 @@ int main(int argc, char **argv) desc.add_options()("start_deletes_after", po::value(&start_deletes_after)->default_value(0), ""); desc.add_options()("start_point_norm", po::value(&start_point_norm)->default_value(0), "Set the start point to a random point on a sphere of this radius"); + desc.add_options()("num_start_points", po::value(&num_start_pts)->default_value(0), + "Set the number of random start (frozen) points to use when inserting and searching"); po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); @@ -407,17 +402,17 @@ int main(int argc, char **argv) { if (data_type == std::string("int8")) build_incremental_index(data_path, L, R, alpha, num_threads, points_to_skip, max_points_to_insert, - beginning_index_size, start_point_norm, points_per_checkpoint, - checkpoints_per_snapshot, index_path_prefix, + beginning_index_size, start_point_norm, num_start_pts, + points_per_checkpoint, checkpoints_per_snapshot, index_path_prefix, points_to_delete_from_beginning, start_deletes_after, concurrent); else if (data_type == std::string("uint8")) build_incremental_index(data_path, L, R, alpha, num_threads, points_to_skip, max_points_to_insert, - beginning_index_size, start_point_norm, points_per_checkpoint, - checkpoints_per_snapshot, index_path_prefix, + beginning_index_size, start_point_norm, num_start_pts, + points_per_checkpoint, checkpoints_per_snapshot, index_path_prefix, points_to_delete_from_beginning, start_deletes_after, concurrent); else if (data_type == std::string("float")) build_incremental_index(data_path, L, R, alpha, num_threads, points_to_skip, max_points_to_insert, - beginning_index_size, start_point_norm, points_per_checkpoint, + beginning_index_size, start_point_norm, num_start_pts, points_per_checkpoint, checkpoints_per_snapshot, index_path_prefix, points_to_delete_from_beginning, start_deletes_after, concurrent); else diff --git a/tests/test_streaming_scenario.cpp b/tests/test_streaming_scenario.cpp index 26fa60ce7..b1f655162 100644 --- a/tests/test_streaming_scenario.cpp +++ b/tests/test_streaming_scenario.cpp @@ -169,7 +169,7 @@ template void build_incremental_index(const std::string &data_path, const unsigned L, const unsigned R, const float alpha, const unsigned insert_threads, const unsigned consolidate_threads, size_t max_points_to_insert, size_t active_window, size_t consolidate_interval, - const float start_point_norm, const std::string &save_path) + const float start_point_norm, unsigned num_start_pts, const std::string &save_path) { const unsigned C = 500; const bool saturate_graph = false; @@ -183,6 +183,7 @@ void build_incremental_index(const std::string &data_path, const unsigned L, con params.Set("num_rnds", 1); params.Set("num_threads", insert_threads); params.Set("Lf", 0); + params.Set("num_frozen_pts", num_start_pts); diskann::Parameters delete_params; delete_params.Set("L", L); delete_params.Set("R", R); @@ -215,20 +216,11 @@ void build_incremental_index(const std::string &data_path, const unsigned L, con using TagT = uint32_t; using LabelT = uint32_t; - unsigned num_frozen = 1; const bool enable_tags = true; - auto num_frozen_str = getenv("TTS_NUM_FROZEN"); - - if (num_frozen_str != nullptr) - { - num_frozen = std::atoi(num_frozen_str); - std::cout << "Overriding num_frozen to" << num_frozen << std::endl; - } - diskann::Index index(diskann::L2, dim, active_window + 4 * consolidate_interval, true, params, params, enable_tags, true); - index.set_start_point_at_random(static_cast(start_point_norm)); + index.set_start_points_at_random(static_cast(start_point_norm)); index.enable_delete(); T *data = nullptr; @@ -286,7 +278,7 @@ int main(int argc, char **argv) { std::string data_type, dist_fn, data_path, index_path_prefix; unsigned insert_threads, consolidate_threads; - unsigned R, L; + unsigned R, L, num_start_pts; float alpha, start_point_norm; size_t max_points_to_insert, active_window, consolidate_interval; @@ -325,6 +317,8 @@ int main(int argc, char **argv) "the window while deleting the same number from the left"); desc.add_options()("start_point_norm", po::value(&start_point_norm)->required(), "Set the start point to a random point on a sphere of this radius"); + desc.add_options()("num_start_points", po::value(&num_start_pts)->default_value(0), + "Set the number of random start (frozen) points to use when inserting and searching"); po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); @@ -353,15 +347,15 @@ int main(int argc, char **argv) if (data_type == std::string("int8")) build_incremental_index(data_path, L, R, alpha, insert_threads, consolidate_threads, max_points_to_insert, active_window, consolidate_interval, start_point_norm, - index_path_prefix); + num_start_pts, index_path_prefix); else if (data_type == std::string("uint8")) build_incremental_index(data_path, L, R, alpha, insert_threads, consolidate_threads, max_points_to_insert, active_window, consolidate_interval, - start_point_norm, index_path_prefix); + start_point_norm, num_start_pts, index_path_prefix); else if (data_type == std::string("float")) build_incremental_index(data_path, L, R, alpha, insert_threads, consolidate_threads, max_points_to_insert, active_window, consolidate_interval, start_point_norm, - index_path_prefix); + num_start_pts, index_path_prefix); else std::cout << "Unsupported type. Use float/int8/uint8" << std::endl; }