Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consolidate Index Constructors #418

Merged
merged 24 commits into from
Aug 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 0 additions & 44 deletions apps/build_memory_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,50 +22,6 @@

namespace po = boost::program_options;

/**
 * Builds an in-memory index over the vectors stored in `data_path`, prints the
 * elapsed build time to std::cout and saves the index under `save_path`.
 *
 * When `label_file` is empty a plain index is built. Otherwise the labels are
 * first converted by convert_labels_string_to_int() into
 * `<save_path>_label_formatted.txt` (with the mapping written to
 * `<save_path>_labels_map.txt`), a filtered index is built from the converted
 * file, and the converted file is deleted after the index has been saved.
 *
 * @param metric          distance metric handed to the index constructor
 * @param data_path       binary data file; its point count and dimension are read
 *                        through diskann::get_bin_metadata()
 * @param R, L            forwarded to IndexWriteParametersBuilder(L, R)
 * @param alpha           forwarded to with_alpha()
 * @param save_path       prefix used for the saved index and the label side files
 * @param num_threads     forwarded to with_num_threads()
 * @param use_pq_build, num_pq_bytes, use_opq  forwarded to the index constructor
 * @param label_file      label file; empty means "build without filters"
 * @param universal_label if non-empty, the index's universal label is set to 0
 *                        (presumably the id the label conversion assigns to it -
 *                        TODO confirm against convert_labels_string_to_int)
 * @param Lf              forwarded to with_filter_list_size()
 * @return always 0
 */
template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t>
int build_in_memory_index(const diskann::Metric &metric, const std::string &data_path, const uint32_t R,
                          const uint32_t L, const float alpha, const std::string &save_path,
                          const uint32_t num_threads, const bool use_pq_build, const size_t num_pq_bytes,
                          const bool use_opq, const std::string &label_file, const std::string &universal_label,
                          const uint32_t Lf)
{
    const bool filtered_build = !label_file.empty();

    diskann::IndexWriteParameters write_params = diskann::IndexWriteParametersBuilder(L, R)
                                                     .with_filter_list_size(Lf)
                                                     .with_alpha(alpha)
                                                     .with_saturate_graph(false)
                                                     .with_num_threads(num_threads)
                                                     .build();

    // Side files produced by the label conversion step.
    std::string formatted_labels_path = save_path + "_label_formatted.txt";
    std::string labels_map_path = save_path + "_labels_map.txt";

    size_t num_points, dimension;
    diskann::get_bin_metadata(data_path, num_points, dimension);

    diskann::Index<T, TagT, LabelT> index(metric, dimension, num_points, false, false, false, use_pq_build,
                                          num_pq_bytes, use_opq);

    // The timed region covers label conversion (if any) and the build itself.
    const auto build_start = std::chrono::high_resolution_clock::now();
    if (filtered_build)
    {
        convert_labels_string_to_int(label_file, formatted_labels_path, labels_map_path, universal_label);
        if (!universal_label.empty())
        {
            LabelT universal_label_id = 0;
            index.set_universal_label(universal_label_id);
        }
        index.build_filtered_index(data_path.c_str(), formatted_labels_path, num_points, write_params);
    }
    else
    {
        index.build(data_path.c_str(), num_points, write_params);
    }
    const std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - build_start;

    std::cout << "Indexing time: " << elapsed.count() << "\n";
    index.save(save_path.c_str());

    // The converted label file is only an intermediate artifact.
    if (filtered_build)
    {
        std::remove(formatted_labels_path.c_str());
    }
    return 0;
}

int main(int argc, char **argv)
{
std::string data_type, dist_fn, data_path, index_path_prefix, label_file, universal_label, label_type;
Expand Down
2 changes: 1 addition & 1 deletion apps/build_stitched_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ void prune_and_save(path final_index_path_prefix, path full_index_path_prefix, p
auto pruning_index_timer = std::chrono::high_resolution_clock::now();

diskann::get_bin_metadata(input_data_path, number_of_label_points, dimension);
diskann::Index<T> index(diskann::Metric::L2, dimension, number_of_label_points, false, false);
diskann::Index<T> index(diskann::Metric::L2, dimension, number_of_label_points, nullptr, nullptr, 0, false, false);
harsha-simhadri marked this conversation as resolved.
Show resolved Hide resolved
yashpatel007 marked this conversation as resolved.
Show resolved Hide resolved

// not searching this index, set search_l to 0
index.load(full_index_path_prefix.c_str(), num_threads, 1);
Expand Down
4 changes: 2 additions & 2 deletions apps/test_insert_deletes_consolidate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,14 +152,14 @@ void build_incremental_index(const std::string &data_path, diskann::IndexWritePa
using TagT = uint32_t;
auto data_type = diskann_type_to_name<T>();
auto tag_type = diskann_type_to_name<TagT>();
auto index_search_params = diskann::IndexSearchParams(params.search_list_size, params.num_threads);
diskann::IndexConfig index_config = diskann::IndexConfigBuilder()
.with_metric(diskann::L2)
.with_dimension(dim)
.with_max_points(max_points_to_insert)
.is_dynamic_index(true)
.with_index_write_params(params)
.with_search_threads(params.num_threads)
.with_initial_search_list_size(params.search_list_size)
.with_index_search_params(index_search_params)
.with_data_type(data_type)
.with_tag_type(tag_type)
.with_data_load_store_strategy(diskann::MEMORY)
Expand Down
5 changes: 2 additions & 3 deletions apps/test_streaming_scenario.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con
.with_num_frozen_points(num_start_pts)
.build();

auto index_search_params = diskann::IndexSearchParams(L, insert_threads);
harsha-simhadri marked this conversation as resolved.
Show resolved Hide resolved
diskann::IndexWriteParameters delete_params = diskann::IndexWriteParametersBuilder(L, R)
.with_max_occlusion_size(C)
.with_alpha(alpha)
Expand All @@ -200,7 +201,6 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con
diskann::cout << "metadata: file " << data_path << " has " << num_points << " points in " << dim << " dims"
<< std::endl;
aligned_dim = ROUND_UP(dim, 8);

auto index_config = diskann::IndexConfigBuilder()
.with_metric(diskann::L2)
.with_dimension(dim)
Expand All @@ -210,12 +210,11 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con
.is_use_opq(false)
.with_num_pq_chunks(0)
.is_pq_dist_build(false)
.with_search_threads(insert_threads)
.with_initial_search_list_size(L)
.with_tag_type(diskann_type_to_name<TagT>())
.with_label_type(diskann_type_to_name<LabelT>())
.with_data_type(diskann_type_to_name<T>())
.with_index_write_params(params)
.with_index_search_params(index_search_params)
.with_data_load_store_strategy(diskann::MEMORY)
.build();

Expand Down
2 changes: 1 addition & 1 deletion apps/utils/count_bfs_levels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ template <typename T> void bfs_count(const std::string &index_path, uint32_t dat
{
using TagT = uint32_t;
using LabelT = uint32_t;
diskann::Index<T, TagT, LabelT> index(diskann::Metric::L2, data_dims, 0, false, false);
diskann::Index<T, TagT, LabelT> index(diskann::Metric::L2, data_dims, 0, nullptr, nullptr, 0, false, false);
std::cout << "Index class instantiated" << std::endl;
index.load(index_path.c_str(), 1, 100);
std::cout << "Index loaded" << std::endl;
Expand Down
4 changes: 2 additions & 2 deletions include/in_mem_data_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ namespace diskann
template <typename data_t> class InMemDataStore : public AbstractDataStore<data_t>
{
public:
InMemDataStore(const location_t capacity, const size_t dim, std::shared_ptr<Distance<data_t>> distance_fn);
InMemDataStore(const location_t capacity, const size_t dim, std::unique_ptr<Distance<data_t>> distance_fn);
virtual ~InMemDataStore();

virtual location_t load(const std::string &filename) override;
Expand Down Expand Up @@ -73,7 +73,7 @@ template <typename data_t> class InMemDataStore : public AbstractDataStore<data_
// but this gives us perf benefits as the datastore can do distance
// computations during search and compute norms of vectors internally without
// having to copy data back and forth.
std::shared_ptr<Distance<data_t>> _distance_fn;
std::unique_ptr<Distance<data_t>> _distance_fn;

// in case we need to save vector norms for optimization
std::shared_ptr<float[]> _pre_computed_norms;
Expand Down
20 changes: 7 additions & 13 deletions include/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,21 +49,16 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
**************************************************************************/

public:
// Constructor for Bulk operations and for creating the index object solely
// for loading a pre-existing index.
DISKANN_DLLEXPORT Index(Metric m, const size_t dim, const size_t max_points = 1, const bool dynamic_index = false,
// Call this when creating and passing Index Config is inconvenient.
DISKANN_DLLEXPORT Index(Metric m, const size_t dim, const size_t max_points,
const std::shared_ptr<IndexWriteParameters> index_parameters,
const std::shared_ptr<IndexSearchParams> index_search_params,
const size_t num_frozen_pts = 0, const bool dynamic_index = false,
const bool enable_tags = false, const bool concurrent_consolidate = false,
yashpatel007 marked this conversation as resolved.
Show resolved Hide resolved
const bool pq_dist_build = false, const size_t num_pq_chunks = 0,
const bool use_opq = false, const size_t num_frozen_pts = 0,
const bool init_data_store = true);

// Constructor for incremental index
DISKANN_DLLEXPORT Index(Metric m, const size_t dim, const size_t max_points, const bool dynamic_index,
const IndexWriteParameters &indexParameters, const uint32_t initial_search_list_size,
const uint32_t search_threads, const bool enable_tags = false,
const bool concurrent_consolidate = false, const bool pq_dist_build = false,
const size_t num_pq_chunks = 0, const bool use_opq = false);
const bool use_opq = false);

// This is called by IndexFactory which returns AbstractIndex's simplified API
DISKANN_DLLEXPORT Index(const IndexConfig &index_config, std::unique_ptr<AbstractDataStore<T>> data_store
/* std::unique_ptr<AbstractGraphStore> graph_store*/);

Expand Down Expand Up @@ -329,7 +324,6 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
private:
// Distance functions
Metric _dist_metric = diskann::L2;
std::shared_ptr<Distance<T>> _distance;

// Data
std::unique_ptr<AbstractDataStore<T>> _data_store;
Expand Down
58 changes: 35 additions & 23 deletions include/index_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,24 +33,23 @@ struct IndexConfig
std::string tag_type;
std::string data_type;

// Params for building index
std::shared_ptr<IndexWriteParameters> index_write_params;

uint32_t search_threads;
uint32_t initial_search_list_size;
// Params for searching index
std::shared_ptr<IndexSearchParams> index_search_params;

private:
IndexConfig(DataStoreStrategy data_strategy, GraphStoreStrategy graph_strategy, Metric metric, size_t dimension,
size_t max_points, size_t num_pq_chunks, size_t num_frozen_points, bool dynamic_index, bool enable_tags,
bool pq_dist_build, bool concurrent_consolidate, bool use_opq, const std::string &data_type,
const std::string &tag_type, const std::string &label_type,
std::shared_ptr<IndexWriteParameters> index_write_params, uint32_t search_threads,
uint32_t initial_search_list_size)
std::shared_ptr<IndexWriteParameters> index_write_params,
std::shared_ptr<IndexSearchParams> index_search_params)
: data_strategy(data_strategy), graph_strategy(graph_strategy), metric(metric), dimension(dimension),
max_points(max_points), dynamic_index(dynamic_index), enable_tags(enable_tags), pq_dist_build(pq_dist_build),
concurrent_consolidate(concurrent_consolidate), use_opq(use_opq), num_pq_chunks(num_pq_chunks),
num_frozen_pts(num_frozen_points), label_type(label_type), tag_type(tag_type), data_type(data_type),
index_write_params(index_write_params), search_threads(search_threads),
initial_search_list_size(initial_search_list_size)
index_write_params(index_write_params), index_search_params(index_search_params)
{
}

Expand All @@ -60,9 +59,7 @@ struct IndexConfig
class IndexConfigBuilder
{
public:
IndexConfigBuilder()
{
}
IndexConfigBuilder() = default;

IndexConfigBuilder &with_metric(Metric m)
{
Expand Down Expand Up @@ -160,15 +157,31 @@ class IndexConfigBuilder
return *this;
}

IndexConfigBuilder &with_search_threads(uint32_t search_threads)
// Sets the index write (build) parameters used by the config being built.
// A null pointer is reported on diskann::cout and otherwise ignored, leaving
// any previously stored write parameters untouched. Returns *this for chaining.
IndexConfigBuilder &with_index_write_params(std::shared_ptr<IndexWriteParameters> index_write_params_ptr)
{
    if (index_write_params_ptr != nullptr)
    {
        this->_index_write_params = index_write_params_ptr;
    }
    else
    {
        diskann::cout << "Passed, empty build_params while creating index config" << std::endl;
    }
    return *this;
}

// Stores a shared, heap-allocated copy of `search_params` as the search
// parameters of the config being built. The argument is only copied, never
// modified, so it is taken by const reference; this also lets callers pass
// const objects and temporaries. Returns *this for chaining.
IndexConfigBuilder &with_index_search_params(const IndexSearchParams &search_params)
{
    this->_index_search_params = std::make_shared<IndexSearchParams>(search_params);
    return *this;
}

IndexConfigBuilder &with_initial_search_list_size(uint32_t search_list_size)
// Sets the search parameters of the config being built from an existing
// shared pointer (ownership is shared with the caller, no copy of the
// parameters is made). A null pointer is reported on diskann::cout and
// otherwise ignored, leaving any previously stored search parameters
// untouched. Returns *this for chaining.
IndexConfigBuilder &with_index_search_params(std::shared_ptr<IndexSearchParams> search_params_ptr)
{
    if (search_params_ptr == nullptr)
    {
        diskann::cout << "Passed, empty search_params while creating index config" << std::endl;
        return *this;
    }
    this->_index_search_params = search_params_ptr;
    return *this;
}

Expand All @@ -177,19 +190,20 @@ class IndexConfigBuilder
if (_data_type == "" || _data_type.empty())
throw ANNException("Error: data_type can not be empty", -1);

if (_dynamic_index && _index_write_params != nullptr)
if (_dynamic_index && _num_frozen_pts == 0)
{
if (_search_threads == 0)
throw ANNException("Error: please pass search_threads for building dynamic index.", -1);
_num_frozen_pts = 1;
}

if (_initial_search_list_size == 0)
if (_dynamic_index)
{
if (_index_search_params != nullptr && _index_search_params->initial_search_list_size == 0)
throw ANNException("Error: please pass initial_search_list_size for building dynamic index.", -1);
}

return IndexConfig(_data_strategy, _graph_strategy, _metric, _dimension, _max_points, _num_pq_chunks,
_num_frozen_pts, _dynamic_index, _enable_tags, _pq_dist_build, _concurrent_consolidate,
_use_opq, _data_type, _tag_type, _label_type, _index_write_params, _search_threads,
_initial_search_list_size);
_use_opq, _data_type, _tag_type, _label_type, _index_write_params, _index_search_params);
}

IndexConfigBuilder(const IndexConfigBuilder &) = delete;
Expand Down Expand Up @@ -217,8 +231,6 @@ class IndexConfigBuilder
std::string _data_type;

std::shared_ptr<IndexWriteParameters> _index_write_params;

uint32_t _search_threads;
uint32_t _initial_search_list_size;
std::shared_ptr<IndexSearchParams> _index_search_params;
};
} // namespace diskann
10 changes: 6 additions & 4 deletions include/index_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ class IndexFactory
DISKANN_DLLEXPORT explicit IndexFactory(const IndexConfig &config);
DISKANN_DLLEXPORT std::unique_ptr<AbstractIndex> create_instance();

// Construct a data store with the distance function emplaced within
template <typename T>
DISKANN_DLLEXPORT static std::unique_ptr<AbstractDataStore<T>> construct_datastore(DataStoreStrategy stratagy,
size_t num_points,
size_t dimension, Metric m);

private:
void check_config();

template <typename T>
std::unique_ptr<AbstractDataStore<T>> construct_datastore(DataStoreStrategy stratagy, size_t num_points,
size_t dimension);

std::unique_ptr<AbstractGraphStore> construct_graphstore(GraphStoreStrategy stratagy, size_t size);

template <typename data_type, typename tag_type, typename label_type>
Expand Down
11 changes: 11 additions & 0 deletions include/parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,17 @@ class IndexWriteParameters
friend class IndexWriteParametersBuilder;
};

// Immutable pair of search-time settings for an index: the initial search
// list size (search L) and the number of search threads. Both values are
// fixed at construction; because the members are const, instances can be
// copy-constructed but not assigned.
class IndexSearchParams
{
  public:
    // initial_search_list_size: search L
    // num_search_threads:       number of threads used for search
    IndexSearchParams(const uint32_t initial_search_list_size, const uint32_t num_search_threads)
        : initial_search_list_size(initial_search_list_size), num_search_threads(num_search_threads)
    {
    }
    const uint32_t initial_search_list_size; // search L
    const uint32_t num_search_threads;       // search threads
};

class IndexWriteParametersBuilder
{
/**
Expand Down
19 changes: 10 additions & 9 deletions python/include/static_disk_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
#include <cstdint>
#include <string>


#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>

Expand All @@ -21,16 +20,16 @@

namespace py = pybind11;

namespace diskannpy {
namespace diskannpy
{

#ifdef _WINDOWS
typedef WindowsAlignedFileReader PlatformSpecificAlignedFileReader;
#else
typedef LinuxAlignedFileReader PlatformSpecificAlignedFileReader;
#endif

template <typename DT>
class StaticDiskIndex
template <typename DT> class StaticDiskIndex
{
public:
StaticDiskIndex(diskann::Metric metric, const std::string &index_path_prefix, uint32_t num_threads,
Expand All @@ -40,13 +39,15 @@ class StaticDiskIndex

void cache_sample_paths(size_t num_nodes_to_cache, const std::string &warmup_query_file, uint32_t num_threads);

NeighborsAndDistances<StaticIdType> search(py::array_t<DT, py::array::c_style | py::array::forcecast> &query, uint64_t knn,
uint64_t complexity, uint64_t beam_width);
NeighborsAndDistances<StaticIdType> search(py::array_t<DT, py::array::c_style | py::array::forcecast> &query,
uint64_t knn, uint64_t complexity, uint64_t beam_width);

NeighborsAndDistances<StaticIdType> batch_search(
py::array_t<DT, py::array::c_style | py::array::forcecast> &queries, uint64_t num_queries, uint64_t knn,
uint64_t complexity, uint64_t beam_width, uint32_t num_threads);

NeighborsAndDistances<StaticIdType> batch_search(py::array_t<DT, py::array::c_style | py::array::forcecast> &queries, uint64_t num_queries,
uint64_t knn, uint64_t complexity, uint64_t beam_width, uint32_t num_threads);
private:
std::shared_ptr<AlignedFileReader> _reader;
diskann::PQFlashIndex<DT> _index;
};
}
} // namespace diskannpy
9 changes: 7 additions & 2 deletions python/src/builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,15 @@ void build_memory_index(const diskann::Metric metric, const std::string &vector_
.with_saturate_graph(false)
.with_num_threads(num_threads)
.build();
diskann::IndexSearchParams index_search_params =
diskann::IndexSearchParams(index_build_params.search_list_size, num_threads);
size_t data_num, data_dim;
diskann::get_bin_metadata(vector_bin_path, data_num, data_dim);
diskann::Index<T, TagT, LabelT> index(metric, data_dim, data_num, use_tags, use_tags, false, use_pq_build,
num_pq_bytes, use_opq);

diskann::Index<T, TagT, LabelT> index(metric, data_dim, data_num,
std::make_shared<diskann::IndexWriteParameters>(index_build_params),
std::make_shared<diskann::IndexSearchParams>(index_search_params), 0,
use_tags, use_tags, false, use_pq_build, num_pq_bytes, use_opq);

if (use_tags)
{
Expand Down
Loading
Loading