Skip to content

Commit

Permalink
Consolidate Index Constructors (#418)
Browse files Browse the repository at this point in the history
* initial commit

* updating python bindings to use new ctor

* python binding error fix

* error fix

* reverting some changes -> experiment

* removing redundant code from native index

* python build error fix

* trying to resolve python build error

* attempt at python build fix

* adding IndexSearchParams

* setting search threads to non zero

* minor check removed

* experiment 3 -> making distance fully owned by data_store

* exp 3 clang fix

* exp 4

* making distance as unique_ptr

* trying to fix build

* finally fixing problem

* some minor fix

* adding dll export to index_factory static function

* adding dll export for static fn in index_factory

* code cleanup

* resolving gopal's comments

* resolving build failures
  • Loading branch information
yashpatel007 committed Aug 15, 2023
1 parent 977dd3c commit 6d4e2bf
Show file tree
Hide file tree
Showing 20 changed files with 194 additions and 223 deletions.
44 changes: 0 additions & 44 deletions apps/build_memory_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,50 +22,6 @@

namespace po = boost::program_options;

/**
 * Builds an in-memory index from the vectors in `data_path` and saves it under `save_path`.
 *
 * When `label_file` is empty a plain index is built. Otherwise the labels are first converted
 * to an integer-formatted file (`<save_path>_label_formatted.txt`, with the mapping written to
 * `<save_path>_labels_map.txt`), a filtered index is built from it, and the formatted label
 * file is deleted once the index has been saved.
 *
 * @param metric          distance metric handed to the index constructor
 * @param data_path       binary data file; its point count and dimension are read via get_bin_metadata
 * @param R, L            arguments forwarded to IndexWriteParametersBuilder(L, R)
 * @param alpha           forwarded to with_alpha
 * @param save_path       prefix the index (and the label side files) are written to
 * @param num_threads     forwarded to with_num_threads
 * @param use_pq_build    forwarded to the index constructor
 * @param num_pq_bytes    forwarded to the index constructor
 * @param use_opq         forwarded to the index constructor
 * @param label_file      raw label file; empty means "build an unfiltered index"
 * @param universal_label universal label string; when non-empty the index's universal label is set to 0
 * @param Lf              forwarded to with_filter_list_size
 * @return always 0
 */
template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t>
int build_in_memory_index(const diskann::Metric &metric, const std::string &data_path, const uint32_t R,
                          const uint32_t L, const float alpha, const std::string &save_path,
                          const uint32_t num_threads, const bool use_pq_build, const size_t num_pq_bytes,
                          const bool use_opq, const std::string &label_file, const std::string &universal_label,
                          const uint32_t Lf)
{
    const bool has_labels = !label_file.empty();

    // Side files derived from the save prefix; only used on the filtered path.
    const std::string formatted_labels_path = save_path + "_label_formatted.txt";
    const std::string labels_map_path = save_path + "_labels_map.txt";

    diskann::IndexWriteParameters write_params = diskann::IndexWriteParametersBuilder(L, R)
                                                     .with_filter_list_size(Lf)
                                                     .with_alpha(alpha)
                                                     .with_saturate_graph(false)
                                                     .with_num_threads(num_threads)
                                                     .build();

    size_t num_points = 0;
    size_t dimension = 0;
    diskann::get_bin_metadata(data_path, num_points, dimension);

    diskann::Index<T, TagT, LabelT> index(metric, dimension, num_points, false, false, false, use_pq_build,
                                          num_pq_bytes, use_opq);

    // The timer covers only the build itself, not construction or saving.
    const auto build_start = std::chrono::high_resolution_clock::now();
    if (has_labels)
    {
        convert_labels_string_to_int(label_file, formatted_labels_path, labels_map_path, universal_label);
        if (!universal_label.empty())
        {
            // NOTE(review): the universal label is registered as 0; presumably the converter
            // above maps it to 0 in the formatted file -- confirm against convert_labels_string_to_int.
            const LabelT universal_label_id = 0;
            index.set_universal_label(universal_label_id);
        }
        index.build_filtered_index(data_path.c_str(), formatted_labels_path, num_points, write_params);
    }
    else
    {
        index.build(data_path.c_str(), num_points, write_params);
    }
    const std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - build_start;

    std::cout << "Indexing time: " << elapsed.count() << "\n";
    index.save(save_path.c_str());

    // The formatted label file is an intermediate artifact; drop it after saving.
    if (has_labels)
        std::remove(formatted_labels_path.c_str());

    return 0;
}

int main(int argc, char **argv)
{
std::string data_type, dist_fn, data_path, index_path_prefix, label_file, universal_label, label_type;
Expand Down
2 changes: 1 addition & 1 deletion apps/build_stitched_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ void prune_and_save(path final_index_path_prefix, path full_index_path_prefix, p
auto pruning_index_timer = std::chrono::high_resolution_clock::now();

diskann::get_bin_metadata(input_data_path, number_of_label_points, dimension);
diskann::Index<T> index(diskann::Metric::L2, dimension, number_of_label_points, false, false);
diskann::Index<T> index(diskann::Metric::L2, dimension, number_of_label_points, nullptr, nullptr, 0, false, false);

// not searching this index, set search_l to 0
index.load(full_index_path_prefix.c_str(), num_threads, 1);
Expand Down
4 changes: 2 additions & 2 deletions apps/test_insert_deletes_consolidate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,14 +152,14 @@ void build_incremental_index(const std::string &data_path, diskann::IndexWritePa
using TagT = uint32_t;
auto data_type = diskann_type_to_name<T>();
auto tag_type = diskann_type_to_name<TagT>();
auto index_search_params = diskann::IndexSearchParams(params.search_list_size, params.num_threads);
diskann::IndexConfig index_config = diskann::IndexConfigBuilder()
.with_metric(diskann::L2)
.with_dimension(dim)
.with_max_points(max_points_to_insert)
.is_dynamic_index(true)
.with_index_write_params(params)
.with_search_threads(params.num_threads)
.with_initial_search_list_size(params.search_list_size)
.with_index_search_params(index_search_params)
.with_data_type(data_type)
.with_tag_type(tag_type)
.with_data_load_store_strategy(diskann::MEMORY)
Expand Down
5 changes: 2 additions & 3 deletions apps/test_streaming_scenario.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con
.with_num_frozen_points(num_start_pts)
.build();

auto index_search_params = diskann::IndexSearchParams(L, insert_threads);
diskann::IndexWriteParameters delete_params = diskann::IndexWriteParametersBuilder(L, R)
.with_max_occlusion_size(C)
.with_alpha(alpha)
Expand All @@ -200,7 +201,6 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con
diskann::cout << "metadata: file " << data_path << " has " << num_points << " points in " << dim << " dims"
<< std::endl;
aligned_dim = ROUND_UP(dim, 8);

auto index_config = diskann::IndexConfigBuilder()
.with_metric(diskann::L2)
.with_dimension(dim)
Expand All @@ -210,12 +210,11 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con
.is_use_opq(false)
.with_num_pq_chunks(0)
.is_pq_dist_build(false)
.with_search_threads(insert_threads)
.with_initial_search_list_size(L)
.with_tag_type(diskann_type_to_name<TagT>())
.with_label_type(diskann_type_to_name<LabelT>())
.with_data_type(diskann_type_to_name<T>())
.with_index_write_params(params)
.with_index_search_params(index_search_params)
.with_data_load_store_strategy(diskann::MEMORY)
.build();

Expand Down
2 changes: 1 addition & 1 deletion apps/utils/count_bfs_levels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ template <typename T> void bfs_count(const std::string &index_path, uint32_t dat
{
using TagT = uint32_t;
using LabelT = uint32_t;
diskann::Index<T, TagT, LabelT> index(diskann::Metric::L2, data_dims, 0, false, false);
diskann::Index<T, TagT, LabelT> index(diskann::Metric::L2, data_dims, 0, nullptr, nullptr, 0, false, false);
std::cout << "Index class instantiated" << std::endl;
index.load(index_path.c_str(), 1, 100);
std::cout << "Index loaded" << std::endl;
Expand Down
4 changes: 2 additions & 2 deletions include/in_mem_data_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ namespace diskann
template <typename data_t> class InMemDataStore : public AbstractDataStore<data_t>
{
public:
InMemDataStore(const location_t capacity, const size_t dim, std::shared_ptr<Distance<data_t>> distance_fn);
InMemDataStore(const location_t capacity, const size_t dim, std::unique_ptr<Distance<data_t>> distance_fn);
virtual ~InMemDataStore();

virtual location_t load(const std::string &filename) override;
Expand Down Expand Up @@ -73,7 +73,7 @@ template <typename data_t> class InMemDataStore : public AbstractDataStore<data_
// but this gives us perf benefits as the datastore can do distance
// computations during search and compute norms of vectors internally without
// have to copy data back and forth.
std::shared_ptr<Distance<data_t>> _distance_fn;
std::unique_ptr<Distance<data_t>> _distance_fn;

// in case we need to save vector norms for optimization
std::shared_ptr<float[]> _pre_computed_norms;
Expand Down
20 changes: 7 additions & 13 deletions include/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,21 +49,16 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
**************************************************************************/

public:
// Constructor for Bulk operations and for creating the index object solely
// for loading a prexisting index.
DISKANN_DLLEXPORT Index(Metric m, const size_t dim, const size_t max_points = 1, const bool dynamic_index = false,
// Call this when creating and passing Index Config is inconvenient.
DISKANN_DLLEXPORT Index(Metric m, const size_t dim, const size_t max_points,
const std::shared_ptr<IndexWriteParameters> index_parameters,
const std::shared_ptr<IndexSearchParams> index_search_params,
const size_t num_frozen_pts = 0, const bool dynamic_index = false,
const bool enable_tags = false, const bool concurrent_consolidate = false,
const bool pq_dist_build = false, const size_t num_pq_chunks = 0,
const bool use_opq = false, const size_t num_frozen_pts = 0,
const bool init_data_store = true);

// Constructor for incremental index
DISKANN_DLLEXPORT Index(Metric m, const size_t dim, const size_t max_points, const bool dynamic_index,
const IndexWriteParameters &indexParameters, const uint32_t initial_search_list_size,
const uint32_t search_threads, const bool enable_tags = false,
const bool concurrent_consolidate = false, const bool pq_dist_build = false,
const size_t num_pq_chunks = 0, const bool use_opq = false);
const bool use_opq = false);

// This is called by IndexFactory which returns AbstractIndex's simplified API
DISKANN_DLLEXPORT Index(const IndexConfig &index_config, std::unique_ptr<AbstractDataStore<T>> data_store
/* std::unique_ptr<AbstractGraphStore> graph_store*/);

Expand Down Expand Up @@ -329,7 +324,6 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
private:
// Distance functions
Metric _dist_metric = diskann::L2;
std::shared_ptr<Distance<T>> _distance;

// Data
std::unique_ptr<AbstractDataStore<T>> _data_store;
Expand Down
58 changes: 35 additions & 23 deletions include/index_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,24 +33,23 @@ struct IndexConfig
std::string tag_type;
std::string data_type;

// Params for building index
std::shared_ptr<IndexWriteParameters> index_write_params;

uint32_t search_threads;
uint32_t initial_search_list_size;
// Params for searching index
std::shared_ptr<IndexSearchParams> index_search_params;

private:
IndexConfig(DataStoreStrategy data_strategy, GraphStoreStrategy graph_strategy, Metric metric, size_t dimension,
size_t max_points, size_t num_pq_chunks, size_t num_frozen_points, bool dynamic_index, bool enable_tags,
bool pq_dist_build, bool concurrent_consolidate, bool use_opq, const std::string &data_type,
const std::string &tag_type, const std::string &label_type,
std::shared_ptr<IndexWriteParameters> index_write_params, uint32_t search_threads,
uint32_t initial_search_list_size)
std::shared_ptr<IndexWriteParameters> index_write_params,
std::shared_ptr<IndexSearchParams> index_search_params)
: data_strategy(data_strategy), graph_strategy(graph_strategy), metric(metric), dimension(dimension),
max_points(max_points), dynamic_index(dynamic_index), enable_tags(enable_tags), pq_dist_build(pq_dist_build),
concurrent_consolidate(concurrent_consolidate), use_opq(use_opq), num_pq_chunks(num_pq_chunks),
num_frozen_pts(num_frozen_points), label_type(label_type), tag_type(tag_type), data_type(data_type),
index_write_params(index_write_params), search_threads(search_threads),
initial_search_list_size(initial_search_list_size)
index_write_params(index_write_params), index_search_params(index_search_params)
{
}

Expand All @@ -60,9 +59,7 @@ struct IndexConfig
class IndexConfigBuilder
{
public:
IndexConfigBuilder()
{
}
IndexConfigBuilder() = default;

IndexConfigBuilder &with_metric(Metric m)
{
Expand Down Expand Up @@ -160,15 +157,31 @@ class IndexConfigBuilder
return *this;
}

IndexConfigBuilder &with_search_threads(uint32_t search_threads)
IndexConfigBuilder &with_index_write_params(std::shared_ptr<IndexWriteParameters> index_write_params_ptr)
{
if (index_write_params_ptr == nullptr)
{
diskann::cout << "Passed, empty build_params while creating index config" << std::endl;
return *this;
}
this->_index_write_params = index_write_params_ptr;
return *this;
}

IndexConfigBuilder &with_index_search_params(IndexSearchParams &search_params)
{
this->_search_threads = search_threads;
this->_index_search_params = std::make_shared<IndexSearchParams>(search_params);
return *this;
}

IndexConfigBuilder &with_initial_search_list_size(uint32_t search_list_size)
IndexConfigBuilder &with_index_search_params(std::shared_ptr<IndexSearchParams> search_params_ptr)
{
this->_initial_search_list_size = search_list_size;
if (search_params_ptr == nullptr)
{
diskann::cout << "Passed, empty search_params while creating index config" << std::endl;
return *this;
}
this->_index_search_params = search_params_ptr;
return *this;
}

Expand All @@ -177,19 +190,20 @@ class IndexConfigBuilder
if (_data_type == "" || _data_type.empty())
throw ANNException("Error: data_type can not be empty", -1);

if (_dynamic_index && _index_write_params != nullptr)
if (_dynamic_index && _num_frozen_pts == 0)
{
if (_search_threads == 0)
throw ANNException("Error: please pass search_threads for building dynamic index.", -1);
_num_frozen_pts = 1;
}

if (_initial_search_list_size == 0)
if (_dynamic_index)
{
if (_index_search_params != nullptr && _index_search_params->initial_search_list_size == 0)
throw ANNException("Error: please pass initial_search_list_size for building dynamic index.", -1);
}

return IndexConfig(_data_strategy, _graph_strategy, _metric, _dimension, _max_points, _num_pq_chunks,
_num_frozen_pts, _dynamic_index, _enable_tags, _pq_dist_build, _concurrent_consolidate,
_use_opq, _data_type, _tag_type, _label_type, _index_write_params, _search_threads,
_initial_search_list_size);
_use_opq, _data_type, _tag_type, _label_type, _index_write_params, _index_search_params);
}

IndexConfigBuilder(const IndexConfigBuilder &) = delete;
Expand Down Expand Up @@ -217,8 +231,6 @@ class IndexConfigBuilder
std::string _data_type;

std::shared_ptr<IndexWriteParameters> _index_write_params;

uint32_t _search_threads;
uint32_t _initial_search_list_size;
std::shared_ptr<IndexSearchParams> _index_search_params;
};
} // namespace diskann
10 changes: 6 additions & 4 deletions include/index_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ class IndexFactory
DISKANN_DLLEXPORT explicit IndexFactory(const IndexConfig &config);
DISKANN_DLLEXPORT std::unique_ptr<AbstractIndex> create_instance();

// Construct a data store with the distance function emplaced within
template <typename T>
DISKANN_DLLEXPORT static std::unique_ptr<AbstractDataStore<T>> construct_datastore(DataStoreStrategy stratagy,
size_t num_points,
size_t dimension, Metric m);

private:
void check_config();

template <typename T>
std::unique_ptr<AbstractDataStore<T>> construct_datastore(DataStoreStrategy stratagy, size_t num_points,
size_t dimension);

std::unique_ptr<AbstractGraphStore> construct_graphstore(GraphStoreStrategy stratagy, size_t size);

template <typename data_type, typename tag_type, typename label_type>
Expand Down
11 changes: 11 additions & 0 deletions include/parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,17 @@ class IndexWriteParameters
friend class IndexWriteParametersBuilder;
};

// Immutable pair of search-time settings: the initial search list size ("search L")
// and the number of search threads. Both values are fixed at construction.
class IndexSearchParams
{
  public:
    // Stores both values as given; no validation is performed here.
    IndexSearchParams(const uint32_t initial_search_list_size, const uint32_t num_search_threads)
        : initial_search_list_size{initial_search_list_size}, num_search_threads{num_search_threads}
    {
    }

    // search L
    const uint32_t initial_search_list_size;
    // search threads
    const uint32_t num_search_threads;
};

class IndexWriteParametersBuilder
{
/**
Expand Down
19 changes: 10 additions & 9 deletions python/include/static_disk_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
#include <cstdint>
#include <string>


#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>

Expand All @@ -21,16 +20,16 @@

namespace py = pybind11;

namespace diskannpy {
namespace diskannpy
{

#ifdef _WINDOWS
typedef WindowsAlignedFileReader PlatformSpecificAlignedFileReader;
#else
typedef LinuxAlignedFileReader PlatformSpecificAlignedFileReader;
#endif

template <typename DT>
class StaticDiskIndex
template <typename DT> class StaticDiskIndex
{
public:
StaticDiskIndex(diskann::Metric metric, const std::string &index_path_prefix, uint32_t num_threads,
Expand All @@ -40,13 +39,15 @@ class StaticDiskIndex

void cache_sample_paths(size_t num_nodes_to_cache, const std::string &warmup_query_file, uint32_t num_threads);

NeighborsAndDistances<StaticIdType> search(py::array_t<DT, py::array::c_style | py::array::forcecast> &query, uint64_t knn,
uint64_t complexity, uint64_t beam_width);
NeighborsAndDistances<StaticIdType> search(py::array_t<DT, py::array::c_style | py::array::forcecast> &query,
uint64_t knn, uint64_t complexity, uint64_t beam_width);

NeighborsAndDistances<StaticIdType> batch_search(
py::array_t<DT, py::array::c_style | py::array::forcecast> &queries, uint64_t num_queries, uint64_t knn,
uint64_t complexity, uint64_t beam_width, uint32_t num_threads);

NeighborsAndDistances<StaticIdType> batch_search(py::array_t<DT, py::array::c_style | py::array::forcecast> &queries, uint64_t num_queries,
uint64_t knn, uint64_t complexity, uint64_t beam_width, uint32_t num_threads);
private:
std::shared_ptr<AlignedFileReader> _reader;
diskann::PQFlashIndex<DT> _index;
};
}
} // namespace diskannpy
9 changes: 7 additions & 2 deletions python/src/builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,15 @@ void build_memory_index(const diskann::Metric metric, const std::string &vector_
.with_saturate_graph(false)
.with_num_threads(num_threads)
.build();
diskann::IndexSearchParams index_search_params =
diskann::IndexSearchParams(index_build_params.search_list_size, num_threads);
size_t data_num, data_dim;
diskann::get_bin_metadata(vector_bin_path, data_num, data_dim);
diskann::Index<T, TagT, LabelT> index(metric, data_dim, data_num, use_tags, use_tags, false, use_pq_build,
num_pq_bytes, use_opq);

diskann::Index<T, TagT, LabelT> index(metric, data_dim, data_num,
std::make_shared<diskann::IndexWriteParameters>(index_build_params),
std::make_shared<diskann::IndexSearchParams>(index_search_params), 0,
use_tags, use_tags, false, use_pq_build, num_pq_bytes, use_opq);

if (use_tags)
{
Expand Down
Loading

0 comments on commit 6d4e2bf

Please sign in to comment.