Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create a data store abstraction #305

Merged
merged 75 commits into from
Apr 26, 2023
Merged
Show file tree
Hide file tree
Changes from 67 commits
Commits
Show all changes
75 commits
Select commit Hold shift + click to select a range
29bbef7
gi# This is a combination of 2 commits.
harsha-simhadri Mar 28, 2023
0267012
added some seed files
harsha-simhadri Mar 28, 2023
7131d70
add seed files
harsha-simhadri Mar 31, 2023
73e5287
New distance metric hierarchy
gopal-msr Apr 2, 2023
5351dd5
Merged new distance implementation
gopal-msr Apr 3, 2023
8adc959
Refactoring changes
gopal-msr Apr 3, 2023
412ed82
Fixing compile errors in refactored code
gopal-msr Apr 3, 2023
2af4b1c
Fixing compile errors
gopal-msr Apr 3, 2023
8d6b832
DiskANN Builds with initial refactoring changes
gopal-msr Apr 3, 2023
c417f9a
Saving changes for Ravi
gopal-msr Apr 4, 2023
e82b784
More refactoring
gopal-msr Apr 4, 2023
54c70c5
Refactor
gopal-msr Apr 5, 2023
42becfb
Fixed most of the bugs related to _data
gopal-msr Apr 5, 2023
f62259c
add seed files
harsha-simhadri Mar 31, 2023
d18804c
gi# This is a combination of 2 commits.
harsha-simhadri Mar 28, 2023
4de26eb
added some seed files
harsha-simhadri Mar 28, 2023
a1c07a1
New distance metric hierarchy
gopal-msr Apr 2, 2023
e9c6697
Refactoring changes
gopal-msr Apr 3, 2023
7684cf5
Fixing compile errors in refactored code
gopal-msr Apr 3, 2023
713e4ff
Fixing compile errors
gopal-msr Apr 3, 2023
260766e
DiskANN Builds with initial refactoring changes
gopal-msr Apr 3, 2023
516fb99
Saving changes for Ravi
gopal-msr Apr 4, 2023
f66f975
More refactoring
gopal-msr Apr 4, 2023
91b486c
Refactor
gopal-msr Apr 5, 2023
b348a07
Fixed most of the bugs related to _data
gopal-msr Apr 5, 2023
7cbea15
Merge branch 'rakri_gopal/refactor' of https://github.com/microsoft/D…
gopal-msr Apr 6, 2023
aadafad
Post merge with main
gopal-msr Apr 6, 2023
418ccb9
Refactored version which compiles on Windows
gopal-msr Apr 6, 2023
469c6a5
now compiles on linux
Apr 6, 2023
874c53c
minor clean-up
Apr 9, 2023
c502147
minor bug fix
Apr 9, 2023
310f373
minor bug
Apr 10, 2023
bf11919
clang format fix + build error fix
yashpatel007 Apr 10, 2023
f7e3424
clang format fix
yashpatel007 Apr 10, 2023
f890a96
minor changes
harsha-simhadri Apr 11, 2023
0528046
added back the fast_l2 feature
Apr 11, 2023
85b44e1
added back set_start_points in index.cpp
Apr 11, 2023
f7fec45
Version for review
gopal-msr Apr 11, 2023
bb5cd79
Merge branch 'rakri_gopal/refactor' of https://github.com/microsoft/D…
gopal-msr Apr 11, 2023
921cec5
Incorporating Harsha's comments - 2
gopal-msr Apr 12, 2023
a310e44
move implementation of abstract data store methods to a cpp file
harsha-simhadri Apr 12, 2023
d751fa4
clang format
harsha-simhadri Apr 12, 2023
e7aa49b
clang format
harsha-simhadri Apr 12, 2023
e6e5fc9
Added slot manager file (empty) and fixed compile errors
gopal-msr Apr 13, 2023
b780905
fixed a linux compile error
Apr 14, 2023
23d5a10
clang
Apr 14, 2023
362e896
debugging workflow failure
Apr 16, 2023
85e008d
clang
Apr 16, 2023
0b8fc55
more debug
Apr 16, 2023
dc183e9
more debug
Apr 16, 2023
a191354
debug for workflow
Apr 16, 2023
6ea93ea
remove slot manager
harsha-simhadri Apr 17, 2023
1f3e88a
Merge branch 'main' into rakri_gopal/refactor
harsha-simhadri Apr 17, 2023
fc2e5ad
Removed the #ifdef WINDOWS directive from class definitions
gopal-msr Apr 18, 2023
8fd32d1
Incorporating changes from remote
gopal-msr Apr 18, 2023
e301aeb
Refactoring alignment factor into distance hierarchy
gopal-msr Apr 18, 2023
879a37e
Fixing cosine distance
gopal-msr Apr 18, 2023
7375708
Ensuring we call preprocess_query always
gopal-msr Apr 18, 2023
112d4fe
Fixed distance invocations
gopal-msr Apr 18, 2023
3f73901
fixed cosine bug, clang-formatted
Apr 18, 2023
ec90ca6
cleaned up and added comments
Apr 19, 2023
6025c33
clang-formatted
Apr 19, 2023
1c12790
more clang-format
Apr 19, 2023
a4a5cb5
clang-format 3
Apr 19, 2023
d9fc915
remove deleted code in scratch.cpp
harsha-simhadri Apr 19, 2023
4ce5abf
reverted clang to Microsoft
Apr 20, 2023
ca6080e
small change
Apr 20, 2023
f46f2bd
Removed slot_manager from this PR
gopal-msr Apr 20, 2023
96c5dc2
newline at EOF in_mem_Graph_store.cpp
harsha-simhadri Apr 19, 2023
d2f3bd7
rename distance_metric to distance_fn
harsha-simhadri Apr 23, 2023
4da65db
resolving PR comments
yashpatel007 Apr 24, 2023
c9db2de
minor bug fix for initialization
Apr 25, 2023
517377c
rebasing from main + build success fixed typecast warnings
yashpatel007 Apr 26, 2023
381e95a
clang format fix
yashpatel007 Apr 26, 2023
808c9ab
trying to resolve ubuntu build errors
yashpatel007 Apr 26, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/pr-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ jobs:
- name: test a streaming index
if: success() || failure()
run: |
${{ env.diskann_built_tests }}/test_streaming_scenario --data_type int8 --dist_fn l2 --data_path rand_int8_10D_10K_norm50.0.bin --index_path_prefix index_stream -R 64 -L 600 --alpha 1.2 --insert_threads 4 --consolidate_threads 4 --max_points_to_insert 10000 --active_window 4000 --consolidate_interval 2000 --start_point_norm 200
${{ env.diskann_built_tests }}/test_streaming_scenario --data_type int8 --dist_fn l2 --data_path ./rand_int8_10D_10K_norm50.0.bin --index_path_prefix index_stream -R 64 -L 600 --alpha 1.2 --insert_threads 4 --consolidate_threads 4 --max_points_to_insert 10000 --active_window 4000 --consolidate_interval 2000 --start_point_norm 200
${{ env.diskann_built_utils }}/compute_groundtruth --data_type int8 --dist_fn l2 --base_file index_stream.after-streaming-act4000-cons2000-max10000.data --query_file rand_int8_10D_1K_norm50.0.bin --K 100 --gt_file gt100_base-act4000-cons2000-max10000 --tags_file index_stream.after-streaming-act4000-cons2000-max10000.tags
${{ env.diskann_built_tests }}/search_memory_index --data_type int8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix index_stream.after-streaming-act4000-cons2000-max10000 --result_path res_stream --query_file ./rand_int8_10D_1K_norm50.0.bin --gt_file gt100_base-act4000-cons2000-max10000 -K 10 -L 20 40 60 80 100 -T 64 --dynamic true --tags 1

Expand Down
110 changes: 110 additions & 0 deletions include/abstract_data_store.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#pragma once

#include <vector>
#include <string>

#include "types.h"
#include "windows_customizations.h"

namespace diskann
{

template <typename data_t> class AbstractDataStore
{
public:
AbstractDataStore(const location_t capacity, const size_t dim);
yashpatel007 marked this conversation as resolved.
Show resolved Hide resolved

// Return number of points returned
virtual location_t load(const std::string &filename) = 0;

// Why does store take num_pts? Since store only has capacity, but we allow
// resizing we can end up in a situation where the store has spare capacity.
// To optimize disk utilization, we pass the number of points that are "true"
// points, so that the store can discard the empty locations before saving.
virtual size_t save(const std::string &filename, const location_t num_pts) = 0;

DISKANN_DLLEXPORT virtual location_t capacity() const;

DISKANN_DLLEXPORT virtual size_t get_dims() const;

// Implementers can choose to return _dim if they are not
// concerned about memory alignment.
// Some distance metrics (like l2) need data vectors to be aligned, so we
// align the dimension by padding zeros.
virtual size_t get_aligned_dim() const = 0;

// populate the store with vectors (either from a pointer or bin file),
// potentially after pre-processing the vectors if the metric deems so
// e.g., normalizing vectors for cosine distance over floating-point vectors
// useful for bulk or static index building.
virtual void populate_data(const data_t *vectors, const location_t num_pts) = 0;
virtual void populate_data(const std::string &filename, const size_t offset) = 0;

// save the first num_pts many vectors back to bin file
// note: cannot undo the pre-processing done in populate data
virtual void extract_data_to_bin(const std::string &filename, const location_t num_pts) = 0;

// Returns the updated capacity of the datastore. Clients should check
// if resize actually changed the capacity to new_num_points before
// proceeding with operations. See the code below:
// auto new_capcity = data_store->resize(new_num_points);
// if ( new_capacity >= new_num_points) {
// //PROCEED
// else
// //ERROR.
virtual location_t resize(const location_t new_num_points);

// operations on vectors
// like populate_data function, but over one vector at a time useful for
// streaming setting
virtual void get_vector(const location_t i, data_t *dest) const = 0;
virtual void set_vector(const location_t i, const data_t *const vector) = 0;
virtual void prefetch_vector(const location_t loc) = 0;

// internal shuffle operations to move around vectors
// will bulk-move all the vectors in [old_start_loc, old_start_loc +
// num_points) to [new_start_loc, new_start_loc + num_points) and set the old
// positions to zero vectors.
virtual void move_vectors(const location_t old_start_loc, const location_t new_start_loc,
const location_t num_points) = 0;

// same as above, without resetting the vectors in [from_loc, from_loc +
// num_points) to zero
virtual void copy_vectors(const location_t from_loc, const location_t to_loc, const location_t num_points) = 0;

// metric specific operations

virtual float get_distance(const data_t *query, const location_t loc) const = 0;
virtual void get_distance(const data_t *query, const location_t *locations, const uint32_t location_count,
float *distances) const = 0;
virtual float get_distance(const location_t loc1, const location_t loc2) const = 0;

// stats of the data stored in store
// Returns the point in the dataset that is closest to the mean of all points
// in the dataset
virtual location_t calculate_medoid() const = 0;

// search helpers
// if the base data is aligned per the request of the metric, this will tell
// how to align the query vector in a consistent manner
virtual size_t get_alignment_factor() const = 0;

protected:
// Expand the datastore to new_num_points. Returns the new capacity created,
// which should be == new_num_points in the normal case. Implementers can also
// return _capacity to indicate that there are not implementing this method.
virtual location_t expand(const location_t new_num_points) = 0;

// Shrink the datastore to new_num_points. It is NOT an error if shrink
// doesn't reduce the capacity so callers need to check this correctly. See
// also for "default" implementation
virtual location_t shrink(const location_t new_num_points) = 0;

location_t _capacity;
size_t _dim;
};

} // namespace diskann
31 changes: 31 additions & 0 deletions include/abstract_graph_store.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#pragma once

#include <string>
#include <vector>

#include "types.h"

namespace diskann
{

class AbstractGraphStore
{
public:
AbstractGraphStore(const size_t max_pts) : _capacity(max_pts)
{
}

virtual int load(const std::string &index_path_prefix) = 0;
virtual int store(const std::string &index_path_prefix) = 0;

virtual void get_adj_list(const location_t i, std::vector<location_t> &neighbors) = 0;
virtual void set_adj_list(const location_t i, std::vector<location_t> &neighbors) = 0;

private:
size_t _capacity;
};

} // namespace diskann
127 changes: 109 additions & 18 deletions include/distance.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once
#include "windows_customizations.h"
#include <cstring>

namespace diskann
{
Expand All @@ -14,40 +15,106 @@ enum Metric
template <typename T> class Distance
{
public:
virtual float compare(const T *a, const T *b, uint32_t length) const = 0;
virtual ~Distance()
DISKANN_DLLEXPORT Distance(diskann::Metric dist_metric) : _distance_metric(dist_metric)
{
}

// distance comparison function
DISKANN_DLLEXPORT virtual float compare(const T *a, const T *b, uint32_t length) const = 0;

// Needed only for COSINE-BYTE and INNER_PRODUCT-BYTE
DISKANN_DLLEXPORT virtual float compare(const T *a, const T *b, const float normA, const float normB,
uint32_t length) const;

// For MIPS, normalization adds an extra dimension to the vectors.
// This function lets callers know if the normalization process
// changes the dimension.
DISKANN_DLLEXPORT virtual uint32_t post_normalization_dimension(uint32_t orig_dimension) const;

DISKANN_DLLEXPORT virtual diskann::Metric get_metric() const;

// This is for efficiency. If no normalization is required, the callers
// can simply ignore the normalize_data_for_build() function.
DISKANN_DLLEXPORT virtual bool preprocessing_required() const;

// Check the preprocessing_required() function before calling this.
// Clients can call the function like this:
//
// if (metric->preprocessing_required()){
// T* normalized_data_batch;
// Split data into batches of batch_size and for each, call:
// metric->preprocess_base_points(data_batch, batch_size);
//
// TODO: This does not take into account the case for SSD inner product
// where the dimensions change after normalization.
DISKANN_DLLEXPORT virtual void preprocess_base_points(T *original_data, const size_t orig_dim,
const size_t num_points);

// Invokes normalization for a single vector during search. The scratch space
// has to be created by the caller keeping track of the fact that
// normalization might change the dimension of the query vector.
DISKANN_DLLEXPORT virtual void preprocess_query(const T *query_vec, const size_t query_dim, T *scratch_query);

// If an algorithm has a requirement that some data be aligned to a certain
// boundary it can use this function to indicate that requirement. Currently,
// we are setting it to 8 because that works well for AVX2. If we have AVX512
// implementations of distance algos, they might have to set this to 16
// (depending on how they are implemented)
DISKANN_DLLEXPORT virtual size_t get_required_alignment() const;

// Providing a default implementation for the virtual destructor because we
// don't expect most metric implementations to need it.
DISKANN_DLLEXPORT virtual ~Distance();

protected:
diskann::Metric _distance_metric;
size_t _alignment_factor = 8;
};

class DistanceCosineInt8 : public Distance<int8_t>
{
public:
DistanceCosineInt8() : Distance<int8_t>(diskann::Metric::COSINE)
{
}
DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b, uint32_t length) const;
};

class DistanceL2Int8 : public Distance<int8_t>
{
public:
DistanceL2Int8() : Distance<int8_t>(diskann::Metric::L2)
{
}
DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b, uint32_t size) const;
};

// AVX implementations. Borrowed from HNSW code.
class AVXDistanceL2Int8 : public Distance<int8_t>
{
public:
AVXDistanceL2Int8() : Distance<int8_t>(diskann::Metric::L2)
{
}
DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b, uint32_t length) const;
};

class DistanceCosineFloat : public Distance<float>
{
public:
DistanceCosineFloat() : Distance<float>(diskann::Metric::COSINE)
{
}
DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const;
};

class DistanceL2Float : public Distance<float>
{
public:
DistanceL2Float() : Distance<float>(diskann::Metric::L2)
{
}

#ifdef _WINDOWS
DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t size) const;
#else
Expand All @@ -58,46 +125,49 @@ class DistanceL2Float : public Distance<float>
class AVXDistanceL2Float : public Distance<float>
{
public:
AVXDistanceL2Float() : Distance<float>(diskann::Metric::L2)
{
}
DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const;
};

class SlowDistanceL2Float : public Distance<float>
template <typename T> class SlowDistanceL2 : public Distance<T>
{
public:
DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const;
SlowDistanceL2() : Distance<T>(diskann::Metric::L2)
{
}
DISKANN_DLLEXPORT virtual float compare(const T *a, const T *b, uint32_t length) const;
};

class SlowDistanceCosineUInt8 : public Distance<uint8_t>
{
public:
SlowDistanceCosineUInt8() : Distance<uint8_t>(diskann::Metric::COSINE)
{
}
DISKANN_DLLEXPORT virtual float compare(const uint8_t *a, const uint8_t *b, uint32_t length) const;
};

class DistanceL2UInt8 : public Distance<uint8_t>
{
public:
DistanceL2UInt8() : Distance<uint8_t>(diskann::Metric::L2)
{
harsha-simhadri marked this conversation as resolved.
Show resolved Hide resolved
}
DISKANN_DLLEXPORT virtual float compare(const uint8_t *a, const uint8_t *b, uint32_t size) const;
};

// Simple implementations for non-AVX machines. Compiler can optimize.
template <typename T> class SlowDistanceL2Int : public Distance<T>
template <typename T> class DistanceInnerProduct : public Distance<T>
{
public:
// Implementing here because this is a template function
DISKANN_DLLEXPORT virtual float compare(const T *a, const T *b, uint32_t length) const
DistanceInnerProduct() : Distance<T>(diskann::Metric::INNER_PRODUCT)
{
uint32_t result = 0;
for (uint32_t i = 0; i < length; i++)
{
result += ((int32_t)((int16_t)a[i] - (int16_t)b[i])) * ((int32_t)((int16_t)a[i] - (int16_t)b[i]));
}
return (float)result;
}
};

template <typename T> class DistanceInnerProduct : public Distance<T>
{
public:
DistanceInnerProduct(diskann::Metric metric) : Distance<T>(metric)
{
}
inline float inner_product(const T *a, const T *b, unsigned size) const;

inline float compare(const T *a, const T *b, unsigned size) const
Expand All @@ -115,13 +185,19 @@ template <typename T> class DistanceFastL2 : public DistanceInnerProduct<T>
// currently defined only for float.
// templated for future use.
public:
DistanceFastL2() : DistanceInnerProduct<T>(diskann::Metric::FAST_L2)
{
}
float norm(const T *a, unsigned size) const;
float compare(const T *a, const T *b, float norm, unsigned size) const;
};

class AVXDistanceInnerProductFloat : public Distance<float>
{
public:
AVXDistanceInnerProductFloat() : Distance<float>(diskann::Metric::INNER_PRODUCT)
{
}
DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const;
};

Expand All @@ -130,13 +206,28 @@ class AVXNormalizedCosineDistanceFloat : public Distance<float>
private:
AVXDistanceInnerProductFloat _innerProduct;

protected:
void normalize_and_copy(const float *a, uint32_t length, float *a_norm) const;

public:
AVXNormalizedCosineDistanceFloat() : Distance<float>(diskann::Metric::COSINE)
{
}
DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const
{
// Inner product returns negative values to indicate distance.
// This will ensure that cosine is between -1 and 1.
return 1.0f + _innerProduct.compare(a, b, length);
}
DISKANN_DLLEXPORT virtual uint32_t post_normalization_dimension(uint32_t orig_dimension) const override;

DISKANN_DLLEXPORT virtual bool preprocessing_required() const;

DISKANN_DLLEXPORT virtual void preprocess_base_points(float *original_data, const size_t orig_dim,
const size_t num_points) override;

DISKANN_DLLEXPORT virtual void preprocess_query(const float *query_vec, const size_t query_dim,
float *scratch_query_vector) override;
};

template <typename T> Distance<T> *get_distance_function(Metric m);
Expand Down
Loading