microsoft · harsha-simhadri · Apr 26, 2023 · Mar 28, 2023 · Mar 28, 2023 · Mar 31, 2023
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
@@ -172,7 +172,7 @@ jobs:
     - name:  test a streaming index
       if: success() || failure()
       run: |
-        ${{ env.diskann_built_tests }}/test_streaming_scenario --data_type int8 --dist_fn l2 --data_path rand_int8_10D_10K_norm50.0.bin --index_path_prefix index_stream -R 64 -L 600 --alpha 1.2 --insert_threads 4 --consolidate_threads 4 --max_points_to_insert 10000 --active_window 4000 --consolidate_interval 2000 --start_point_norm 200
+        ${{ env.diskann_built_tests }}/test_streaming_scenario --data_type int8 --dist_fn l2 --data_path ./rand_int8_10D_10K_norm50.0.bin --index_path_prefix index_stream -R 64 -L 600 --alpha 1.2 --insert_threads 4 --consolidate_threads 4 --max_points_to_insert 10000 --active_window 4000 --consolidate_interval 2000 --start_point_norm 200
         ${{ env.diskann_built_utils }}/compute_groundtruth --data_type int8 --dist_fn l2 --base_file index_stream.after-streaming-act4000-cons2000-max10000.data --query_file rand_int8_10D_1K_norm50.0.bin --K 100 --gt_file gt100_base-act4000-cons2000-max10000 --tags_file index_stream.after-streaming-act4000-cons2000-max10000.tags
         ${{ env.diskann_built_tests }}/search_memory_index --data_type int8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix index_stream.after-streaming-act4000-cons2000-max10000 --result_path res_stream --query_file ./rand_int8_10D_1K_norm50.0.bin --gt_file gt100_base-act4000-cons2000-max10000 -K 10 -L 20 40 60 80 100 -T 64 --dynamic true --tags 1
 

diff --git a/include/abstract_data_store.h b/include/abstract_data_store.h
@@ -0,0 +1,110 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+#pragma once
+
+#include <vector>
+#include <string>
+
+#include "types.h"
+#include "windows_customizations.h"
+
+namespace diskann
+{
+
+template <typename data_t> class AbstractDataStore
+{
+  public:
+    AbstractDataStore(const location_t capacity, const size_t dim);
+
+    // Returns the number of points loaded from the file
+    virtual location_t load(const std::string &filename) = 0;
+
+    // Why does save take num_pts? The store only tracks its capacity, and since
+    // we allow resizing we can end up in a situation where it has spare capacity.
+    // To optimize disk utilization, we pass the number of points that are "true"
+    // points, so that the store can discard the empty locations before saving.
+    virtual size_t save(const std::string &filename, const location_t num_pts) = 0;
+
+    DISKANN_DLLEXPORT virtual location_t capacity() const;
+
+    DISKANN_DLLEXPORT virtual size_t get_dims() const;
+
+    // Implementers can choose to return _dim if they are not
+    // concerned about memory alignment.
+    // Some distance metrics (like l2) need data vectors to be aligned, so we
+    // align the dimension by padding zeros.
+    virtual size_t get_aligned_dim() const = 0;
+
+    // populate the store with vectors (either from a pointer or bin file),
+    // potentially after pre-processing the vectors if the metric requires it,
+    // e.g., normalizing vectors for cosine distance over floating-point vectors.
+    // Useful for bulk or static index building.
+    virtual void populate_data(const data_t *vectors, const location_t num_pts) = 0;
+    virtual void populate_data(const std::string &filename, const size_t offset) = 0;
+
+    // save the first num_pts many vectors back to bin file
+    // note: cannot undo the pre-processing done in populate data
+    virtual void extract_data_to_bin(const std::string &filename, const location_t num_pts) = 0;
+
+    // Returns the updated capacity of the datastore. Clients should check
+    // if resize actually changed the capacity to new_num_points before
+    // proceeding with operations. See the code below:
+    //  auto new_capacity = data_store->resize(new_num_points);
+    //  if (new_capacity >= new_num_points)
+    //    // PROCEED
+    //  else
+    //    // ERROR
+    virtual location_t resize(const location_t new_num_points);
+
+    // operations on vectors
+    // like the populate_data functions, but over one vector at a time; useful
+    // for the streaming setting
+    virtual void get_vector(const location_t i, data_t *dest) const = 0;
+    virtual void set_vector(const location_t i, const data_t *const vector) = 0;
+    virtual void prefetch_vector(const location_t loc) = 0;
+
+    // internal shuffle operations to move around vectors
+    // will bulk-move all the vectors in [old_start_loc, old_start_loc +
+    // num_points) to [new_start_loc, new_start_loc + num_points) and set the old
+    // positions to zero vectors.
+    virtual void move_vectors(const location_t old_start_loc, const location_t new_start_loc,
+                              const location_t num_points) = 0;
+
+    // same as above, without resetting the vectors in [from_loc, from_loc +
+    // num_points) to zero
+    virtual void copy_vectors(const location_t from_loc, const location_t to_loc, const location_t num_points) = 0;
+
+    // metric specific operations
+
+    virtual float get_distance(const data_t *query, const location_t loc) const = 0;
+    virtual void get_distance(const data_t *query, const location_t *locations, const uint32_t location_count,
+                              float *distances) const = 0;
+    virtual float get_distance(const location_t loc1, const location_t loc2) const = 0;
+
+    // stats of the data stored in store
+    // Returns the point in the dataset that is closest to the mean of all points
+    // in the dataset
+    virtual location_t calculate_medoid() const = 0;
+
+    // search helpers
+    // if the base data is aligned per the request of the metric, this will tell
+    // how to align the query vector in a consistent manner
+    virtual size_t get_alignment_factor() const = 0;
+
+  protected:
+    // Expand the datastore to new_num_points. Returns the new capacity created,
+    // which should be == new_num_points in the normal case. Implementers can also
+    // return _capacity to indicate that they are not implementing this method.
+    virtual location_t expand(const location_t new_num_points) = 0;
+
+    // Shrink the datastore to new_num_points. It is NOT an error if shrink
+    // doesn't reduce the capacity, so callers need to check for this. See
+    // also the "default" implementation.
+    virtual location_t shrink(const location_t new_num_points) = 0;
+
+    location_t _capacity;
+    size_t _dim;
+};
+
+} // namespace diskann
diff --git a/include/abstract_graph_store.h b/include/abstract_graph_store.h
@@ -0,0 +1,31 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "types.h"
+
+namespace diskann
+{
+
+class AbstractGraphStore
+{
+  public:
+    AbstractGraphStore(const size_t max_pts) : _capacity(max_pts)
+    {
+    }
+
+    virtual int load(const std::string &index_path_prefix) = 0;
+    virtual int store(const std::string &index_path_prefix) = 0;
+
+    virtual void get_adj_list(const location_t i, std::vector<location_t> &neighbors) = 0;
+    virtual void set_adj_list(const location_t i, std::vector<location_t> &neighbors) = 0;
+
+  private:
+    size_t _capacity;
+};
+
+} // namespace diskann
diff --git a/include/distance.h b/include/distance.h
@@ -1,5 +1,6 @@
 #pragma once
 #include "windows_customizations.h"
+#include <cstring>
 
 namespace diskann
 {
@@ -14,40 +15,106 @@ enum Metric
 template <typename T> class Distance
 {
   public:
-    virtual float compare(const T *a, const T *b, uint32_t length) const = 0;
-    virtual ~Distance()
+    DISKANN_DLLEXPORT Distance(diskann::Metric dist_metric) : _distance_metric(dist_metric)
     {
     }
+
+    // distance comparison function
+    DISKANN_DLLEXPORT virtual float compare(const T *a, const T *b, uint32_t length) const = 0;
+
+    // Needed only for COSINE-BYTE and INNER_PRODUCT-BYTE
+    DISKANN_DLLEXPORT virtual float compare(const T *a, const T *b, const float normA, const float normB,
+                                            uint32_t length) const;
+
+    // For MIPS, normalization adds an extra dimension to the vectors.
+    // This function lets callers know if the normalization process
+    // changes the dimension.
+    DISKANN_DLLEXPORT virtual uint32_t post_normalization_dimension(uint32_t orig_dimension) const;
+
+    DISKANN_DLLEXPORT virtual diskann::Metric get_metric() const;
+
+    // This is for efficiency. If no preprocessing is required, the callers
+    // can simply skip the preprocess_base_points() function.
+    DISKANN_DLLEXPORT virtual bool preprocessing_required() const;
+
+    // Check the preprocessing_required() function before calling this.
+    // Clients can call the function like this:
+    //
+    //  if (metric->preprocessing_required()) {
+    //      T *normalized_data_batch;
+    //      // Split data into batches of batch_size and for each, call:
+    //      metric->preprocess_base_points(data_batch, orig_dim, batch_size);
+    //  }
+    //  TODO: This does not take into account the case for SSD inner product
+    //  where the dimensions change after normalization.
+    DISKANN_DLLEXPORT virtual void preprocess_base_points(T *original_data, const size_t orig_dim,
+                                                          const size_t num_points);
+
+    // Invokes normalization for a single vector during search. The scratch space
+    // has to be created by the caller keeping track of the fact that
+    // normalization might change the dimension of the query vector.
+    DISKANN_DLLEXPORT virtual void preprocess_query(const T *query_vec, const size_t query_dim, T *scratch_query);
+
+    // If an algorithm has a requirement that some data be aligned to a certain
+    // boundary it can use this function to indicate that requirement. Currently,
+    // we are setting it to 8 because that works well for AVX2. If we have AVX512
+    // implementations of distance algos, they might have to set this to 16
+    // (depending on how they are implemented)
+    DISKANN_DLLEXPORT virtual size_t get_required_alignment() const;
+
+    // Providing a default implementation for the virtual destructor because we
+    // don't expect most metric implementations to need it.
+    DISKANN_DLLEXPORT virtual ~Distance();
+
+  protected:
+    diskann::Metric _distance_metric;
+    size_t _alignment_factor = 8;
 };
 
 class DistanceCosineInt8 : public Distance<int8_t>
 {
   public:
+    DistanceCosineInt8() : Distance<int8_t>(diskann::Metric::COSINE)
+    {
+    }
     DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b, uint32_t length) const;
 };
 
 class DistanceL2Int8 : public Distance<int8_t>
 {
   public:
+    DistanceL2Int8() : Distance<int8_t>(diskann::Metric::L2)
+    {
+    }
     DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b, uint32_t size) const;
 };
 
 // AVX implementations. Borrowed from HNSW code.
 class AVXDistanceL2Int8 : public Distance<int8_t>
 {
   public:
+    AVXDistanceL2Int8() : Distance<int8_t>(diskann::Metric::L2)
+    {
+    }
     DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b, uint32_t length) const;
 };
 
 class DistanceCosineFloat : public Distance<float>
 {
   public:
+    DistanceCosineFloat() : Distance<float>(diskann::Metric::COSINE)
+    {
+    }
     DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const;
 };
 
 class DistanceL2Float : public Distance<float>
 {
   public:
+    DistanceL2Float() : Distance<float>(diskann::Metric::L2)
+    {
+    }
+
 #ifdef _WINDOWS
     DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t size) const;
 #else
@@ -58,46 +125,49 @@ class DistanceL2Float : public Distance<float>
 class AVXDistanceL2Float : public Distance<float>
 {
   public:
+    AVXDistanceL2Float() : Distance<float>(diskann::Metric::L2)
+    {
+    }
     DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const;
 };
 
-class SlowDistanceL2Float : public Distance<float>
+template <typename T> class SlowDistanceL2 : public Distance<T>
 {
   public:
-    DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const;
+    SlowDistanceL2() : Distance<T>(diskann::Metric::L2)
+    {
+    }
+    DISKANN_DLLEXPORT virtual float compare(const T *a, const T *b, uint32_t length) const;
 };
 
 class SlowDistanceCosineUInt8 : public Distance<uint8_t>
 {
   public:
+    SlowDistanceCosineUInt8() : Distance<uint8_t>(diskann::Metric::COSINE)
+    {
+    }
     DISKANN_DLLEXPORT virtual float compare(const uint8_t *a, const uint8_t *b, uint32_t length) const;
 };
 
 class DistanceL2UInt8 : public Distance<uint8_t>
 {
   public:
+    DistanceL2UInt8() : Distance<uint8_t>(diskann::Metric::L2)
+    {
+    }
     DISKANN_DLLEXPORT virtual float compare(const uint8_t *a, const uint8_t *b, uint32_t size) const;
 };
 
-// Simple implementations for non-AVX machines. Compiler can optimize.
-template <typename T> class SlowDistanceL2Int : public Distance<T>
+template <typename T> class DistanceInnerProduct : public Distance<T>
 {
   public:
-    // Implementing here because this is a template function
-    DISKANN_DLLEXPORT virtual float compare(const T *a, const T *b, uint32_t length) const
+    DistanceInnerProduct() : Distance<T>(diskann::Metric::INNER_PRODUCT)
     {
-        uint32_t result = 0;
-        for (uint32_t i = 0; i < length; i++)
-        {
-            result += ((int32_t)((int16_t)a[i] - (int16_t)b[i])) * ((int32_t)((int16_t)a[i] - (int16_t)b[i]));
-        }
-        return (float)result;
     }
-};
 
-template <typename T> class DistanceInnerProduct : public Distance<T>
-{
-  public:
+    DistanceInnerProduct(diskann::Metric metric) : Distance<T>(metric)
+    {
+    }
     inline float inner_product(const T *a, const T *b, unsigned size) const;
 
     inline float compare(const T *a, const T *b, unsigned size) const
@@ -115,13 +185,19 @@ template <typename T> class DistanceFastL2 : public DistanceInnerProduct<T>
     // currently defined only for float.
     // templated for future use.
   public:
+    DistanceFastL2() : DistanceInnerProduct<T>(diskann::Metric::FAST_L2)
+    {
+    }
     float norm(const T *a, unsigned size) const;
     float compare(const T *a, const T *b, float norm, unsigned size) const;
 };
 
 class AVXDistanceInnerProductFloat : public Distance<float>
 {
   public:
+    AVXDistanceInnerProductFloat() : Distance<float>(diskann::Metric::INNER_PRODUCT)
+    {
+    }
     DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const;
 };
 
@@ -130,13 +206,28 @@ class AVXNormalizedCosineDistanceFloat : public Distance<float>
   private:
     AVXDistanceInnerProductFloat _innerProduct;
 
+  protected:
+    void normalize_and_copy(const float *a, uint32_t length, float *a_norm) const;
+
   public:
+    AVXNormalizedCosineDistanceFloat() : Distance<float>(diskann::Metric::COSINE)
+    {
+    }
     DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const
     {
         // Inner product returns negative values to indicate distance.
         // This will ensure that cosine is between -1 and 1.
         return 1.0f + _innerProduct.compare(a, b, length);
     }
+    DISKANN_DLLEXPORT virtual uint32_t post_normalization_dimension(uint32_t orig_dimension) const override;
+
+    DISKANN_DLLEXPORT virtual bool preprocessing_required() const;
+
+    DISKANN_DLLEXPORT virtual void preprocess_base_points(float *original_data, const size_t orig_dim,
+                                                          const size_t num_points) override;
+
+    DISKANN_DLLEXPORT virtual void preprocess_query(const float *query_vec, const size_t query_dim,
+                                                    float *scratch_query_vector) override;
 };
 
 template <typename T> Distance<T> *get_distance_function(Metric m);