From 973a2228f73b2da090ae1321a7094cd9166f3ae8 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Thu, 6 Jun 2024 10:24:10 -0700 Subject: [PATCH 1/5] ss --- rust/lance-index/src/lib.rs | 6 ++++ rust/lance-index/src/vector.rs | 8 +++-- rust/lance-index/src/vector/flat/index.rs | 6 ++-- rust/lance-index/src/vector/flat/storage.rs | 2 +- rust/lance-index/src/vector/graph.rs | 2 +- rust/lance-index/src/vector/hnsw.rs | 2 +- rust/lance-index/src/vector/hnsw/builder.rs | 2 +- rust/lance-index/src/vector/pq/storage.rs | 2 +- rust/lance-index/src/vector/quantizer.rs | 31 +++++++++++++------ rust/lance-index/src/vector/sq/storage.rs | 2 +- .../src/vector/{v3 => }/storage.rs | 2 ++ rust/lance-index/src/vector/v3.rs | 3 -- rust/lance-index/src/vector/v3/subindex.rs | 3 +- rust/lance/src/index/vector/builder.rs | 4 +-- rust/lance/src/index/vector/hnsw.rs | 2 +- rust/lance/src/index/vector/ivf/io.rs | 2 +- rust/lance/src/index/vector/ivf/v2.rs | 6 ++-- 17 files changed, 49 insertions(+), 36 deletions(-) rename rust/lance-index/src/vector/{v3 => }/storage.rs (98%) diff --git a/rust/lance-index/src/lib.rs b/rust/lance-index/src/lib.rs index c65e4d7136..95492daf5f 100644 --- a/rust/lance-index/src/lib.rs +++ b/rust/lance-index/src/lib.rs @@ -2,6 +2,12 @@ // SPDX-FileCopyrightText: Copyright The Lance Authors //! Lance secondary index library +//! +//!
+//! This is an internal crate used by the Lance project. +//!
+//! API stability is not guaranteed. +//!
#![cfg_attr( all(feature = "nightly", target_arch = "x86_64"), diff --git a/rust/lance-index/src/vector.rs b/rust/lance-index/src/vector.rs index 1aadd542a0..e8919cdf1a 100644 --- a/rust/lance-index/src/vector.rs +++ b/rust/lance-index/src/vector.rs @@ -22,6 +22,7 @@ pub mod pq; pub mod quantizer; pub mod residual; pub mod sq; +pub mod storage; pub mod transform; pub mod utils; pub mod v3; @@ -31,11 +32,12 @@ use crate::{prefilter::PreFilter, Index}; pub use residual::RESIDUAL_COLUMN; // TODO: Make these crate private once the migration from lance to lance-index is done. +pub const DIST_COL: &str = "_distance"; +pub const DISTANCE_TYPE_KEY: &str = "distance_type"; +pub const INDEX_UUID_COLUMN: &str = "__index_uuid"; +pub const PART_ID_COLUMN: &str = "__ivf_part_id"; pub const PQ_CODE_COLUMN: &str = "__pq_code"; pub const SQ_CODE_COLUMN: &str = "__sq_code"; -pub const PART_ID_COLUMN: &str = "__ivf_part_id"; -pub const INDEX_UUID_COLUMN: &str = "__index_uuid"; -pub const DIST_COL: &str = "_distance"; /// Query parameters for the vector indices #[derive(Debug, Clone)] diff --git a/rust/lance-index/src/vector/flat/index.rs b/rust/lance-index/src/vector/flat/index.rs index c0c1a88e5c..f9a3900719 100644 --- a/rust/lance-index/src/vector/flat/index.rs +++ b/rust/lance-index/src/vector/flat/index.rs @@ -20,10 +20,8 @@ use crate::{ vector::{ graph::{OrderedFloat, OrderedNode}, quantizer::{Quantization, QuantizationType, Quantizer, QuantizerMetadata}, - v3::{ - storage::{DistCalculator, VectorStore}, - subindex::IvfSubIndex, - }, + storage::{DistCalculator, VectorStore}, + v3::subindex::IvfSubIndex, Query, DIST_COL, }, }; diff --git a/rust/lance-index/src/vector/flat/storage.rs b/rust/lance-index/src/vector/flat/storage.rs index 98acc7c7c2..d364765fda 100644 --- a/rust/lance-index/src/vector/flat/storage.rs +++ b/rust/lance-index/src/vector/flat/storage.rs @@ -6,8 +6,8 @@ use std::sync::Arc; use crate::vector::quantizer::QuantizerStorage; +use 
crate::vector::storage::{DistCalculator, VectorStore}; use crate::vector::utils::prefetch_arrow_array; -use crate::vector::v3::storage::{DistCalculator, VectorStore}; use arrow::array::AsArray; use arrow::compute::concat_batches; use arrow::datatypes::UInt64Type; diff --git a/rust/lance-index/src/vector/graph.rs b/rust/lance-index/src/vector/graph.rs index d2fd8e1345..c0401e6561 100644 --- a/rust/lance-index/src/vector/graph.rs +++ b/rust/lance-index/src/vector/graph.rs @@ -17,7 +17,7 @@ pub mod builder; use crate::vector::DIST_COL; -use crate::vector::v3::storage::DistCalculator; +use crate::vector::storage::DistCalculator; pub(crate) const NEIGHBORS_COL: &str = "__neighbors"; diff --git a/rust/lance-index/src/vector/hnsw.rs b/rust/lance-index/src/vector/hnsw.rs index be5d5a80e2..9a67f8ecc7 100644 --- a/rust/lance-index/src/vector/hnsw.rs +++ b/rust/lance-index/src/vector/hnsw.rs @@ -16,7 +16,7 @@ use serde::{Deserialize, Serialize}; use self::builder::HnswBuildParams; use super::graph::{OrderedFloat, OrderedNode}; -use super::v3::storage::VectorStore; +use super::storage::VectorStore; const HNSW_TYPE: &str = "HNSW"; const VECTOR_ID_COL: &str = "__vector_id"; diff --git a/rust/lance-index/src/vector/hnsw/builder.rs b/rust/lance-index/src/vector/hnsw/builder.rs index 09c36c0cc1..aa9a97da26 100644 --- a/rust/lance-index/src/vector/hnsw/builder.rs +++ b/rust/lance-index/src/vector/hnsw/builder.rs @@ -37,7 +37,7 @@ use crate::vector::graph::greedy_search; use crate::vector::graph::{ Graph, OrderedFloat, OrderedNode, VisitedGenerator, DISTS_FIELD, NEIGHBORS_COL, NEIGHBORS_FIELD, }; -use crate::vector::v3::storage::{DistCalculator, VectorStore}; +use crate::vector::storage::{DistCalculator, VectorStore}; use crate::vector::DIST_COL; use crate::{IndexMetadata, INDEX_METADATA_SCHEMA_KEY}; diff --git a/rust/lance-index/src/vector/pq/storage.rs b/rust/lance-index/src/vector/pq/storage.rs index 6f268be569..fe92b22b53 100644 --- a/rust/lance-index/src/vector/pq/storage.rs +++ 
b/rust/lance-index/src/vector/pq/storage.rs @@ -38,8 +38,8 @@ use crate::{ ivf::storage::IvfData, pq::transform::PQTransformer, quantizer::{QuantizerMetadata, QuantizerStorage}, + storage::{DistCalculator, VectorStore}, transform::Transformer, - v3::storage::{DistCalculator, VectorStore}, PQ_CODE_COLUMN, }, IndexMetadata, INDEX_METADATA_SCHEMA_KEY, diff --git a/rust/lance-index/src/vector/quantizer.rs b/rust/lance-index/src/vector/quantizer.rs index 77804cb5b4..28a366fb22 100644 --- a/rust/lance-index/src/vector/quantizer.rs +++ b/rust/lance-index/src/vector/quantizer.rs @@ -12,7 +12,7 @@ use lance_arrow::ArrowFloatType; use lance_core::{Error, Result}; use lance_file::reader::FileReader; use lance_io::traits::Reader; -use lance_linalg::distance::{DistanceType, Dot, MetricType, L2}; +use lance_linalg::distance::{DistanceType, Dot, L2}; use lance_table::format::SelfDescribingFileReader; use serde::{Deserialize, Serialize}; use snafu::{location, Location}; @@ -33,7 +33,7 @@ use super::{ storage::{ScalarQuantizationMetadata, ScalarQuantizationStorage}, ScalarQuantizer, }, - v3::storage::VectorStore, + storage::VectorStore, }; use super::{PQ_CODE_COLUMN, SQ_CODE_COLUMN}; @@ -66,6 +66,11 @@ impl std::fmt::Display for QuantizationType { } } +/// Quantization Method. +/// +///
+/// Internal use only. End users should not use this directly. +///
#[derive(Debug, Clone, DeepSizeOf)] pub enum Quantizer { Flat(FlatQuantizer), @@ -157,7 +162,7 @@ pub trait QuantizerStorage: Clone + Sized + DeepSizeOf + VectorStore { async fn load_partition( reader: &FileReader, range: std::ops::Range, - metric_type: MetricType, + distance_type: DistanceType, metadata: &Self::Metadata, ) -> Result; } @@ -335,11 +340,11 @@ where } } -/// Loader to load partitioned PQ storage from disk. +/// Loader to load partitioned [VectorStore] from disk. pub struct IvfQuantizationStorage { reader: FileReader, - metric_type: MetricType, + distance_type: DistanceType, quantizer: Quantizer, metadata: Q::Metadata, @@ -359,7 +364,7 @@ impl Clone for IvfQuantizationStorage { fn clone(&self) -> Self { Self { reader: self.reader.clone(), - metric_type: self.metric_type, + distance_type: self.distance_type, quantizer: self.quantizer.clone(), metadata: self.metadata.clone(), ivf: self.ivf.clone(), @@ -391,15 +396,15 @@ impl IvfQuantizationStorage { message: format!("Failed to parse index metadata: {}", metadata_str), location: location!(), })?; - let metric_type: MetricType = MetricType::try_from(index_metadata.distance_type.as_str())?; + let distance_type = DistanceType::try_from(index_metadata.distance_type.as_str())?; let ivf_data = IvfData::load(&reader).await?; let metadata = Q::Metadata::load(&reader).await?; - let quantizer = Q::from_metadata(&metadata, metric_type)?; + let quantizer = Q::from_metadata(&metadata, distance_type)?; Ok(Self { reader, - metric_type, + distance_type, quantizer, metadata, ivf: ivf_data, @@ -419,8 +424,14 @@ impl IvfQuantizationStorage { self.ivf.num_partitions() } + /// Load one partition of vector storage. 
+ /// + /// # Parameters + /// - `part_id`, partition id + /// + /// pub async fn load_partition(&self, part_id: usize) -> Result { let range = self.ivf.row_range(part_id); - Q::Storage::load_partition(&self.reader, range, self.metric_type, &self.metadata).await + Q::Storage::load_partition(&self.reader, range, self.distance_type, &self.metadata).await } } diff --git a/rust/lance-index/src/vector/sq/storage.rs b/rust/lance-index/src/vector/sq/storage.rs index 08df2b444b..6f312a698f 100644 --- a/rust/lance-index/src/vector/sq/storage.rs +++ b/rust/lance-index/src/vector/sq/storage.rs @@ -24,8 +24,8 @@ use snafu::{location, Location}; use crate::{ vector::{ quantizer::{QuantizerMetadata, QuantizerStorage}, + storage::{DistCalculator, VectorStore}, transform::Transformer, - v3::storage::{DistCalculator, VectorStore}, SQ_CODE_COLUMN, }, IndexMetadata, INDEX_METADATA_SCHEMA_KEY, diff --git a/rust/lance-index/src/vector/v3/storage.rs b/rust/lance-index/src/vector/storage.rs similarity index 98% rename from rust/lance-index/src/vector/v3/storage.rs rename to rust/lance-index/src/vector/storage.rs index c84f39174b..59aaf4245d 100644 --- a/rust/lance-index/src/vector/v3/storage.rs +++ b/rust/lance-index/src/vector/storage.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +//! Vector Storage, holding (quantized) vectors and providing distance calculation. 
+ use std::{any::Any, sync::Arc}; use arrow::compute::concat_batches; diff --git a/rust/lance-index/src/vector/v3.rs b/rust/lance-index/src/vector/v3.rs index 89c7281124..b210e4a721 100644 --- a/rust/lance-index/src/vector/v3.rs +++ b/rust/lance-index/src/vector/v3.rs @@ -2,7 +2,4 @@ // SPDX-FileCopyrightText: Copyright The Lance Authors pub mod shuffler; -pub mod storage; pub mod subindex; - -pub const DISTANCE_TYPE_KEY: &str = "distance_type"; diff --git a/rust/lance-index/src/vector/v3/subindex.rs b/rust/lance-index/src/vector/v3/subindex.rs index d2c6d0fabc..ac05e8b0dc 100644 --- a/rust/lance-index/src/vector/v3/subindex.rs +++ b/rust/lance-index/src/vector/v3/subindex.rs @@ -7,10 +7,9 @@ use arrow_array::{ArrayRef, RecordBatch}; use deepsize::DeepSizeOf; use lance_core::Result; +use crate::vector::storage::VectorStore; use crate::{prefilter::PreFilter, vector::Query}; -use super::storage::VectorStore; - pub const SUB_INDEX_METADATA_KEY: &str = "sub_index_metadata"; /// A sub index for IVF index diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs index 466c521fc9..eee266e567 100644 --- a/rust/lance/src/index/vector/builder.rs +++ b/rust/lance/src/index/vector/builder.rs @@ -16,13 +16,13 @@ use lance_index::{ IvfBuildParams, }, quantizer::Quantization, + storage::{StorageBuilder, VectorStore}, transform::Transformer, v3::{ shuffler::{ShuffleReader, Shuffler}, - storage::{StorageBuilder, VectorStore}, subindex::IvfSubIndex, - DISTANCE_TYPE_KEY, }, + DISTANCE_TYPE_KEY, }, INDEX_AUXILIARY_FILE_NAME, INDEX_FILE_NAME, }; diff --git a/rust/lance/src/index/vector/hnsw.rs b/rust/lance/src/index/vector/hnsw.rs index 8872f410c8..12eb70df9d 100644 --- a/rust/lance/src/index/vector/hnsw.rs +++ b/rust/lance/src/index/vector/hnsw.rs @@ -20,7 +20,7 @@ use lance_index::{ hnsw::{HnswMetadata, VECTOR_ID_FIELD}, ivf::storage::IVF_PARTITION_KEY, quantizer::{IvfQuantizationStorage, Quantization}, - v3::storage::VectorStore, + 
storage::VectorStore, Query, }, Index, IndexType, diff --git a/rust/lance/src/index/vector/ivf/io.rs b/rust/lance/src/index/vector/ivf/io.rs index 9dc15ffee1..6c2d0ca317 100644 --- a/rust/lance/src/index/vector/ivf/io.rs +++ b/rust/lance/src/index/vector/ivf/io.rs @@ -27,7 +27,7 @@ use lance_index::vector::pq::ProductQuantizer; use lance_index::vector::{ quantizer::{Quantization, Quantizer}, sq::ScalarQuantizer, - v3::storage::VectorStore, + storage::VectorStore, }; use lance_index::vector::{PART_ID_COLUMN, PQ_CODE_COLUMN}; use lance_io::encodings::plain::PlainEncoder; diff --git a/rust/lance/src/index/vector/ivf/v2.rs b/rust/lance/src/index/vector/ivf/v2.rs index 8061703e0a..0dd5839cc8 100644 --- a/rust/lance/src/index/vector/ivf/v2.rs +++ b/rust/lance/src/index/vector/ivf/v2.rs @@ -24,10 +24,8 @@ use lance_file::v2::reader::FileReader; use lance_index::{ pb, vector::{ - ivf::storage::IVF_METADATA_KEY, - quantizer::Quantization, - v3::{storage::IvfQuantizationStorage, subindex::IvfSubIndex, DISTANCE_TYPE_KEY}, - Query, DIST_COL, + ivf::storage::IVF_METADATA_KEY, quantizer::Quantization, storage::IvfQuantizationStorage, + v3::subindex::IvfSubIndex, Query, DISTANCE_TYPE_KEY, DIST_COL, }, Index, IndexType, INDEX_AUXILIARY_FILE_NAME, INDEX_FILE_NAME, }; From 9ca9be9a711e9c34b20019648af0f56c809f303a Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Thu, 6 Jun 2024 10:35:18 -0700 Subject: [PATCH 2/5] clean up --- rust/lance-index/benches/sq.rs | 2 +- rust/lance-index/src/vector/hnsw/builder.rs | 10 +++++++++ rust/lance/src/index/vector/hnsw.rs | 2 -- rust/lance/src/index/vector/hnsw/builder.rs | 25 --------------------- rust/lance/src/index/vector/ivf/io.rs | 6 ++--- 5 files changed, 14 insertions(+), 31 deletions(-) delete mode 100644 rust/lance/src/index/vector/hnsw/builder.rs diff --git a/rust/lance-index/benches/sq.rs b/rust/lance-index/benches/sq.rs index 854f1e4dae..f19aad6094 100644 --- a/rust/lance-index/benches/sq.rs +++ b/rust/lance-index/benches/sq.rs @@ -11,7 
+11,7 @@ use criterion::{criterion_group, criterion_main, Criterion}; use lance_arrow::{FixedSizeListArrayExt, RecordBatchExt}; use lance_core::ROW_ID; use lance_index::vector::{ - sq::storage::ScalarQuantizationStorage, v3::storage::VectorStore, SQ_CODE_COLUMN, + sq::storage::ScalarQuantizationStorage, storage::VectorStore, SQ_CODE_COLUMN, }; use lance_linalg::distance::DistanceType; use lance_testing::datagen::generate_random_array; diff --git a/rust/lance-index/src/vector/hnsw/builder.rs b/rust/lance-index/src/vector/hnsw/builder.rs index aa9a97da26..ae43a5979a 100644 --- a/rust/lance-index/src/vector/hnsw/builder.rs +++ b/rust/lance-index/src/vector/hnsw/builder.rs @@ -32,6 +32,7 @@ use serde::{Deserialize, Serialize}; use super::super::graph::beam_search; use super::{select_neighbors_heuristic, HnswMetadata, HNSW_TYPE, VECTOR_ID_COL, VECTOR_ID_FIELD}; use crate::scalar::IndexWriter; +use crate::vector::flat::storage::FlatStorage; use crate::vector::graph::builder::GraphBuilderNode; use crate::vector::graph::greedy_search; use crate::vector::graph::{ @@ -103,6 +104,15 @@ impl HnswBuildParams { self.parallel_limit = Some(limit); self } + + pub async fn build(self, data: ArrayRef) -> Result { + // We have normalized the vectors if the metric type is cosine, so we can use the L2 distance + let vec_store = Arc::new(FlatStorage::new( + data.as_fixed_size_list().clone(), + DistanceType::L2, + )); + HNSW::build_with_storage(DistanceType::L2, self, vec_store).await + } } /// Build a HNSW graph. 
diff --git a/rust/lance/src/index/vector/hnsw.rs b/rust/lance/src/index/vector/hnsw.rs index 12eb70df9d..0cadb55c8d 100644 --- a/rust/lance/src/index/vector/hnsw.rs +++ b/rust/lance/src/index/vector/hnsw.rs @@ -37,8 +37,6 @@ use super::VectorIndex; use crate::index::prefilter::PreFilter; use crate::RESULT_SCHEMA; -pub mod builder; - #[derive(Clone, DeepSizeOf)] pub(crate) struct HNSWIndexOptions { pub use_residual: bool, diff --git a/rust/lance/src/index/vector/hnsw/builder.rs b/rust/lance/src/index/vector/hnsw/builder.rs deleted file mode 100644 index 6e80670687..0000000000 --- a/rust/lance/src/index/vector/hnsw/builder.rs +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -use std::sync::Arc; - -use arrow::array::AsArray; -use arrow_array::Array; -use lance_core::Result; -use lance_index::vector::{ - flat::storage::FlatStorage, - hnsw::builder::{HnswBuildParams, HNSW}, -}; -use lance_linalg::distance::DistanceType; - -pub async fn build_hnsw_model( - hnsw_params: HnswBuildParams, - vectors: Arc, -) -> Result { - // We have normalized the vectors if the metric type is cosine, so we can use the L2 distance - let vec_store = Arc::new(FlatStorage::new( - vectors.as_fixed_size_list().clone(), - DistanceType::L2, - )); - HNSW::build_with_storage(DistanceType::L2, hnsw_params, vec_store).await -} diff --git a/rust/lance/src/index/vector/ivf/io.rs b/rust/lance/src/index/vector/ivf/io.rs index 6c2d0ca317..c0fba46466 100644 --- a/rust/lance/src/index/vector/ivf/io.rs +++ b/rust/lance/src/index/vector/ivf/io.rs @@ -45,7 +45,7 @@ use tokio::sync::Semaphore; use super::{IVFIndex, Ivf}; use crate::index::vector::pq::{build_pq_storage, PQIndex}; -use crate::index::vector::{hnsw::builder::build_hnsw_model, sq::build_sq_storage}; +use crate::index::vector::sq::build_sq_storage; use crate::Result; use crate::{dataset::ROW_ID, Dataset}; @@ -511,11 +511,11 @@ async fn build_hnsw_quantization_partition( } 
async fn build_and_write_hnsw( - hnsw_params: HnswBuildParams, + params: HnswBuildParams, vectors: Arc, mut writer: FileWriter, ) -> Result { - let hnsw = build_hnsw_model(hnsw_params, vectors).await?; + let hnsw = params.build(vectors).await?; let length = hnsw.write(&mut writer).await?; Result::Ok(length) } From 6ed87bc22cc6479b3c96730518465f2f9aaae8d7 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Thu, 6 Jun 2024 10:49:14 -0700 Subject: [PATCH 3/5] move all hnsw to lance-vector --- rust/lance-index/src/vector.rs | 12 ++++++- rust/lance-index/src/vector/hnsw.rs | 9 +++-- .../src/vector/hnsw/index.rs} | 33 +++++++++---------- rust/lance/src/index/vector.rs | 7 ++-- rust/lance/src/index/vector/ivf.rs | 8 ++--- 5 files changed, 38 insertions(+), 31 deletions(-) rename rust/{lance/src/index/vector/hnsw.rs => lance-index/src/vector/hnsw/index.rs} (95%) diff --git a/rust/lance-index/src/vector.rs b/rust/lance-index/src/vector.rs index e8919cdf1a..8cf12f4180 100644 --- a/rust/lance-index/src/vector.rs +++ b/rust/lance-index/src/vector.rs @@ -7,10 +7,12 @@ use std::{collections::HashMap, sync::Arc}; use arrow_array::{ArrayRef, RecordBatch}; +use arrow_schema::Field; use async_trait::async_trait; -use lance_core::Result; +use lance_core::{Result, ROW_ID_FIELD}; use lance_io::traits::Reader; use lance_linalg::distance::DistanceType; +use lazy_static::lazy_static; pub mod bq; pub mod flat; @@ -39,6 +41,14 @@ pub const PART_ID_COLUMN: &str = "__ivf_part_id"; pub const PQ_CODE_COLUMN: &str = "__pq_code"; pub const SQ_CODE_COLUMN: &str = "__sq_code"; +lazy_static! 
{ + pub static ref VECTOR_RESULT_SCHEMA: arrow_schema::SchemaRef = + arrow_schema::SchemaRef::new(arrow_schema::Schema::new(vec![ + ROW_ID_FIELD.clone(), + Field::new(DIST_COL, arrow_schema::DataType::Float32, false), + ])); +} + /// Query parameters for the vector indices #[derive(Debug, Clone)] pub struct Query { diff --git a/rust/lance-index/src/vector/hnsw.rs b/rust/lance-index/src/vector/hnsw.rs index 9a67f8ecc7..278db09def 100644 --- a/rust/lance-index/src/vector/hnsw.rs +++ b/rust/lance-index/src/vector/hnsw.rs @@ -6,18 +6,21 @@ //! Hierarchical Navigable Small World (HNSW). //! -pub mod builder; use arrow_schema::{DataType, Field}; -pub use builder::HNSW; use deepsize::DeepSizeOf; use itertools::Itertools; use serde::{Deserialize, Serialize}; use self::builder::HnswBuildParams; - use super::graph::{OrderedFloat, OrderedNode}; use super::storage::VectorStore; +pub mod builder; +pub mod index; + +pub use builder::HNSW; +pub use index::HNSWIndex; + const HNSW_TYPE: &str = "HNSW"; const VECTOR_ID_COL: &str = "__vector_id"; const POINTER_COL: &str = "__pointer"; diff --git a/rust/lance/src/index/vector/hnsw.rs b/rust/lance-index/src/vector/hnsw/index.rs similarity index 95% rename from rust/lance/src/index/vector/hnsw.rs rename to rust/lance-index/src/vector/hnsw/index.rs index 0cadb55c8d..453e3469e8 100644 --- a/rust/lance/src/index/vector/hnsw.rs +++ b/rust/lance-index/src/vector/hnsw/index.rs @@ -13,18 +13,6 @@ use async_trait::async_trait; use deepsize::DeepSizeOf; use lance_core::{datatypes::Schema, Error, Result}; use lance_file::reader::FileReader; -use lance_index::vector::{hnsw::HNSW, quantizer::Quantizer}; -use lance_index::{ - vector::{ - graph::NEIGHBORS_FIELD, - hnsw::{HnswMetadata, VECTOR_ID_FIELD}, - ivf::storage::IVF_PARTITION_KEY, - quantizer::{IvfQuantizationStorage, Quantization}, - storage::VectorStore, - Query, - }, - Index, IndexType, -}; use lance_io::traits::Reader; use lance_linalg::distance::DistanceType; use 
lance_table::format::SelfDescribingFileReader; @@ -33,17 +21,26 @@ use serde_json::json; use snafu::{location, Location}; use tracing::instrument; -use super::VectorIndex; -use crate::index::prefilter::PreFilter; -use crate::RESULT_SCHEMA; +use crate::prefilter::PreFilter; +use crate::{ + vector::{ + graph::NEIGHBORS_FIELD, + hnsw::{HnswMetadata, HNSW, VECTOR_ID_FIELD}, + ivf::storage::IVF_PARTITION_KEY, + quantizer::{IvfQuantizationStorage, Quantization, Quantizer}, + storage::VectorStore, + Query, VectorIndex, VECTOR_RESULT_SCHEMA, + }, + Index, IndexType, +}; #[derive(Clone, DeepSizeOf)] -pub(crate) struct HNSWIndexOptions { +pub struct HNSWIndexOptions { pub use_residual: bool, } #[derive(Clone, DeepSizeOf)] -pub(crate) struct HNSWIndex { +pub struct HNSWIndex { distance_type: DistanceType, // Some(T) if the index is loaded, None otherwise @@ -148,7 +145,7 @@ impl Index for HNSWIndex { impl VectorIndex for HNSWIndex { #[instrument(level = "debug", skip_all, name = "HNSWIndex::search")] async fn search(&self, query: &Query, pre_filter: Arc) -> Result { - let schema = RESULT_SCHEMA.clone(); + let schema = VECTOR_RESULT_SCHEMA.clone(); let hnsw = self.hnsw.as_ref().ok_or(Error::Index { message: "HNSW index not loaded".to_string(), diff --git a/rust/lance/src/index/vector.rs b/rust/lance/src/index/vector.rs index fcab6007a4..d7db02602c 100644 --- a/rust/lance/src/index/vector.rs +++ b/rust/lance/src/index/vector.rs @@ -8,7 +8,6 @@ use std::sync::Arc; use std::{any::Any, collections::HashMap}; pub mod builder; -pub mod hnsw; pub mod ivf; pub mod pq; pub mod sq; @@ -26,7 +25,10 @@ use lance_index::vector::ivf::storage::IvfData; use lance_index::vector::pq::ProductQuantizerImpl; use lance_index::vector::v3::shuffler::IvfShuffler; use lance_index::vector::{ - hnsw::builder::HnswBuildParams, + hnsw::{ + builder::HnswBuildParams, + index::{HNSWIndex, HNSWIndexOptions}, + }, ivf::IvfBuildParams, pq::PQBuildParams, sq::{builder::SQBuildParams, ScalarQuantizer}, @@ -41,7 
+43,6 @@ use tracing::instrument; use utils::get_vector_dim; use uuid::Uuid; -use self::hnsw::{HNSWIndex, HNSWIndexOptions}; use self::{ivf::*, pq::PQIndex}; use super::{pb, DatasetIndexInternalExt, IndexParams}; diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index 11ca36721c..1693e64759 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -30,14 +30,10 @@ use lance_file::{ format::MAGIC, writer::{FileWriter, FileWriterOptions}, }; -use lance_index::vector::{ - hnsw::HNSW, - quantizer::{Quantization, QuantizationMetadata, Quantizer}, -}; use lance_index::{ optimize::OptimizeOptions, vector::{ - hnsw::builder::HnswBuildParams, + hnsw::{builder::HnswBuildParams, HNSWIndex, HNSW}, ivf::{ builder::load_precomputed_partitions, shuffler::shuffle_dataset, @@ -45,6 +41,7 @@ use lance_index::{ IvfBuildParams, }, pq::{PQBuildParams, ProductQuantizer}, + quantizer::{Quantization, QuantizationMetadata, Quantizer}, sq::{builder::SQBuildParams, ScalarQuantizer}, Query, VectorIndex, DIST_COL, }, @@ -78,7 +75,6 @@ use uuid::Uuid; use self::io::write_hnsw_quantization_index_partitions; use super::{ - hnsw::HNSWIndex, pq::{build_pq_model, PQIndex}, utils::maybe_sample_training_data, }; From 02db25bbf00e5747def3eac56b10b0bf9f3a3968 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Thu, 6 Jun 2024 11:00:04 -0700 Subject: [PATCH 4/5] fix python build --- python/src/utils.rs | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/python/src/utils.rs b/python/src/utils.rs index c5fbbb3cf7..d64a3f05b4 100644 --- a/python/src/utils.rs +++ b/python/src/utils.rs @@ -21,16 +21,12 @@ use arrow_array::{ }; use arrow_data::ArrayData; use arrow_schema::DataType; -use lance::{ - datatypes::Schema, - index::vector::{hnsw::builder::*, sq}, - io::ObjectStore, -}; +use lance::{datatypes::Schema, index::vector::sq, io::ObjectStore}; use lance_arrow::FixedSizeListArrayExt; use lance_file::writer::FileWriter; use 
lance_index::vector::{ hnsw::{builder::HnswBuildParams, HNSW}, - v3::storage::VectorStore, + storage::VectorStore, }; use lance_linalg::kmeans::compute_partitions; use lance_linalg::{ @@ -180,7 +176,7 @@ impl Hnsw { let hnsw = RT .runtime - .block_on(build_hnsw_model(params, vectors.clone())) + .block_on(params.build(vectors.clone())) .map_err(|e| PyIOError::new_err(e.to_string()))?; Ok(Self { hnsw, vectors }) } From f32f3cd16129e2397e051a8d3f02a8241d89200f Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Thu, 6 Jun 2024 11:40:01 -0700 Subject: [PATCH 5/5] fix test --- rust/lance-index/src/vector.rs | 2 +- rust/lance/src/lib.rs | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/rust/lance-index/src/vector.rs b/rust/lance-index/src/vector.rs index 8cf12f4180..ebaee29f1f 100644 --- a/rust/lance-index/src/vector.rs +++ b/rust/lance-index/src/vector.rs @@ -44,8 +44,8 @@ pub const SQ_CODE_COLUMN: &str = "__sq_code"; lazy_static! { pub static ref VECTOR_RESULT_SCHEMA: arrow_schema::SchemaRef = arrow_schema::SchemaRef::new(arrow_schema::Schema::new(vec![ - ROW_ID_FIELD.clone(), Field::new(DIST_COL, arrow_schema::DataType::Float32, false), + ROW_ID_FIELD.clone(), ])); } diff --git a/rust/lance/src/lib.rs b/rust/lance/src/lib.rs index 62bc34f359..706a553841 100644 --- a/rust/lance/src/lib.rs +++ b/rust/lance/src/lib.rs @@ -68,11 +68,9 @@ //! //! ``` //! -use std::sync::Arc; use arrow_schema::DataType; use dataset::builder::DatasetBuilder; -use lance_core::ROW_ID_FIELD; pub use lance_core::{datatypes, error}; pub use lance_core::{Error, Result}; @@ -98,10 +96,4 @@ pub async fn open_dataset>(table_uri: T) -> Result { lazy_static::lazy_static! { pub static ref DIST_FIELD : arrow_schema::Field = arrow_schema::Field::new(DIST_COL, DataType::Float32, true); - /// Row ID field. This is nullable because its validity bitmap is sometimes used - /// as a selection vector. 
- pub static ref RESULT_SCHEMA: Arc = Arc::new(arrow_schema::Schema::new(vec![ - DIST_FIELD.clone(), - ROW_ID_FIELD.clone(), - ])); }