Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: move hnsw from lance crate to lance-index crate #2446

Merged
merged 5 commits into from
Jun 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions python/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,12 @@ use arrow_array::{
};
use arrow_data::ArrayData;
use arrow_schema::DataType;
use lance::{
datatypes::Schema,
index::vector::{hnsw::builder::*, sq},
io::ObjectStore,
};
use lance::{datatypes::Schema, index::vector::sq, io::ObjectStore};
use lance_arrow::FixedSizeListArrayExt;
use lance_file::writer::FileWriter;
use lance_index::vector::{
hnsw::{builder::HnswBuildParams, HNSW},
v3::storage::VectorStore,
storage::VectorStore,
};
use lance_linalg::kmeans::compute_partitions;
use lance_linalg::{
Expand Down Expand Up @@ -180,7 +176,7 @@ impl Hnsw {

let hnsw = RT
.runtime
.block_on(build_hnsw_model(params, vectors.clone()))
.block_on(params.build(vectors.clone()))
.map_err(|e| PyIOError::new_err(e.to_string()))?;
Ok(Self { hnsw, vectors })
}
Expand Down
2 changes: 1 addition & 1 deletion rust/lance-index/benches/sq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use criterion::{criterion_group, criterion_main, Criterion};
use lance_arrow::{FixedSizeListArrayExt, RecordBatchExt};
use lance_core::ROW_ID;
use lance_index::vector::{
sq::storage::ScalarQuantizationStorage, v3::storage::VectorStore, SQ_CODE_COLUMN,
sq::storage::ScalarQuantizationStorage, storage::VectorStore, SQ_CODE_COLUMN,
};
use lance_linalg::distance::DistanceType;
use lance_testing::datagen::generate_random_array;
Expand Down
6 changes: 6 additions & 0 deletions rust/lance-index/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@
// SPDX-FileCopyrightText: Copyright The Lance Authors

//! Lance secondary index library
//!
//! <section class="warning">
//! This is an internal crate used by <a href="https://github.com/lancedb/lance">the lance project</a>.
//! <br/>
//! API stability is not guaranteed.
//! </section>

#![cfg_attr(
all(feature = "nightly", target_arch = "x86_64"),
Expand Down
20 changes: 16 additions & 4 deletions rust/lance-index/src/vector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@
use std::{collections::HashMap, sync::Arc};

use arrow_array::{ArrayRef, RecordBatch};
use arrow_schema::Field;
use async_trait::async_trait;
use lance_core::Result;
use lance_core::{Result, ROW_ID_FIELD};
use lance_io::traits::Reader;
use lance_linalg::distance::DistanceType;
use lazy_static::lazy_static;

pub mod bq;
pub mod flat;
Expand All @@ -22,6 +24,7 @@ pub mod pq;
pub mod quantizer;
pub mod residual;
pub mod sq;
pub mod storage;
pub mod transform;
pub mod utils;
pub mod v3;
Expand All @@ -31,11 +34,20 @@ use crate::{prefilter::PreFilter, Index};
pub use residual::RESIDUAL_COLUMN;

// TODO: Make these crate private once the migration from lance to lance-index is done.
pub const DIST_COL: &str = "_distance";
pub const DISTANCE_TYPE_KEY: &str = "distance_type";
pub const INDEX_UUID_COLUMN: &str = "__index_uuid";
pub const PART_ID_COLUMN: &str = "__ivf_part_id";
pub const PQ_CODE_COLUMN: &str = "__pq_code";
pub const SQ_CODE_COLUMN: &str = "__sq_code";
pub const PART_ID_COLUMN: &str = "__ivf_part_id";
pub const INDEX_UUID_COLUMN: &str = "__index_uuid";
pub const DIST_COL: &str = "_distance";

lazy_static! {
    /// Arrow schema with two fields: a non-nullable `Float32` column named
    /// [`DIST_COL`], followed by a copy of [`ROW_ID_FIELD`].
    pub static ref VECTOR_RESULT_SCHEMA: arrow_schema::SchemaRef = {
        let distance_field = Field::new(DIST_COL, arrow_schema::DataType::Float32, false);
        let row_id_field = ROW_ID_FIELD.clone();
        let fields = vec![distance_field, row_id_field];
        arrow_schema::SchemaRef::new(arrow_schema::Schema::new(fields))
    };
}

/// Query parameters for the vector indices
#[derive(Debug, Clone)]
Expand Down
6 changes: 2 additions & 4 deletions rust/lance-index/src/vector/flat/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,8 @@ use crate::{
vector::{
graph::{OrderedFloat, OrderedNode},
quantizer::{Quantization, QuantizationType, Quantizer, QuantizerMetadata},
v3::{
storage::{DistCalculator, VectorStore},
subindex::IvfSubIndex,
},
storage::{DistCalculator, VectorStore},
v3::subindex::IvfSubIndex,
Query, DIST_COL,
},
};
Expand Down
2 changes: 1 addition & 1 deletion rust/lance-index/src/vector/flat/storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
use std::sync::Arc;

use crate::vector::quantizer::QuantizerStorage;
use crate::vector::storage::{DistCalculator, VectorStore};
use crate::vector::utils::prefetch_arrow_array;
use crate::vector::v3::storage::{DistCalculator, VectorStore};
use arrow::array::AsArray;
use arrow::compute::concat_batches;
use arrow::datatypes::UInt64Type;
Expand Down
2 changes: 1 addition & 1 deletion rust/lance-index/src/vector/graph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ pub mod builder;

use crate::vector::DIST_COL;

use crate::vector::v3::storage::DistCalculator;
use crate::vector::storage::DistCalculator;

pub(crate) const NEIGHBORS_COL: &str = "__neighbors";

Expand Down
11 changes: 7 additions & 4 deletions rust/lance-index/src/vector/hnsw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,20 @@
//! Hierarchical Navigable Small World (HNSW).
//!

pub mod builder;
use arrow_schema::{DataType, Field};
pub use builder::HNSW;
use deepsize::DeepSizeOf;
use itertools::Itertools;
use serde::{Deserialize, Serialize};

use self::builder::HnswBuildParams;

use super::graph::{OrderedFloat, OrderedNode};
use super::v3::storage::VectorStore;
use super::storage::VectorStore;

pub mod builder;
pub mod index;

pub use builder::HNSW;
pub use index::HNSWIndex;

const HNSW_TYPE: &str = "HNSW";
const VECTOR_ID_COL: &str = "__vector_id";
Expand Down
12 changes: 11 additions & 1 deletion rust/lance-index/src/vector/hnsw/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,13 @@ use serde::{Deserialize, Serialize};
use super::super::graph::beam_search;
use super::{select_neighbors_heuristic, HnswMetadata, HNSW_TYPE, VECTOR_ID_COL, VECTOR_ID_FIELD};
use crate::scalar::IndexWriter;
use crate::vector::flat::storage::FlatStorage;
use crate::vector::graph::builder::GraphBuilderNode;
use crate::vector::graph::greedy_search;
use crate::vector::graph::{
Graph, OrderedFloat, OrderedNode, VisitedGenerator, DISTS_FIELD, NEIGHBORS_COL, NEIGHBORS_FIELD,
};
use crate::vector::v3::storage::{DistCalculator, VectorStore};
use crate::vector::storage::{DistCalculator, VectorStore};
use crate::vector::DIST_COL;
use crate::{IndexMetadata, INDEX_METADATA_SCHEMA_KEY};

Expand Down Expand Up @@ -103,6 +104,15 @@ impl HnswBuildParams {
self.parallel_limit = Some(limit);
self
}

/// Build an [`HNSW`] graph over `data` using these build parameters.
///
/// The vectors are wrapped in a [`FlatStorage`] and handed to
/// [`HNSW::build_with_storage`]; both use [`DistanceType::L2`].
///
/// NOTE(review): `as_fixed_size_list()` presumably panics when `data` is not a
/// fixed-size-list array — confirm that every caller passes one.
pub async fn build(self, data: ArrayRef) -> Result<HNSW> {
    // Vectors are already normalized when the metric type is cosine, so the L2
    // distance can be used here.
    let distance_type = DistanceType::L2;
    let vectors = data.as_fixed_size_list().clone();
    let storage = Arc::new(FlatStorage::new(vectors, distance_type));
    HNSW::build_with_storage(distance_type, self, storage).await
}
}

/// Build a HNSW graph.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,6 @@ use async_trait::async_trait;
use deepsize::DeepSizeOf;
use lance_core::{datatypes::Schema, Error, Result};
use lance_file::reader::FileReader;
use lance_index::vector::{hnsw::HNSW, quantizer::Quantizer};
use lance_index::{
vector::{
graph::NEIGHBORS_FIELD,
hnsw::{HnswMetadata, VECTOR_ID_FIELD},
ivf::storage::IVF_PARTITION_KEY,
quantizer::{IvfQuantizationStorage, Quantization},
v3::storage::VectorStore,
Query,
},
Index, IndexType,
};
use lance_io::traits::Reader;
use lance_linalg::distance::DistanceType;
use lance_table::format::SelfDescribingFileReader;
Expand All @@ -33,19 +21,26 @@ use serde_json::json;
use snafu::{location, Location};
use tracing::instrument;

use super::VectorIndex;
use crate::index::prefilter::PreFilter;
use crate::RESULT_SCHEMA;

pub mod builder;
use crate::prefilter::PreFilter;
use crate::{
vector::{
graph::NEIGHBORS_FIELD,
hnsw::{HnswMetadata, HNSW, VECTOR_ID_FIELD},
ivf::storage::IVF_PARTITION_KEY,
quantizer::{IvfQuantizationStorage, Quantization, Quantizer},
storage::VectorStore,
Query, VectorIndex, VECTOR_RESULT_SCHEMA,
},
Index, IndexType,
};

#[derive(Clone, DeepSizeOf)]
pub(crate) struct HNSWIndexOptions {
pub struct HNSWIndexOptions {
pub use_residual: bool,
}

#[derive(Clone, DeepSizeOf)]
pub(crate) struct HNSWIndex<Q: Quantization> {
pub struct HNSWIndex<Q: Quantization> {
distance_type: DistanceType,

// Some(T) if the index is loaded, None otherwise
Expand Down Expand Up @@ -150,7 +145,7 @@ impl<Q: Quantization + Send + Sync + 'static> Index for HNSWIndex<Q> {
impl<Q: Quantization + Send + Sync + 'static> VectorIndex for HNSWIndex<Q> {
#[instrument(level = "debug", skip_all, name = "HNSWIndex::search")]
async fn search(&self, query: &Query, pre_filter: Arc<dyn PreFilter>) -> Result<RecordBatch> {
let schema = RESULT_SCHEMA.clone();
let schema = VECTOR_RESULT_SCHEMA.clone();

let hnsw = self.hnsw.as_ref().ok_or(Error::Index {
message: "HNSW index not loaded".to_string(),
Expand Down
2 changes: 1 addition & 1 deletion rust/lance-index/src/vector/pq/storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ use crate::{
ivf::storage::IvfData,
pq::transform::PQTransformer,
quantizer::{QuantizerMetadata, QuantizerStorage},
storage::{DistCalculator, VectorStore},
transform::Transformer,
v3::storage::{DistCalculator, VectorStore},
PQ_CODE_COLUMN,
},
IndexMetadata, INDEX_METADATA_SCHEMA_KEY,
Expand Down
31 changes: 21 additions & 10 deletions rust/lance-index/src/vector/quantizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use lance_arrow::ArrowFloatType;
use lance_core::{Error, Result};
use lance_file::reader::FileReader;
use lance_io::traits::Reader;
use lance_linalg::distance::{DistanceType, Dot, MetricType, L2};
use lance_linalg::distance::{DistanceType, Dot, L2};
use lance_table::format::SelfDescribingFileReader;
use serde::{Deserialize, Serialize};
use snafu::{location, Location};
Expand All @@ -33,7 +33,7 @@ use super::{
storage::{ScalarQuantizationMetadata, ScalarQuantizationStorage},
ScalarQuantizer,
},
v3::storage::VectorStore,
storage::VectorStore,
};
use super::{PQ_CODE_COLUMN, SQ_CODE_COLUMN};

Expand Down Expand Up @@ -66,6 +66,11 @@ impl std::fmt::Display for QuantizationType {
}
}

/// Quantization Method.
///
/// <section class="warning">
/// Internal use only. End users should not use this directly.
/// </section>
#[derive(Debug, Clone, DeepSizeOf)]
pub enum Quantizer {
Flat(FlatQuantizer),
Expand Down Expand Up @@ -157,7 +162,7 @@ pub trait QuantizerStorage: Clone + Sized + DeepSizeOf + VectorStore {
async fn load_partition(
reader: &FileReader,
range: std::ops::Range<usize>,
metric_type: MetricType,
distance_type: DistanceType,
metadata: &Self::Metadata,
) -> Result<Self>;
}
Expand Down Expand Up @@ -335,11 +340,11 @@ where
}
}

/// Loader to load partitioned PQ storage from disk.
/// Loader to load partitioned [VectorStore] from disk.
pub struct IvfQuantizationStorage<Q: Quantization> {
reader: FileReader,

metric_type: MetricType,
distance_type: DistanceType,
quantizer: Quantizer,
metadata: Q::Metadata,

Expand All @@ -359,7 +364,7 @@ impl<Q: Quantization> Clone for IvfQuantizationStorage<Q> {
fn clone(&self) -> Self {
Self {
reader: self.reader.clone(),
metric_type: self.metric_type,
distance_type: self.distance_type,
quantizer: self.quantizer.clone(),
metadata: self.metadata.clone(),
ivf: self.ivf.clone(),
Expand Down Expand Up @@ -391,15 +396,15 @@ impl<Q: Quantization> IvfQuantizationStorage<Q> {
message: format!("Failed to parse index metadata: {}", metadata_str),
location: location!(),
})?;
let metric_type: MetricType = MetricType::try_from(index_metadata.distance_type.as_str())?;
let distance_type = DistanceType::try_from(index_metadata.distance_type.as_str())?;

let ivf_data = IvfData::load(&reader).await?;

let metadata = Q::Metadata::load(&reader).await?;
let quantizer = Q::from_metadata(&metadata, metric_type)?;
let quantizer = Q::from_metadata(&metadata, distance_type)?;
Ok(Self {
reader,
metric_type,
distance_type,
quantizer,
metadata,
ivf: ivf_data,
Expand All @@ -419,8 +424,14 @@ impl<Q: Quantization> IvfQuantizationStorage<Q> {
self.ivf.num_partitions()
}

/// Load one partition of vector storage.
///
/// # Parameters
/// - `part_id`: the partition id.
pub async fn load_partition(&self, part_id: usize) -> Result<Q::Storage> {
let range = self.ivf.row_range(part_id);
Q::Storage::load_partition(&self.reader, range, self.metric_type, &self.metadata).await
Q::Storage::load_partition(&self.reader, range, self.distance_type, &self.metadata).await
}
}
2 changes: 1 addition & 1 deletion rust/lance-index/src/vector/sq/storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ use snafu::{location, Location};
use crate::{
vector::{
quantizer::{QuantizerMetadata, QuantizerStorage},
storage::{DistCalculator, VectorStore},
transform::Transformer,
v3::storage::{DistCalculator, VectorStore},
SQ_CODE_COLUMN,
},
IndexMetadata, INDEX_METADATA_SCHEMA_KEY,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

//! Vector Storage, holding (quantized) vectors and providing distance calculation.

use std::{any::Any, sync::Arc};

use arrow::compute::concat_batches;
Expand Down
3 changes: 0 additions & 3 deletions rust/lance-index/src/vector/v3.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,4 @@
// SPDX-FileCopyrightText: Copyright The Lance Authors

pub mod shuffler;
pub mod storage;
pub mod subindex;

pub const DISTANCE_TYPE_KEY: &str = "distance_type";
3 changes: 1 addition & 2 deletions rust/lance-index/src/vector/v3/subindex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@ use arrow_array::{ArrayRef, RecordBatch};
use deepsize::DeepSizeOf;
use lance_core::Result;

use crate::vector::storage::VectorStore;
use crate::{prefilter::PreFilter, vector::Query};

use super::storage::VectorStore;

pub const SUB_INDEX_METADATA_KEY: &str = "sub_index_metadata";

/// A sub index for IVF index
Expand Down
Loading
Loading