lancedb · westonpace · Jun 19, 2024 · Jun 19, 2024 · Jun 20, 2024 · Jun 20, 2024
diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
@@ -40,6 +40,7 @@
 )
 from .dependencies import numpy as np
 from .dependencies import pandas as pd
+from .file import LanceFileWriter
 from .fragment import FragmentMetadata, LanceFragment
 from .lance import (
     CleanupStats,
@@ -1486,6 +1487,14 @@ def create_index(
                     metric,
                     accelerator,
                 )
+                arrow_ivf_centroids = pa.FixedSizeListArray.from_arrays(
+                    ivf_centroids.reshape(-1), ivf_centroids.shape[1]
+                )
+                tab = pa.table({"ivf_centroids": arrow_ivf_centroids})
+                with LanceFileWriter(
+                    "/tmp/ivf_centroids.lance", schema=tab.schema
+                ) as f:
+                    f.write_batch(tab)
                 kwargs["precomputed_partitions_file"] = partitions_file
 
             if (ivf_centroids is None) and (pq_codebook is not None):

diff --git a/rust/lance-file/src/v2/writer.rs b/rust/lance-file/src/v2/writer.rs
@@ -21,6 +21,7 @@ use prost::Message;
 use prost_types::Any;
 use snafu::{location, Location};
 use tokio::io::AsyncWriteExt;
+use tracing::instrument;
 
 use crate::datatypes::FieldsWithMeta;
 use crate::format::pb;
@@ -173,6 +174,7 @@ impl FileWriter {
         Ok(())
     }
 
+    #[instrument(skip_all, level = "debug")]
     async fn write_pages(
         &mut self,
         mut encoding_tasks: FuturesUnordered<EncodeTask>,
@@ -208,6 +210,40 @@ impl FileWriter {
         Ok(())
     }
 
+    /// Hands each schema field's column from `batch` to the matching column writer.
+    ///
+    /// Fields are paired positionally with `self.column_writers` and looked up in `batch`
+    /// by name.  The result holds one entry per field: the tasks `maybe_encode` returned
+    /// for that column.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Error::InvalidInput` if `batch` has no column named after a schema field,
+    /// and propagates any error from `maybe_encode`.
+    #[instrument(skip_all, level = "debug")]
+    fn encode_batch(&mut self, batch: &RecordBatch) -> Result<Vec<Vec<EncodeTask>>> {
+        self.schema
+            .fields
+            .iter()
+            .zip(self.column_writers.iter_mut())
+            .map(|(field, column_writer)| {
+                // `ok_or_else` keeps the error message from being formatted (and
+                // allocated) for every column on the success path.
+                let array = batch
+                    .column_by_name(&field.name)
+                    .ok_or_else(|| Error::InvalidInput {
+                        source: format!(
+                            "Cannot write batch.  The batch was missing the column `{}`",
+                            field.name
+                        )
+                        .into(),
+                        location: location!(),
+                    })?;
+                column_writer.maybe_encode(array.clone())
+            })
+            .collect::<Result<Vec<_>>>()
+    }
+
     /// Schedule a batch of data to be written to the file
     ///
     /// Note: the future returned by this method may complete before the data has been fully
@@ -235,25 +259,8 @@ impl FileWriter {
         };
         // First we push each array into its column writer.  This may or may not generate enough
         // data to trigger an encoding task.  We collect any encoding tasks into a queue.
-        let encoding_tasks = self
-            .schema
-            .fields
-            .iter()
-            .zip(self.column_writers.iter_mut())
-            .map(|(field, column_writer)| {
-                let array = batch
-                    .column_by_name(&field.name)
-                    .ok_or(Error::InvalidInput {
-                        source: format!(
-                            "Cannot write batch.  The batch was missing the column `{}`",
-                            field.name
-                        )
-                        .into(),
-                        location: location!(),
-                    })?;
-                column_writer.maybe_encode(array.clone())
-            })
-            .collect::<Result<Vec<_>>>()?;
+        let encoding_tasks = self.encode_batch(batch)?;
+
         let encoding_tasks = encoding_tasks
             .into_iter()
             .flatten()

diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml
@@ -62,6 +62,7 @@ lance-datagen.workspace = true
 lance-testing.workspace = true
 pprof.workspace = true
 tempfile.workspace = true
+test-log.workspace = true
 datafusion-sql.workspace = true
 
 [build-dependencies]

diff --git a/rust/lance-index/src/vector/ivf.rs b/rust/lance-index/src/vector/ivf.rs
@@ -14,6 +14,7 @@ use lance_linalg::{
     distance::{DistanceType, MetricType},
     kmeans::{compute_partitions_arrow_array, kmeans_find_partitions_arrow_array},
 };
+use tracing::instrument;
 
 use crate::vector::ivf::transform::IvfTransformer;
 use crate::vector::{
@@ -244,6 +245,7 @@ impl Ivf {
 }
 
 impl Transformer for Ivf {
+    #[instrument(name = "transform_ivf2", level = "debug", skip_all)]
     fn transform(&self, batch: &RecordBatch) -> Result<RecordBatch> {
         let mut batch = batch.clone();
         for transform in self.transforms.as_slice() {

diff --git a/rust/lance-index/src/vector/ivf/builder.rs b/rust/lance-index/src/vector/ivf/builder.rs
@@ -3,12 +3,13 @@

 //! Build IVF model

 use std::collections::HashMap;
 use std::sync::Arc;

 use arrow_array::cast::AsArray;
 use arrow_array::{Array, FixedSizeListArray, UInt32Array, UInt64Array};
 use futures::TryStreamExt;
+use lance_core::utils::address::RowAddress;
 use object_store::path::Path;
 use snafu::{location, Location};
 
@@ -102,29 +103,59 @@
 /// Currently, because `Dataset` is not cleanly refactored from `lance` to `lance-core`,
 /// we have to use `RecordBatchStream` as parameter.
+///
+/// Builds one lookup table per fragment: `mapping[f][r]` holds the partition recorded
+/// for the row whose [`RowAddress`] has `fragment_id() == f` and `row_id() == r`.
+/// Table `f` has `fragment_sizes[f]` slots; slots for rows that never appear in
+/// `stream` keep their initial value of `0`.
+///
+/// # Errors
+///
+/// Returns `Error::InvalidInput` if a batch lacks a `UInt64` `row_id` column or a
+/// `UInt32` `partition` column, or if a row address falls outside `fragment_sizes`.
+/// Errors yielded by `stream` are propagated unchanged.
 pub async fn load_precomputed_partitions(
-    stream: impl RecordBatchStream + Unpin + 'static,
-    size_hint: usize,
-) -> Result<HashMap<u64, u32>> {
-    let partition_lookup = stream
-        .try_fold(HashMap::with_capacity(size_hint), |mut lookup, batch| {
-            let row_ids: &UInt64Array = batch
-                .column_by_name("row_id")
-                .expect("malformed partition file: missing row_id column")
-                .as_primitive();
-            let partitions: &UInt32Array = batch
-                .column_by_name("partition")
-                .expect("malformed partition file: missing partition column")
-                .as_primitive();
-            row_ids
-                .values()
-                .iter()
-                .zip(partitions.values().iter())
-                .for_each(|(row_id, partition)| {
-                    lookup.insert(*row_id, *partition);
-                });
-            async move { Ok(lookup) }
-        })
-        .await?;
-
-    Ok(partition_lookup)
+    mut stream: impl RecordBatchStream + Unpin + 'static,
+    fragment_sizes: &[u32],
+) -> Result<Vec<Vec<u32>>> {
+    // Every failure below means the partition file disagrees with what the caller expects,
+    // so report it as invalid input instead of panicking.
+    let malformed = |detail: String| lance_core::Error::InvalidInput {
+        source: format!("malformed partition file: {}", detail).into(),
+        location: location!(),
+    };
+
+    // One zero-initialised slot per row of every fragment.
+    let mut mapping = fragment_sizes
+        .iter()
+        .map(|&size| vec![0; size as usize])
+        .collect::<Vec<_>>();
+    while let Some(batch) = stream.try_next().await? {
+        let row_ids: &UInt64Array = batch
+            .column_by_name("row_id")
+            .ok_or_else(|| malformed("missing row_id column".to_string()))?
+            .as_primitive_opt()
+            .ok_or_else(|| malformed("row_id column is not UInt64".to_string()))?;
+        let partitions: &UInt32Array = batch
+            .column_by_name("partition")
+            .ok_or_else(|| malformed("missing partition column".to_string()))?
+            .as_primitive_opt()
+            .ok_or_else(|| malformed("partition column is not UInt32".to_string()))?;
+        for (&row_id, &partition) in row_ids.values().iter().zip(partitions.values().iter()) {
+            let addr = RowAddress::new_from_id(row_id);
+            // Checked lookups: an address outside `fragment_sizes` is an error, not a panic.
+            let slot = mapping
+                .get_mut(addr.fragment_id() as usize)
+                .and_then(|fragment| fragment.get_mut(addr.row_id() as usize))
+                .ok_or_else(|| {
+                    malformed(format!(
+                        "row id {} (fragment {}, row {}) is outside the provided fragment sizes",
+                        row_id,
+                        addr.fragment_id(),
+                        addr.row_id()
+                    ))
+                })?;
+            *slot = partition;
+        }
+    }
+    Ok(mapping)
 }