lancedb · broccoliSpicy · Sep 27, 2024 · Sep 16, 2024 · Sep 16, 2024 · Sep 17, 2024
diff --git a/protos/encodings.proto b/protos/encodings.proto
@@ -190,6 +190,18 @@ message Bitpacked {
   bool signed = 4;
 }
 
+// Bitpacked encoding for non-negative values: items are bitpacked in a buffer
+message BitpackedForNonNeg {
+  // The number of bits used to store each value in the buffer
+  uint64 compressed_bits_per_value = 1;
+
+  // The number of bits of each uncompressed value, e.g. 32 for a u32
+  uint64 uncompressed_bits_per_value = 2;
+
+  // The buffer holding the bitpacked items
+  Buffer buffer = 3;
+}
+
 // An array encoding for shredded structs that will never be null
 //
 // There is no actual data in this column.
@@ -240,6 +252,7 @@ message ArrayEncoding {
         PackedStruct packed_struct = 9;
         Bitpacked bitpacked = 10;
         FixedSizeBinary fixed_size_binary = 11;
+        BitpackedForNonNeg bitpacked_for_non_neg = 12;
     }
 }
 

diff --git a/rust/lance-encoding/Cargo.toml b/rust/lance-encoding/Cargo.toml
@@ -37,6 +37,8 @@ snafu.workspace = true
 tokio.workspace = true
 tracing.workspace = true
 zstd.workspace = true
+fastlanes = "0.1.5"
+bytemuck = "=1.18.0"
 
 [dev-dependencies]
 lance-testing.workspace = true

diff --git a/rust/lance-encoding/benches/decoder.rs b/rust/lance-encoding/benches/decoder.rs
@@ -2,7 +2,7 @@
 // SPDX-FileCopyrightText: Copyright The Lance Authors
 use std::{collections::HashMap, sync::Arc};
 
-use arrow_array::{RecordBatch, UInt32Array};
+use arrow_array::{RecordBatch, UInt32Array, UInt8Array};
 use arrow_schema::{DataType, Field, Schema, TimeUnit};
 use arrow_select::take::take;
 use criterion::{criterion_group, criterion_main, Criterion};
@@ -60,6 +60,47 @@ const ENCODING_OPTIONS: EncodingOptions = EncodingOptions {
     keep_original_array: true,
 };
 
+/// Benchmarks decoding of a single non-nullable `UInt8` column.
+///
+/// Results are reported under the `decode_uint8` group. The batch is encoded once up
+/// front with the default `CoreFieldEncodingStrategy`; the measured closure then decodes
+/// it and checks the decoded row count.
+fn bench_decode2(c: &mut Criterion) {
+    let runtime = tokio::runtime::Runtime::new().unwrap();
+    let mut group = c.benchmark_group("decode_uint8");
+    group.measurement_time(std::time::Duration::from_secs(12));
+
+    // Input: one column named "uint8" holding 1024 * 1024 * 1024 copies of the value 5.
+    let values = UInt8Array::from(vec![5; 1024 * 1024 * 1024]);
+    let arrow_schema = Schema::new(vec![Field::new("uint8", DataType::UInt8, false)]);
+    let data = RecordBatch::try_new(Arc::new(arrow_schema), vec![Arc::new(values)]).unwrap();
+    let lance_schema = lance_core::datatypes::Schema::try_from(data.schema().as_ref()).unwrap();
+    let lance_schema = Arc::new(lance_schema);
+
+    // Throughput is reported against the in-memory size of the input batch.
+    let input_bytes = data.get_array_memory_size() as u64;
+    group.throughput(criterion::Throughput::Bytes(input_bytes));
+
+    // Encode once, outside the measured closure, so only decoding is timed.
+    let encoding_strategy = CoreFieldEncodingStrategy::default();
+    let encoded = runtime
+        .block_on(encode_batch(&data, lance_schema, &encoding_strategy, &ENCODING_OPTIONS))
+        .unwrap();
+
+    // Measured section: decode the whole batch with no filter and default middleware.
+    group.bench_function("uint8", |bencher| {
+        bencher.iter(|| {
+            let filter = FilterExpression::no_filter();
+            let middleware = DecoderMiddlewareChain::default();
+            let decoded = runtime
+                .block_on(lance_encoding::decoder::decode_batch(&encoded, &filter, &middleware))
+                .unwrap();
+            // Sanity check: decoding must yield as many rows as were encoded.
+            assert_eq!(data.num_rows(), decoded.num_rows());
+        })
+    });
+}
+
 fn bench_decode(c: &mut Criterion) {
     let rt = tokio::runtime::Runtime::new().unwrap();
     let mut group = c.benchmark_group("decode_primitive");
@@ -314,7 +355,7 @@ criterion_group!(
     name=benches;
     config = Criterion::default().significance_level(0.1).sample_size(10)
         .with_profiler(pprof::criterion::PProfProfiler::new(100, pprof::criterion::Output::Flamegraph(None)));
-    targets = bench_decode, bench_decode_fsl, bench_decode_str_with_dict_encoding, bench_decode_packed_struct,
+    targets = bench_decode2, bench_decode, bench_decode_fsl, bench_decode_str_with_dict_encoding, bench_decode_packed_struct,
                 bench_decode_str_with_fixed_size_binary_encoding);
 
 // Non-linux version does not support pprof.

diff --git a/rust/lance-encoding/src/buffer.rs b/rust/lance-encoding/src/buffer.rs
@@ -203,6 +203,36 @@ impl LanceBuffer {
         Self::Borrowed(Buffer::from_vec(vec))
     }
 
+    /// Reinterprets the buffer's bytes as a `&[T]` without copying.
+    ///
+    /// Returns `Error::Internal` if `T` is zero-sized, if the byte length is not a multiple
+    /// of `size_of::<T>()`, or if the data is not aligned for `T`.
+    pub fn reinterpret_to_rust_native<T>(&mut self) -> Result<&[T]>
+    where
+        T: Copy, // NOTE(review): `Copy` does not imply every bit pattern is valid for `T`
+    {
+        let buffer = self.borrow_and_clone().into_buffer();
+        let bytes = buffer.as_slice();
+        let size = std::mem::size_of::<T>();
+        // A zero-sized `T` would make the modulo below panic, so reject it explicitly.
+        if size == 0 || bytes.len() % size != 0 {
+            return Err(Error::Internal {
+                message: "Buffer size is not a multiple of the target type size".to_string(),
+                location: location!(),
+            });
+        }
+        // `slice::from_raw_parts` requires an aligned pointer; a misaligned one is UB.
+        if (bytes.as_ptr() as usize) % std::mem::align_of::<T>() != 0 {
+            return Err(Error::Internal {
+                message: "Buffer is not aligned for the target type".to_string(),
+                location: location!(),
+            });
+        }
+        // SAFETY: length and alignment were checked above. NOTE(review): assumes `self` keeps
+        // the allocation behind `buffer` alive for the returned borrow — confirm.
+        Ok(unsafe { std::slice::from_raw_parts(bytes.as_ptr().cast::<T>(), bytes.len() / size) })
+    }
+
     /// Reinterprets a LanceBuffer into a Vec<T>
     ///
     /// Unfortunately, there is no way to do this safely in Rust without a copy, even if

diff --git a/rust/lance-encoding/src/encoder.rs b/rust/lance-encoding/src/encoder.rs
@@ -14,6 +14,8 @@ use snafu::{location, Location};
 use crate::buffer::LanceBuffer;
 use crate::data::DataBlock;
 use crate::encodings::logical::r#struct::StructFieldEncoder;
+use crate::encodings::physical::bitpack_fastlanes::compute_compressed_bit_width_for_non_neg;
+use crate::encodings::physical::bitpack_fastlanes::BitpackedForNonNegArrayEncoder;
 use crate::encodings::physical::block_compress::CompressionScheme;
 use crate::encodings::physical::dictionary::AlreadyDictionaryEncoder;
 use crate::encodings::physical::fsst::FsstArrayEncoder;
@@ -331,6 +333,23 @@ impl CoreArrayEncodingStrategy {
 
                 Ok(Box::new(PackedStructEncoder::new(inner_encoders)))
             }
+            DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => {
+                let compressed_bit_width = compute_compressed_bit_width_for_non_neg(arrays);
+                Ok(Box::new(BitpackedForNonNegArrayEncoder::new(
+                    compressed_bit_width as usize,
+                    data_type.clone(),
+                )))
+            }
+
+            // TODO: for signed integers, I intend to make it a cascaded encoding, a sparse array for the negative values and very wide(bit-width) values,
+            // then a bitpacked array for the narrow(bit-width) values, I need `BitpackedForNeg` to be merged first
+            DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => {
+                let compressed_bit_width = compute_compressed_bit_width_for_non_neg(arrays);
+                Ok(Box::new(BitpackedForNonNegArrayEncoder::new(
+                    compressed_bit_width as usize,
+                    data_type.clone(),
+                )))
+            }
             _ => Ok(Box::new(BasicEncoder::new(Box::new(
                 ValueEncoder::default(),
             )))),

diff --git a/rust/lance-encoding/src/encodings/physical.rs b/rust/lance-encoding/src/encodings/physical.rs
@@ -19,6 +19,7 @@ pub mod basic;
 pub mod binary;
 pub mod bitmap;
 pub mod bitpack;
+pub mod bitpack_fastlanes;
 pub mod block_compress;
 pub mod dictionary;
 pub mod fixed_size_binary;
@@ -109,6 +110,19 @@ fn get_bitpacked_buffer_decoder(
     ))
 }
 
+/// Builds the page scheduler for a `BitpackedForNonNeg` page; panics if it has no buffer.
+fn get_bitpacked_for_non_neg_buffer_decoder(
+    encoding: &pb::BitpackedForNonNeg,
+    buffers: &PageBuffers,
+) -> Box<dyn PageScheduler> {
+    let (offset, _) = get_buffer(encoding.buffer.as_ref().unwrap(), buffers);
+    let compressed = encoding.compressed_bits_per_value;
+    let uncompressed = encoding.uncompressed_bits_per_value;
+    let scheduler =
+        bitpack_fastlanes::BitpackedForNonNegScheduler::new(compressed, uncompressed, offset);
+    Box::new(scheduler)
+}
+
 /// Convert a protobuf array encoding into a physical page scheduler
 pub fn decoder_from_array_encoding(
     encoding: &pb::ArrayEncoding,
@@ -252,6 +266,9 @@ pub fn decoder_from_array_encoding(
                 buffer_offset,
             ))
         }
+        pb::array_encoding::ArrayEncoding::BitpackedForNonNeg(bitpacked) => {
+            get_bitpacked_for_non_neg_buffer_decoder(bitpacked, buffers)
+        }
         // Currently there is no way to encode struct nullability and structs are encoded with a "header" column
         // (that has no data).  We never actually decode that column and so this branch is never actually encountered.
         //