diff --git a/protos/encodings.proto b/protos/encodings.proto index 7b8af601a4..19cd814257 100644 --- a/protos/encodings.proto +++ b/protos/encodings.proto @@ -190,6 +190,18 @@ message Bitpacked { bool signed = 4; } +// Items are bitpacked in a buffer +message BitpackedForNonNeg { + // the number of bits used for a value in the buffer + uint64 compressed_bits_per_value = 1; + + // the number of bits of the uncompressed value. e.g. for a u32, this will be 32 + uint64 uncompressed_bits_per_value = 2; + + // The items in the list + Buffer buffer = 3; +} + // An array encoding for shredded structs that will never be null // // There is no actual data in this column. @@ -240,6 +252,7 @@ message ArrayEncoding { PackedStruct packed_struct = 9; Bitpacked bitpacked = 10; FixedSizeBinary fixed_size_binary = 11; + BitpackedForNonNeg bitpacked_for_non_neg = 12; } } diff --git a/rust/lance-encoding/Cargo.toml b/rust/lance-encoding/Cargo.toml index 35c3c336ff..c9272c41ad 100644 --- a/rust/lance-encoding/Cargo.toml +++ b/rust/lance-encoding/Cargo.toml @@ -37,6 +37,10 @@ snafu.workspace = true tokio.workspace = true tracing.workspace = true zstd.workspace = true +bytemuck = "=1.18.0" +arrayref = "0.3.7" +paste = "1.0.15" +seq-macro = "0.3.5" [dev-dependencies] lance-testing.workspace = true diff --git a/rust/lance-encoding/benches/decoder.rs b/rust/lance-encoding/benches/decoder.rs index 21155f2453..f8bca14585 100644 --- a/rust/lance-encoding/benches/decoder.rs +++ b/rust/lance-encoding/benches/decoder.rs @@ -65,8 +65,8 @@ fn bench_decode(c: &mut Criterion) { let mut group = c.benchmark_group("decode_primitive"); for data_type in PRIMITIVE_TYPES { let data = lance_datagen::gen() - .anon_col(lance_datagen::array::rand_type(&DataType::Int32)) - .into_batch_rows(lance_datagen::RowCount::from(1024 * 1024)) + .anon_col(lance_datagen::array::rand_type(data_type)) + .into_batch_rows(lance_datagen::RowCount::from(1024 * 1024 * 1024)) .unwrap(); let lance_schema = Arc::new(lance_core::datatypes::Schema::try_from(data.schema().as_ref()).unwrap()); @@ -96,6 +96,7 @@ fn bench_decode(c: &mut Criterion) { }); } } + fn bench_decode_fsl(c: &mut Criterion) { let rt = tokio::runtime::Runtime::new().unwrap(); let mut group = c.benchmark_group("decode_primitive_fsl"); diff --git a/rust/lance-encoding/src/buffer.rs b/rust/lance-encoding/src/buffer.rs index 3e17588d71..57255565bc 100644 --- a/rust/lance-encoding/src/buffer.rs +++ b/rust/lance-encoding/src/buffer.rs @@ -283,6 +283,32 @@ impl LanceBuffer { pub fn copy_array(array: [u8; N]) -> Self { Self::Owned(Vec::from(array)) } + + #[allow(clippy::len_without_is_empty)] + pub fn len(&self) -> usize { + match self { + Self::Borrowed(buffer) => buffer.len(), + Self::Owned(buffer) => buffer.len(), + } + } + + /// Returns a new [LanceBuffer] that is a slice of this buffer starting at `offset`, + /// with `length` bytes. + /// Doing so allows the same memory region to be shared between lance buffers. + /// # Panics + /// Panics if `(offset + length)` is larger than the existing length. + /// If the buffer is owned this method will require a copy. 
+ pub fn slice_with_length(&self, offset: usize, length: usize) -> Self { + let original_buffer_len = self.len(); + assert!( + offset.saturating_add(length) <= original_buffer_len, + "the offset + length of the sliced Buffer cannot exceed the existing length" + ); + match self { + Self::Borrowed(buffer) => Self::Borrowed(buffer.slice_with_length(offset, length)), + Self::Owned(buffer) => Self::Owned(buffer[offset..offset + length].to_vec()), + } + } } impl AsRef<[u8]> for LanceBuffer { diff --git a/rust/lance-encoding/src/compression_algo/fastlanes.rs b/rust/lance-encoding/src/compression_algo/fastlanes.rs new file mode 100644 index 0000000000..346c7ab219 --- /dev/null +++ b/rust/lance-encoding/src/compression_algo/fastlanes.rs @@ -0,0 +1,2147 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +// NOTICE: +// This file is a modification of the `fastlanes` crate: https://github.com/spiraldb/fastlanes +// It is modified to allow a rust stable build +// +// The original code can be accessed at +// https://github.com/spiraldb/fastlanes/blob/8e0ff374f815d919d0c0ebdccf5ffd9e6dc7d663/src/bitpacking.rs +// https://github.com/spiraldb/fastlanes/blob/8e0ff374f815d919d0c0ebdccf5ffd9e6dc7d663/src/lib.rs +// https://github.com/spiraldb/fastlanes/blob/8e0ff374f815d919d0c0ebdccf5ffd9e6dc7d663/src/macros.rs +// +// The original code is licensed under the Apache Software License: +// https://github.com/spiraldb/fastlanes/blob/8e0ff374f815d919d0c0ebdccf5ffd9e6dc7d663/LICENSE + +use arrayref::{array_mut_ref, array_ref}; +use core::mem::size_of; +use paste::paste; + +pub const FL_ORDER: [usize; 8] = [0, 4, 2, 6, 1, 5, 3, 7]; + +pub trait FastLanes: Sized + Copy { + const T: usize = size_of::<Self>() * 8; + const LANES: usize = 1024 / Self::T; +} + +// Implement the trait for basic unsigned integer types +impl FastLanes for u8 {} +impl FastLanes for u16 {} +impl FastLanes for u32 {} +impl FastLanes for u64 {} + +macro_rules! pack { + ($T:ty, $W:expr, $packed:expr, $lane:expr, | $_1:tt $idx:ident | $($body:tt)*) => { + macro_rules! __kernel__ {( $_1 $idx:ident ) => ( $($body)* )} + { + use paste::paste; + + // The number of bits of T. + const T: usize = <$T>::T; + + #[inline(always)] + fn index(row: usize, lane: usize) -> usize { + let o = row / 8; + let s = row % 8; + (FL_ORDER[o] * 16) + (s * 128) + lane + } + + if $W == 0 { + // Nothing to do if W is 0, since the packed array is zero bytes. + } else if $W == T { + // Special case for W=T, we can just copy the input value directly to the packed value. + paste!(seq_t!(row in $T { + let idx = index(row, $lane); + $packed[<$T>::LANES * row + $lane] = __kernel__!(idx); + })); + } else { + // A mask of W bits. + let mask: $T = (1 << $W) - 1; + + // First we loop over each lane in the virtual 1024 bit word. + let mut tmp: $T = 0; + + // Loop over each of the rows of the lane. + // Inlining this loop means all branches are known at compile time and + // the code is auto-vectorized for SIMD execution. + paste!(seq_t!(row in $T { + let idx = index(row, $lane); + let src = __kernel__!(idx); + let src = src & mask; + + // Shift the src bits into their position in the tmp output variable. + if row == 0 { + tmp = src; + } else { + tmp |= src << (row * $W) % T; + } + + // If the next packed position is after our current one, then we have filled + // the current output and we can write the packed value.
+ let curr_word: usize = (row * $W) / T; + let next_word: usize = ((row + 1) * $W) / T; + + #[allow(unused_assignments)] + if next_word > curr_word { + $packed[<$T>::LANES * curr_word + $lane] = tmp; + let remaining_bits: usize = ((row + 1) * $W) % T; + // Keep the remaining bits for the next packed value. + tmp = src >> $W - remaining_bits; + } + })); + } + } + }; +} + +macro_rules! unpack { + ($T:ty, $W:expr, $packed:expr, $lane:expr, | $_1:tt $idx:ident, $_2:tt $elem:ident | $($body:tt)*) => { + macro_rules! __kernel__ {( $_1 $idx:ident, $_2 $elem:ident ) => ( $($body)* )} + { + use paste::paste; + + // The number of bits of T. + const T: usize = <$T>::T; + + #[inline(always)] + fn index(row: usize, lane: usize) -> usize { + let o = row / 8; + let s = row % 8; + (FL_ORDER[o] * 16) + (s * 128) + lane + } + + if $W == 0 { + // Special case for W=0, we just need to zero the output. + // We'll still respect the iteration order in case the kernel has side effects. + paste!(seq_t!(row in $T { + let idx = index(row, $lane); + let zero: $T = 0; + __kernel__!(idx, zero); + })); + } else if $W == T { + // Special case for W=T, we can just copy the packed value directly to the output. + paste!(seq_t!(row in $T { + let idx = index(row, $lane); + let src = $packed[<$T>::LANES * row + $lane]; + __kernel__!(idx, src); + })); + } else { + #[inline] + fn mask(width: usize) -> $T { + if width == T { <$T>::MAX } else { (1 << (width % T)) - 1 } + } + + let mut src: $T = $packed[$lane]; + let mut tmp: $T; + + paste!(seq_t!(row in $T { + // Figure out the packed positions + let curr_word: usize = (row * $W) / T; + let next_word = ((row + 1) * $W) / T; + + let shift = (row * $W) % T; + + if next_word > curr_word { + // Consume some bits from the curr packed input, the remainder are in the next + // packed input value + let remaining_bits = ((row + 1) * $W) % T; + let current_bits = $W - remaining_bits; + tmp = (src >> shift) & mask(current_bits); + + if next_word < $W { + // Load the next packed value + src = $packed[<$T>::LANES * next_word + $lane]; + // Consume the remaining bits from the next input value. + tmp |= (src & mask(remaining_bits)) << current_bits; + } + } else { + // Otherwise, just grab W bits from the src value + tmp = (src >> shift) & mask($W); + } + + // Write out the unpacked value + let idx = index(row, $lane); + __kernel__!(idx, tmp); + })); + } + } + }; +} + +// Macro for repeating a code block bit_size_of:: times. +macro_rules! seq_t { + ($ident:ident in u8 $body:tt) => {seq_macro::seq!($ident in 0..8 $body)}; + ($ident:ident in u16 $body:tt) => {seq_macro::seq!($ident in 0..16 $body)}; + ($ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)}; + ($ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)}; +} + +/// `BitPack` into a compile-time known bit-width. +pub trait BitPacking: FastLanes { + /// Packs 1024 elements into `W` bits each, where `W` is runtime-known instead of + /// compile-time known. + /// + /// # Safety + /// The input slice must be of exactly length 1024. The output slice must be of length + /// `1024 * W / T`, where `T` is the bit-width of Self and `W` is the packed width. + /// These lengths are checked only with `debug_assert` (i.e., not checked on release builds). + unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]); + + /// Unpacks 1024 elements from `W` bits each, where `W` is runtime-known instead of + /// compile-time known. 
+ /// + /// # Safety + /// The input slice must be of length `1024 * W / T`, where `T` is the bit-width of Self and `W` + /// is the packed width. The output slice must be of exactly length 1024. + /// These lengths are checked only with `debug_assert` (i.e., not checked on release builds). + unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]); +} + +impl BitPacking for u8 { + unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]) { + let packed_len = 128 * width / size_of::(); + debug_assert_eq!( + output.len(), + packed_len, + "Output buffer must be of size 1024 * W / T" + ); + debug_assert_eq!(input.len(), 1024, "Input buffer must be of size 1024"); + debug_assert!( + width <= Self::T, + "Width must be less than or equal to {}", + Self::T + ); + + match width { + 1 => pack_8_1( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 / 8], + ), + 2 => pack_8_2( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 2 / 8], + ), + 3 => pack_8_3( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 3 / 8], + ), + 4 => pack_8_4( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 4 / 8], + ), + 5 => pack_8_5( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 5 / 8], + ), + 6 => pack_8_6( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 6 / 8], + ), + 7 => pack_8_7( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 7 / 8], + ), + 8 => pack_8_8( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 8 / 8], + ), + + _ => unreachable!("Unsupported width: {}", width), + } + } + + unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]) { + let packed_len = 128 * width / size_of::(); + debug_assert_eq!( + input.len(), + packed_len, + "Input buffer must be of size 1024 * W / T" + ); + debug_assert_eq!(output.len(), 1024, "Output buffer must be of size 1024"); + debug_assert!( + width <= Self::T, + "Width must be less than or equal to {}", + Self::T + ); + + match width { + 1 => unpack_8_1( + array_ref![input, 0, 1024 / 8], + array_mut_ref![output, 0, 1024], + ), + 2 => unpack_8_2( + array_ref![input, 0, 1024 * 2 / 8], + array_mut_ref![output, 0, 1024], + ), + 3 => unpack_8_3( + array_ref![input, 0, 1024 * 3 / 8], + array_mut_ref![output, 0, 1024], + ), + 4 => unpack_8_4( + array_ref![input, 0, 1024 * 4 / 8], + array_mut_ref![output, 0, 1024], + ), + 5 => unpack_8_5( + array_ref![input, 0, 1024 * 5 / 8], + array_mut_ref![output, 0, 1024], + ), + 6 => unpack_8_6( + array_ref![input, 0, 1024 * 6 / 8], + array_mut_ref![output, 0, 1024], + ), + 7 => unpack_8_7( + array_ref![input, 0, 1024 * 7 / 8], + array_mut_ref![output, 0, 1024], + ), + 8 => unpack_8_8( + array_ref![input, 0, 1024 * 8 / 8], + array_mut_ref![output, 0, 1024], + ), + + _ => unreachable!("Unsupported width: {}", width), + } + } +} + +impl BitPacking for u16 { + unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]) { + let packed_len = 128 * width / size_of::(); + debug_assert_eq!( + output.len(), + packed_len, + "Output buffer must be of size 1024 * W / T" + ); + debug_assert_eq!(input.len(), 1024, "Input buffer must be of size 1024"); + debug_assert!( + width <= Self::T, + "Width must be less than or equal to {}", + Self::T + ); + + match width { + 1 => pack_16_1( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 / 16], + ), + 2 => pack_16_2( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 2 / 16], + ), + 3 => 
pack_16_3( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 3 / 16], + ), + 4 => pack_16_4( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 4 / 16], + ), + 5 => pack_16_5( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 5 / 16], + ), + 6 => pack_16_6( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 6 / 16], + ), + 7 => pack_16_7( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 7 / 16], + ), + 8 => pack_16_8( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 8 / 16], + ), + 9 => pack_16_9( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 9 / 16], + ), + + 10 => pack_16_10( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 10 / 16], + ), + 11 => pack_16_11( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 11 / 16], + ), + 12 => pack_16_12( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 12 / 16], + ), + 13 => pack_16_13( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 13 / 16], + ), + 14 => pack_16_14( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 14 / 16], + ), + 15 => pack_16_15( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 15 / 16], + ), + 16 => pack_16_16( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 16 / 16], + ), + + _ => unreachable!("Unsupported width: {}", width), + } + } + + unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]) { + let packed_len = 128 * width / size_of::(); + debug_assert_eq!( + input.len(), + packed_len, + "Input buffer must be of size 1024 * W / T" + ); + debug_assert_eq!(output.len(), 1024, "Output buffer must be of size 1024"); + debug_assert!( + width <= Self::T, + "Width must be less than or equal to {}", + Self::T + ); + + match width { + 1 => unpack_16_1( + array_ref![input, 0, 1024 / 16], + array_mut_ref![output, 0, 1024], + ), + 2 => unpack_16_2( + array_ref![input, 0, 1024 * 2 / 16], + array_mut_ref![output, 0, 1024], + ), + 3 => unpack_16_3( + array_ref![input, 0, 1024 * 3 / 16], + array_mut_ref![output, 0, 1024], + ), + 4 => unpack_16_4( + array_ref![input, 0, 1024 * 4 / 16], + array_mut_ref![output, 0, 1024], + ), + 5 => unpack_16_5( + array_ref![input, 0, 1024 * 5 / 16], + array_mut_ref![output, 0, 1024], + ), + 6 => unpack_16_6( + array_ref![input, 0, 1024 * 6 / 16], + array_mut_ref![output, 0, 1024], + ), + 7 => unpack_16_7( + array_ref![input, 0, 1024 * 7 / 16], + array_mut_ref![output, 0, 1024], + ), + 8 => unpack_16_8( + array_ref![input, 0, 1024 * 8 / 16], + array_mut_ref![output, 0, 1024], + ), + 9 => unpack_16_9( + array_ref![input, 0, 1024 * 9 / 16], + array_mut_ref![output, 0, 1024], + ), + + 10 => unpack_16_10( + array_ref![input, 0, 1024 * 10 / 16], + array_mut_ref![output, 0, 1024], + ), + 11 => unpack_16_11( + array_ref![input, 0, 1024 * 11 / 16], + array_mut_ref![output, 0, 1024], + ), + 12 => unpack_16_12( + array_ref![input, 0, 1024 * 12 / 16], + array_mut_ref![output, 0, 1024], + ), + 13 => unpack_16_13( + array_ref![input, 0, 1024 * 13 / 16], + array_mut_ref![output, 0, 1024], + ), + 14 => unpack_16_14( + array_ref![input, 0, 1024 * 14 / 16], + array_mut_ref![output, 0, 1024], + ), + 15 => unpack_16_15( + array_ref![input, 0, 1024 * 15 / 16], + array_mut_ref![output, 0, 1024], + ), + 16 => unpack_16_16( + array_ref![input, 0, 1024 * 16 / 16], + array_mut_ref![output, 0, 1024], + ), + + _ => unreachable!("Unsupported width: {}", width), + } + } +} + +impl BitPacking for u32 
{ + unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]) { + let packed_len = 128 * width / size_of::(); + debug_assert_eq!( + output.len(), + packed_len, + "Output buffer must be of size 1024 * W / T" + ); + debug_assert_eq!(input.len(), 1024, "Input buffer must be of size 1024"); + debug_assert!( + width <= Self::T, + "Width must be less than or equal to {}", + Self::T + ); + + match width { + 1 => pack_32_1( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 / 32], + ), + 2 => pack_32_2( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 2 / 32], + ), + 3 => pack_32_3( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 3 / 32], + ), + 4 => pack_32_4( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 4 / 32], + ), + 5 => pack_32_5( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 5 / 32], + ), + 6 => pack_32_6( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 6 / 32], + ), + 7 => pack_32_7( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 7 / 32], + ), + 8 => pack_32_8( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 8 / 32], + ), + 9 => pack_32_9( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 9 / 32], + ), + + 10 => pack_32_10( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 10 / 32], + ), + 11 => pack_32_11( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 11 / 32], + ), + 12 => pack_32_12( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 12 / 32], + ), + 13 => pack_32_13( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 13 / 32], + ), + 14 => pack_32_14( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 14 / 32], + ), + 15 => pack_32_15( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 15 / 32], + ), + 16 => pack_32_16( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 16 / 32], + ), + 17 => pack_32_17( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 17 / 32], + ), + 18 => pack_32_18( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 18 / 32], + ), + 19 => pack_32_19( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 19 / 32], + ), + + 20 => pack_32_20( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 20 / 32], + ), + 21 => pack_32_21( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 21 / 32], + ), + 22 => pack_32_22( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 22 / 32], + ), + 23 => pack_32_23( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 23 / 32], + ), + 24 => pack_32_24( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 24 / 32], + ), + 25 => pack_32_25( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 25 / 32], + ), + 26 => pack_32_26( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 26 / 32], + ), + 27 => pack_32_27( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 27 / 32], + ), + 28 => pack_32_28( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 28 / 32], + ), + 29 => pack_32_29( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 29 / 32], + ), + + 30 => pack_32_30( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 30 / 32], + ), + 31 => pack_32_31( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 31 / 32], + ), + 32 => pack_32_32( + array_ref![input, 0, 1024], + 
array_mut_ref![output, 0, 1024 * 32 / 32], + ), + + _ => unreachable!("Unsupported width: {}", width), + } + } + + unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]) { + let packed_len = 128 * width / size_of::(); + debug_assert_eq!( + input.len(), + packed_len, + "Input buffer must be of size 1024 * W / T" + ); + debug_assert_eq!(output.len(), 1024, "Output buffer must be of size 1024"); + debug_assert!( + width <= Self::T, + "Width must be less than or equal to {}", + Self::T + ); + + match width { + 1 => unpack_32_1( + array_ref![input, 0, 1024 / 32], + array_mut_ref![output, 0, 1024], + ), + 2 => unpack_32_2( + array_ref![input, 0, 1024 * 2 / 32], + array_mut_ref![output, 0, 1024], + ), + 3 => unpack_32_3( + array_ref![input, 0, 1024 * 3 / 32], + array_mut_ref![output, 0, 1024], + ), + 4 => unpack_32_4( + array_ref![input, 0, 1024 * 4 / 32], + array_mut_ref![output, 0, 1024], + ), + 5 => unpack_32_5( + array_ref![input, 0, 1024 * 5 / 32], + array_mut_ref![output, 0, 1024], + ), + 6 => unpack_32_6( + array_ref![input, 0, 1024 * 6 / 32], + array_mut_ref![output, 0, 1024], + ), + 7 => unpack_32_7( + array_ref![input, 0, 1024 * 7 / 32], + array_mut_ref![output, 0, 1024], + ), + 8 => unpack_32_8( + array_ref![input, 0, 1024 * 8 / 32], + array_mut_ref![output, 0, 1024], + ), + 9 => unpack_32_9( + array_ref![input, 0, 1024 * 9 / 32], + array_mut_ref![output, 0, 1024], + ), + + 10 => unpack_32_10( + array_ref![input, 0, 1024 * 10 / 32], + array_mut_ref![output, 0, 1024], + ), + 11 => unpack_32_11( + array_ref![input, 0, 1024 * 11 / 32], + array_mut_ref![output, 0, 1024], + ), + 12 => unpack_32_12( + array_ref![input, 0, 1024 * 12 / 32], + array_mut_ref![output, 0, 1024], + ), + 13 => unpack_32_13( + array_ref![input, 0, 1024 * 13 / 32], + array_mut_ref![output, 0, 1024], + ), + 14 => unpack_32_14( + array_ref![input, 0, 1024 * 14 / 32], + array_mut_ref![output, 0, 1024], + ), + 15 => unpack_32_15( + array_ref![input, 0, 1024 * 15 / 32], + array_mut_ref![output, 0, 1024], + ), + 16 => unpack_32_16( + array_ref![input, 0, 1024 * 16 / 32], + array_mut_ref![output, 0, 1024], + ), + 17 => unpack_32_17( + array_ref![input, 0, 1024 * 17 / 32], + array_mut_ref![output, 0, 1024], + ), + 18 => unpack_32_18( + array_ref![input, 0, 1024 * 18 / 32], + array_mut_ref![output, 0, 1024], + ), + 19 => unpack_32_19( + array_ref![input, 0, 1024 * 19 / 32], + array_mut_ref![output, 0, 1024], + ), + + 20 => unpack_32_20( + array_ref![input, 0, 1024 * 20 / 32], + array_mut_ref![output, 0, 1024], + ), + 21 => unpack_32_21( + array_ref![input, 0, 1024 * 21 / 32], + array_mut_ref![output, 0, 1024], + ), + 22 => unpack_32_22( + array_ref![input, 0, 1024 * 22 / 32], + array_mut_ref![output, 0, 1024], + ), + 23 => unpack_32_23( + array_ref![input, 0, 1024 * 23 / 32], + array_mut_ref![output, 0, 1024], + ), + 24 => unpack_32_24( + array_ref![input, 0, 1024 * 24 / 32], + array_mut_ref![output, 0, 1024], + ), + 25 => unpack_32_25( + array_ref![input, 0, 1024 * 25 / 32], + array_mut_ref![output, 0, 1024], + ), + 26 => unpack_32_26( + array_ref![input, 0, 1024 * 26 / 32], + array_mut_ref![output, 0, 1024], + ), + 27 => unpack_32_27( + array_ref![input, 0, 1024 * 27 / 32], + array_mut_ref![output, 0, 1024], + ), + 28 => unpack_32_28( + array_ref![input, 0, 1024 * 28 / 32], + array_mut_ref![output, 0, 1024], + ), + 29 => unpack_32_29( + array_ref![input, 0, 1024 * 29 / 32], + array_mut_ref![output, 0, 1024], + ), + + 30 => unpack_32_30( + array_ref![input, 0, 1024 * 30 / 32], + array_mut_ref![output, 
0, 1024], + ), + 31 => unpack_32_31( + array_ref![input, 0, 1024 * 31 / 32], + array_mut_ref![output, 0, 1024], + ), + 32 => unpack_32_32( + array_ref![input, 0, 1024 * 32 / 32], + array_mut_ref![output, 0, 1024], + ), + + _ => unreachable!("Unsupported width: {}", width), + } + } +} + +impl BitPacking for u64 { + unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]) { + let packed_len = 128 * width / size_of::(); + debug_assert_eq!( + output.len(), + packed_len, + "Output buffer must be of size 1024 * W / T" + ); + debug_assert_eq!(input.len(), 1024, "Input buffer must be of size 1024"); + debug_assert!( + width <= Self::T, + "Width must be less than or equal to {}", + Self::T + ); + + match width { + 1 => pack_64_1( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 / 64], + ), + 2 => pack_64_2( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 2 / 64], + ), + 3 => pack_64_3( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 3 / 64], + ), + 4 => pack_64_4( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 4 / 64], + ), + 5 => pack_64_5( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 5 / 64], + ), + 6 => pack_64_6( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 6 / 64], + ), + 7 => pack_64_7( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 7 / 64], + ), + 8 => pack_64_8( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 8 / 64], + ), + 9 => pack_64_9( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 9 / 64], + ), + + 10 => pack_64_10( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 10 / 64], + ), + 11 => pack_64_11( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 11 / 64], + ), + 12 => pack_64_12( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 12 / 64], + ), + 13 => pack_64_13( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 13 / 64], + ), + 14 => pack_64_14( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 14 / 64], + ), + 15 => pack_64_15( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 15 / 64], + ), + 16 => pack_64_16( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 16 / 64], + ), + 17 => pack_64_17( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 17 / 64], + ), + 18 => pack_64_18( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 18 / 64], + ), + 19 => pack_64_19( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 19 / 64], + ), + + 20 => pack_64_20( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 20 / 64], + ), + 21 => pack_64_21( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 21 / 64], + ), + 22 => pack_64_22( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 22 / 64], + ), + 23 => pack_64_23( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 23 / 64], + ), + 24 => pack_64_24( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 24 / 64], + ), + 25 => pack_64_25( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 25 / 64], + ), + 26 => pack_64_26( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 26 / 64], + ), + 27 => pack_64_27( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 27 / 64], + ), + 28 => pack_64_28( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 28 / 64], + ), + 29 => pack_64_29( + array_ref![input, 0, 1024], 
+ array_mut_ref![output, 0, 1024 * 29 / 64], + ), + + 30 => pack_64_30( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 30 / 64], + ), + 31 => pack_64_31( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 31 / 64], + ), + 32 => pack_64_32( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 32 / 64], + ), + 33 => pack_64_33( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 33 / 64], + ), + 34 => pack_64_34( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 34 / 64], + ), + 35 => pack_64_35( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 35 / 64], + ), + 36 => pack_64_36( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 36 / 64], + ), + 37 => pack_64_37( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 37 / 64], + ), + 38 => pack_64_38( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 38 / 64], + ), + 39 => pack_64_39( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 39 / 64], + ), + + 40 => pack_64_40( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 40 / 64], + ), + 41 => pack_64_41( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 41 / 64], + ), + 42 => pack_64_42( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 42 / 64], + ), + 43 => pack_64_43( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 43 / 64], + ), + 44 => pack_64_44( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 44 / 64], + ), + 45 => pack_64_45( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 45 / 64], + ), + 46 => pack_64_46( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 46 / 64], + ), + 47 => pack_64_47( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 47 / 64], + ), + 48 => pack_64_48( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 48 / 64], + ), + 49 => pack_64_49( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 49 / 64], + ), + + 50 => pack_64_50( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 50 / 64], + ), + 51 => pack_64_51( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 51 / 64], + ), + 52 => pack_64_52( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 52 / 64], + ), + 53 => pack_64_53( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 53 / 64], + ), + 54 => pack_64_54( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 54 / 64], + ), + 55 => pack_64_55( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 55 / 64], + ), + 56 => pack_64_56( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 56 / 64], + ), + 57 => pack_64_57( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 57 / 64], + ), + 58 => pack_64_58( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 58 / 64], + ), + 59 => pack_64_59( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 59 / 64], + ), + + 60 => pack_64_60( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 60 / 64], + ), + 61 => pack_64_61( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 61 / 64], + ), + 62 => pack_64_62( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 62 / 64], + ), + 63 => pack_64_63( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 63 / 64], + ), + 64 => pack_64_64( + array_ref![input, 0, 1024], + array_mut_ref![output, 0, 1024 * 64 / 64], + 
), + + _ => unreachable!("Unsupported width: {}", width), + } + } + + unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]) { + let packed_len = 128 * width / size_of::(); + debug_assert_eq!( + input.len(), + packed_len, + "Input buffer must be of size 1024 * W / T" + ); + debug_assert_eq!(output.len(), 1024, "Output buffer must be of size 1024"); + debug_assert!( + width <= Self::T, + "Width must be less than or equal to {}", + Self::T + ); + + match width { + 1 => unpack_64_1( + array_ref![input, 0, 1024 / 64], + array_mut_ref![output, 0, 1024], + ), + 2 => unpack_64_2( + array_ref![input, 0, 1024 * 2 / 64], + array_mut_ref![output, 0, 1024], + ), + 3 => unpack_64_3( + array_ref![input, 0, 1024 * 3 / 64], + array_mut_ref![output, 0, 1024], + ), + 4 => unpack_64_4( + array_ref![input, 0, 1024 * 4 / 64], + array_mut_ref![output, 0, 1024], + ), + 5 => unpack_64_5( + array_ref![input, 0, 1024 * 5 / 64], + array_mut_ref![output, 0, 1024], + ), + 6 => unpack_64_6( + array_ref![input, 0, 1024 * 6 / 64], + array_mut_ref![output, 0, 1024], + ), + 7 => unpack_64_7( + array_ref![input, 0, 1024 * 7 / 64], + array_mut_ref![output, 0, 1024], + ), + 8 => unpack_64_8( + array_ref![input, 0, 1024 * 8 / 64], + array_mut_ref![output, 0, 1024], + ), + 9 => unpack_64_9( + array_ref![input, 0, 1024 * 9 / 64], + array_mut_ref![output, 0, 1024], + ), + + 10 => unpack_64_10( + array_ref![input, 0, 1024 * 10 / 64], + array_mut_ref![output, 0, 1024], + ), + 11 => unpack_64_11( + array_ref![input, 0, 1024 * 11 / 64], + array_mut_ref![output, 0, 1024], + ), + 12 => unpack_64_12( + array_ref![input, 0, 1024 * 12 / 64], + array_mut_ref![output, 0, 1024], + ), + 13 => unpack_64_13( + array_ref![input, 0, 1024 * 13 / 64], + array_mut_ref![output, 0, 1024], + ), + 14 => unpack_64_14( + array_ref![input, 0, 1024 * 14 / 64], + array_mut_ref![output, 0, 1024], + ), + 15 => unpack_64_15( + array_ref![input, 0, 1024 * 15 / 64], + array_mut_ref![output, 0, 1024], + ), + 16 => unpack_64_16( + array_ref![input, 0, 1024 * 16 / 64], + array_mut_ref![output, 0, 1024], + ), + 17 => unpack_64_17( + array_ref![input, 0, 1024 * 17 / 64], + array_mut_ref![output, 0, 1024], + ), + 18 => unpack_64_18( + array_ref![input, 0, 1024 * 18 / 64], + array_mut_ref![output, 0, 1024], + ), + 19 => unpack_64_19( + array_ref![input, 0, 1024 * 19 / 64], + array_mut_ref![output, 0, 1024], + ), + + 20 => unpack_64_20( + array_ref![input, 0, 1024 * 20 / 64], + array_mut_ref![output, 0, 1024], + ), + 21 => unpack_64_21( + array_ref![input, 0, 1024 * 21 / 64], + array_mut_ref![output, 0, 1024], + ), + 22 => unpack_64_22( + array_ref![input, 0, 1024 * 22 / 64], + array_mut_ref![output, 0, 1024], + ), + 23 => unpack_64_23( + array_ref![input, 0, 1024 * 23 / 64], + array_mut_ref![output, 0, 1024], + ), + 24 => unpack_64_24( + array_ref![input, 0, 1024 * 24 / 64], + array_mut_ref![output, 0, 1024], + ), + 25 => unpack_64_25( + array_ref![input, 0, 1024 * 25 / 64], + array_mut_ref![output, 0, 1024], + ), + 26 => unpack_64_26( + array_ref![input, 0, 1024 * 26 / 64], + array_mut_ref![output, 0, 1024], + ), + 27 => unpack_64_27( + array_ref![input, 0, 1024 * 27 / 64], + array_mut_ref![output, 0, 1024], + ), + 28 => unpack_64_28( + array_ref![input, 0, 1024 * 28 / 64], + array_mut_ref![output, 0, 1024], + ), + 29 => unpack_64_29( + array_ref![input, 0, 1024 * 29 / 64], + array_mut_ref![output, 0, 1024], + ), + + 30 => unpack_64_30( + array_ref![input, 0, 1024 * 30 / 64], + array_mut_ref![output, 0, 1024], + ), + 31 => unpack_64_31( + 
array_ref![input, 0, 1024 * 31 / 64], + array_mut_ref![output, 0, 1024], + ), + 32 => unpack_64_32( + array_ref![input, 0, 1024 * 32 / 64], + array_mut_ref![output, 0, 1024], + ), + 33 => unpack_64_33( + array_ref![input, 0, 1024 * 33 / 64], + array_mut_ref![output, 0, 1024], + ), + 34 => unpack_64_34( + array_ref![input, 0, 1024 * 34 / 64], + array_mut_ref![output, 0, 1024], + ), + 35 => unpack_64_35( + array_ref![input, 0, 1024 * 35 / 64], + array_mut_ref![output, 0, 1024], + ), + 36 => unpack_64_36( + array_ref![input, 0, 1024 * 36 / 64], + array_mut_ref![output, 0, 1024], + ), + 37 => unpack_64_37( + array_ref![input, 0, 1024 * 37 / 64], + array_mut_ref![output, 0, 1024], + ), + 38 => unpack_64_38( + array_ref![input, 0, 1024 * 38 / 64], + array_mut_ref![output, 0, 1024], + ), + 39 => unpack_64_39( + array_ref![input, 0, 1024 * 39 / 64], + array_mut_ref![output, 0, 1024], + ), + + 40 => unpack_64_40( + array_ref![input, 0, 1024 * 40 / 64], + array_mut_ref![output, 0, 1024], + ), + 41 => unpack_64_41( + array_ref![input, 0, 1024 * 41 / 64], + array_mut_ref![output, 0, 1024], + ), + 42 => unpack_64_42( + array_ref![input, 0, 1024 * 42 / 64], + array_mut_ref![output, 0, 1024], + ), + 43 => unpack_64_43( + array_ref![input, 0, 1024 * 43 / 64], + array_mut_ref![output, 0, 1024], + ), + 44 => unpack_64_44( + array_ref![input, 0, 1024 * 44 / 64], + array_mut_ref![output, 0, 1024], + ), + 45 => unpack_64_45( + array_ref![input, 0, 1024 * 45 / 64], + array_mut_ref![output, 0, 1024], + ), + 46 => unpack_64_46( + array_ref![input, 0, 1024 * 46 / 64], + array_mut_ref![output, 0, 1024], + ), + 47 => unpack_64_47( + array_ref![input, 0, 1024 * 47 / 64], + array_mut_ref![output, 0, 1024], + ), + 48 => unpack_64_48( + array_ref![input, 0, 1024 * 48 / 64], + array_mut_ref![output, 0, 1024], + ), + 49 => unpack_64_49( + array_ref![input, 0, 1024 * 49 / 64], + array_mut_ref![output, 0, 1024], + ), + + 50 => unpack_64_50( + array_ref![input, 0, 1024 * 50 / 64], + array_mut_ref![output, 0, 1024], + ), + 51 => unpack_64_51( + array_ref![input, 0, 1024 * 51 / 64], + array_mut_ref![output, 0, 1024], + ), + 52 => unpack_64_52( + array_ref![input, 0, 1024 * 52 / 64], + array_mut_ref![output, 0, 1024], + ), + 53 => unpack_64_53( + array_ref![input, 0, 1024 * 53 / 64], + array_mut_ref![output, 0, 1024], + ), + 54 => unpack_64_54( + array_ref![input, 0, 1024 * 54 / 64], + array_mut_ref![output, 0, 1024], + ), + 55 => unpack_64_55( + array_ref![input, 0, 1024 * 55 / 64], + array_mut_ref![output, 0, 1024], + ), + 56 => unpack_64_56( + array_ref![input, 0, 1024 * 56 / 64], + array_mut_ref![output, 0, 1024], + ), + 57 => unpack_64_57( + array_ref![input, 0, 1024 * 57 / 64], + array_mut_ref![output, 0, 1024], + ), + 58 => unpack_64_58( + array_ref![input, 0, 1024 * 58 / 64], + array_mut_ref![output, 0, 1024], + ), + 59 => unpack_64_59( + array_ref![input, 0, 1024 * 59 / 64], + array_mut_ref![output, 0, 1024], + ), + + 60 => unpack_64_60( + array_ref![input, 0, 1024 * 60 / 64], + array_mut_ref![output, 0, 1024], + ), + 61 => unpack_64_61( + array_ref![input, 0, 1024 * 61 / 64], + array_mut_ref![output, 0, 1024], + ), + 62 => unpack_64_62( + array_ref![input, 0, 1024 * 62 / 64], + array_mut_ref![output, 0, 1024], + ), + 63 => unpack_64_63( + array_ref![input, 0, 1024 * 63 / 64], + array_mut_ref![output, 0, 1024], + ), + 64 => unpack_64_64( + array_ref![input, 0, 1024 * 64 / 64], + array_mut_ref![output, 0, 1024], + ), + + _ => unreachable!("Unsupported width: {}", width), + } + } +} + +macro_rules! 
unpack_8 { + ($name:ident, $bits:expr) => { + fn $name(input: &[u8; 1024 * $bits / u8::T], output: &mut [u8; 1024]) { + for lane in 0..u8::LANES { + unpack!(u8, $bits, input, lane, |$idx, $elem| { + output[$idx] = $elem; + }); + } + } + }; +} + +unpack_8!(unpack_8_1, 1); +unpack_8!(unpack_8_2, 2); +unpack_8!(unpack_8_3, 3); +unpack_8!(unpack_8_4, 4); +unpack_8!(unpack_8_5, 5); +unpack_8!(unpack_8_6, 6); +unpack_8!(unpack_8_7, 7); +unpack_8!(unpack_8_8, 8); + +macro_rules! pack_8 { + ($name:ident, $bits:expr) => { + fn $name(input: &[u8; 1024], output: &mut [u8; 1024 * $bits / u8::T]) { + for lane in 0..u8::LANES { + pack!(u8, $bits, output, lane, |$idx| { input[$idx] }); + } + } + }; +} +pack_8!(pack_8_1, 1); +pack_8!(pack_8_2, 2); +pack_8!(pack_8_3, 3); +pack_8!(pack_8_4, 4); +pack_8!(pack_8_5, 5); +pack_8!(pack_8_6, 6); +pack_8!(pack_8_7, 7); +pack_8!(pack_8_8, 8); + +macro_rules! unpack_16 { + ($name:ident, $bits:expr) => { + fn $name(input: &[u16; 1024 * $bits / u16::T], output: &mut [u16; 1024]) { + for lane in 0..u16::LANES { + unpack!(u16, $bits, input, lane, |$idx, $elem| { + output[$idx] = $elem; + }); + } + } + }; +} + +unpack_16!(unpack_16_1, 1); +unpack_16!(unpack_16_2, 2); +unpack_16!(unpack_16_3, 3); +unpack_16!(unpack_16_4, 4); +unpack_16!(unpack_16_5, 5); +unpack_16!(unpack_16_6, 6); +unpack_16!(unpack_16_7, 7); +unpack_16!(unpack_16_8, 8); +unpack_16!(unpack_16_9, 9); +unpack_16!(unpack_16_10, 10); +unpack_16!(unpack_16_11, 11); +unpack_16!(unpack_16_12, 12); +unpack_16!(unpack_16_13, 13); +unpack_16!(unpack_16_14, 14); +unpack_16!(unpack_16_15, 15); +unpack_16!(unpack_16_16, 16); + +macro_rules! pack_16 { + ($name:ident, $bits:expr) => { + fn $name(input: &[u16; 1024], output: &mut [u16; 1024 * $bits / u16::T]) { + for lane in 0..u16::LANES { + pack!(u16, $bits, output, lane, |$idx| { input[$idx] }); + } + } + }; +} + +pack_16!(pack_16_1, 1); +pack_16!(pack_16_2, 2); +pack_16!(pack_16_3, 3); +pack_16!(pack_16_4, 4); +pack_16!(pack_16_5, 5); +pack_16!(pack_16_6, 6); +pack_16!(pack_16_7, 7); +pack_16!(pack_16_8, 8); +pack_16!(pack_16_9, 9); +pack_16!(pack_16_10, 10); +pack_16!(pack_16_11, 11); +pack_16!(pack_16_12, 12); +pack_16!(pack_16_13, 13); +pack_16!(pack_16_14, 14); +pack_16!(pack_16_15, 15); +pack_16!(pack_16_16, 16); + +macro_rules! unpack_32 { + ($name:ident, $bit_width:expr) => { + fn $name(input: &[u32; 1024 * $bit_width / u32::T], output: &mut [u32; 1024]) { + for lane in 0..u32::LANES { + unpack!(u32, $bit_width, input, lane, |$idx, $elem| { + output[$idx] = $elem + }); + } + } + }; +} + +unpack_32!(unpack_32_1, 1); +unpack_32!(unpack_32_2, 2); +unpack_32!(unpack_32_3, 3); +unpack_32!(unpack_32_4, 4); +unpack_32!(unpack_32_5, 5); +unpack_32!(unpack_32_6, 6); +unpack_32!(unpack_32_7, 7); +unpack_32!(unpack_32_8, 8); +unpack_32!(unpack_32_9, 9); +unpack_32!(unpack_32_10, 10); +unpack_32!(unpack_32_11, 11); +unpack_32!(unpack_32_12, 12); +unpack_32!(unpack_32_13, 13); +unpack_32!(unpack_32_14, 14); +unpack_32!(unpack_32_15, 15); +unpack_32!(unpack_32_16, 16); +unpack_32!(unpack_32_17, 17); +unpack_32!(unpack_32_18, 18); +unpack_32!(unpack_32_19, 19); +unpack_32!(unpack_32_20, 20); +unpack_32!(unpack_32_21, 21); +unpack_32!(unpack_32_22, 22); +unpack_32!(unpack_32_23, 23); +unpack_32!(unpack_32_24, 24); +unpack_32!(unpack_32_25, 25); +unpack_32!(unpack_32_26, 26); +unpack_32!(unpack_32_27, 27); +unpack_32!(unpack_32_28, 28); +unpack_32!(unpack_32_29, 29); +unpack_32!(unpack_32_30, 30); +unpack_32!(unpack_32_31, 31); +unpack_32!(unpack_32_32, 32); + +macro_rules! 
pack_32 { + ($name:ident, $bits:expr) => { + fn $name(input: &[u32; 1024], output: &mut [u32; 1024 * $bits / u32::BITS as usize]) { + for lane in 0..u32::LANES { + pack!(u32, $bits, output, lane, |$idx| { input[$idx] }); + } + } + }; +} + +pack_32!(pack_32_1, 1); +pack_32!(pack_32_2, 2); +pack_32!(pack_32_3, 3); +pack_32!(pack_32_4, 4); +pack_32!(pack_32_5, 5); +pack_32!(pack_32_6, 6); +pack_32!(pack_32_7, 7); +pack_32!(pack_32_8, 8); +pack_32!(pack_32_9, 9); +pack_32!(pack_32_10, 10); +pack_32!(pack_32_11, 11); +pack_32!(pack_32_12, 12); +pack_32!(pack_32_13, 13); +pack_32!(pack_32_14, 14); +pack_32!(pack_32_15, 15); +pack_32!(pack_32_16, 16); +pack_32!(pack_32_17, 17); +pack_32!(pack_32_18, 18); +pack_32!(pack_32_19, 19); +pack_32!(pack_32_20, 20); +pack_32!(pack_32_21, 21); +pack_32!(pack_32_22, 22); +pack_32!(pack_32_23, 23); +pack_32!(pack_32_24, 24); +pack_32!(pack_32_25, 25); +pack_32!(pack_32_26, 26); +pack_32!(pack_32_27, 27); +pack_32!(pack_32_28, 28); +pack_32!(pack_32_29, 29); +pack_32!(pack_32_30, 30); +pack_32!(pack_32_31, 31); +pack_32!(pack_32_32, 32); + +macro_rules! unpack_64 { + ($name:ident, $bit_width:expr) => { + fn $name(input: &[u64; 1024 * $bit_width / u64::T], output: &mut [u64; 1024]) { + for lane in 0..u64::LANES { + unpack!(u64, $bit_width, input, lane, |$idx, $elem| { + output[$idx] = $elem + }); + } + } + }; +} + +unpack_64!(unpack_64_1, 1); +unpack_64!(unpack_64_2, 2); +unpack_64!(unpack_64_3, 3); +unpack_64!(unpack_64_4, 4); +unpack_64!(unpack_64_5, 5); +unpack_64!(unpack_64_6, 6); +unpack_64!(unpack_64_7, 7); +unpack_64!(unpack_64_8, 8); +unpack_64!(unpack_64_9, 9); +unpack_64!(unpack_64_10, 10); +unpack_64!(unpack_64_11, 11); +unpack_64!(unpack_64_12, 12); +unpack_64!(unpack_64_13, 13); +unpack_64!(unpack_64_14, 14); +unpack_64!(unpack_64_15, 15); +unpack_64!(unpack_64_16, 16); +unpack_64!(unpack_64_17, 17); +unpack_64!(unpack_64_18, 18); +unpack_64!(unpack_64_19, 19); +unpack_64!(unpack_64_20, 20); +unpack_64!(unpack_64_21, 21); +unpack_64!(unpack_64_22, 22); +unpack_64!(unpack_64_23, 23); +unpack_64!(unpack_64_24, 24); +unpack_64!(unpack_64_25, 25); +unpack_64!(unpack_64_26, 26); +unpack_64!(unpack_64_27, 27); +unpack_64!(unpack_64_28, 28); +unpack_64!(unpack_64_29, 29); +unpack_64!(unpack_64_30, 30); +unpack_64!(unpack_64_31, 31); +unpack_64!(unpack_64_32, 32); + +unpack_64!(unpack_64_33, 33); +unpack_64!(unpack_64_34, 34); +unpack_64!(unpack_64_35, 35); +unpack_64!(unpack_64_36, 36); +unpack_64!(unpack_64_37, 37); +unpack_64!(unpack_64_38, 38); +unpack_64!(unpack_64_39, 39); +unpack_64!(unpack_64_40, 40); +unpack_64!(unpack_64_41, 41); +unpack_64!(unpack_64_42, 42); +unpack_64!(unpack_64_43, 43); +unpack_64!(unpack_64_44, 44); +unpack_64!(unpack_64_45, 45); +unpack_64!(unpack_64_46, 46); +unpack_64!(unpack_64_47, 47); +unpack_64!(unpack_64_48, 48); +unpack_64!(unpack_64_49, 49); +unpack_64!(unpack_64_50, 50); +unpack_64!(unpack_64_51, 51); +unpack_64!(unpack_64_52, 52); +unpack_64!(unpack_64_53, 53); +unpack_64!(unpack_64_54, 54); +unpack_64!(unpack_64_55, 55); +unpack_64!(unpack_64_56, 56); +unpack_64!(unpack_64_57, 57); +unpack_64!(unpack_64_58, 58); +unpack_64!(unpack_64_59, 59); +unpack_64!(unpack_64_60, 60); +unpack_64!(unpack_64_61, 61); +unpack_64!(unpack_64_62, 62); +unpack_64!(unpack_64_63, 63); +unpack_64!(unpack_64_64, 64); + +macro_rules! 
pack_64 { + ($name:ident, $bits:expr) => { + fn $name(input: &[u64; 1024], output: &mut [u64; 1024 * $bits / u64::BITS as usize]) { + for lane in 0..u64::LANES { + pack!(u64, $bits, output, lane, |$idx| { input[$idx] }); + } + } + }; +} + +pack_64!(pack_64_1, 1); +pack_64!(pack_64_2, 2); +pack_64!(pack_64_3, 3); +pack_64!(pack_64_4, 4); +pack_64!(pack_64_5, 5); +pack_64!(pack_64_6, 6); +pack_64!(pack_64_7, 7); +pack_64!(pack_64_8, 8); +pack_64!(pack_64_9, 9); +pack_64!(pack_64_10, 10); +pack_64!(pack_64_11, 11); +pack_64!(pack_64_12, 12); +pack_64!(pack_64_13, 13); +pack_64!(pack_64_14, 14); +pack_64!(pack_64_15, 15); +pack_64!(pack_64_16, 16); +pack_64!(pack_64_17, 17); +pack_64!(pack_64_18, 18); +pack_64!(pack_64_19, 19); +pack_64!(pack_64_20, 20); +pack_64!(pack_64_21, 21); +pack_64!(pack_64_22, 22); +pack_64!(pack_64_23, 23); +pack_64!(pack_64_24, 24); +pack_64!(pack_64_25, 25); +pack_64!(pack_64_26, 26); +pack_64!(pack_64_27, 27); +pack_64!(pack_64_28, 28); +pack_64!(pack_64_29, 29); +pack_64!(pack_64_30, 30); +pack_64!(pack_64_31, 31); +pack_64!(pack_64_32, 32); + +pack_64!(pack_64_33, 33); +pack_64!(pack_64_34, 34); +pack_64!(pack_64_35, 35); +pack_64!(pack_64_36, 36); +pack_64!(pack_64_37, 37); +pack_64!(pack_64_38, 38); +pack_64!(pack_64_39, 39); +pack_64!(pack_64_40, 40); +pack_64!(pack_64_41, 41); +pack_64!(pack_64_42, 42); +pack_64!(pack_64_43, 43); +pack_64!(pack_64_44, 44); +pack_64!(pack_64_45, 45); +pack_64!(pack_64_46, 46); +pack_64!(pack_64_47, 47); +pack_64!(pack_64_48, 48); +pack_64!(pack_64_49, 49); +pack_64!(pack_64_50, 50); +pack_64!(pack_64_51, 51); +pack_64!(pack_64_52, 52); +pack_64!(pack_64_53, 53); +pack_64!(pack_64_54, 54); +pack_64!(pack_64_55, 55); +pack_64!(pack_64_56, 56); +pack_64!(pack_64_57, 57); +pack_64!(pack_64_58, 58); +pack_64!(pack_64_59, 59); +pack_64!(pack_64_60, 60); +pack_64!(pack_64_61, 61); +pack_64!(pack_64_62, 62); +pack_64!(pack_64_63, 63); +pack_64!(pack_64_64, 64); + +#[cfg(test)] +mod test { + use super::*; + use core::array; + // a fast random number generator + pub struct XorShift { + state: u64, + } + + impl XorShift { + pub fn new(seed: u64) -> Self { + Self { state: seed } + } + + pub fn next(&mut self) -> u64 { + let mut x = self.state; + x ^= x << 13; + x ^= x >> 7; + x ^= x << 17; + self.state = x; + x + } + } + + // a macro version of this function generalize u8, u16, u32, u64 takes very long time for a test build, so I + // write it for each type separately + fn pack_unpack_u8(bit_width: usize) { + let mut values: [u8; 1024] = [0; 1024]; + let mut rng = XorShift::new(123456789); + for value in &mut values { + *value = (rng.next() % (1 << bit_width)) as u8; + } + + let mut packed = vec![0; 1024 * bit_width / 8]; + for lane in 0..u8::LANES { + // Always loop over lanes first. This is what the compiler vectorizes. + pack!(u8, bit_width, packed, lane, |$pos| { + values[$pos] + }); + } + + let mut unpacked: [u8; 1024] = [0; 1024]; + for lane in 0..u8::LANES { + // Always loop over lanes first. This is what the compiler vectorizes. + unpack!(u8, bit_width, packed, lane, |$idx, $elem| { + unpacked[$idx] = $elem; + }); + } + + assert_eq!(values, unpacked); + } + + fn pack_unpack_u16(bit_width: usize) { + let mut values: [u16; 1024] = [0; 1024]; + let mut rng = XorShift::new(123456789); + for value in &mut values { + *value = (rng.next() % (1 << bit_width)) as u16; + } + + let mut packed = vec![0; 1024 * bit_width / 16]; + for lane in 0..u16::LANES { + // Always loop over lanes first. This is what the compiler vectorizes. 
+ pack!(u16, bit_width, packed, lane, |$pos| { + values[$pos] + }); + } + + let mut unpacked: [u16; 1024] = [0; 1024]; + for lane in 0..u16::LANES { + // Always loop over lanes first. This is what the compiler vectorizes. + unpack!(u16, bit_width, packed, lane, |$idx, $elem| { + unpacked[$idx] = $elem; + }); + } + + assert_eq!(values, unpacked); + } + + fn pack_unpack_u32(bit_width: usize) { + let mut values: [u32; 1024] = [0; 1024]; + let mut rng = XorShift::new(123456789); + for value in &mut values { + *value = (rng.next() % (1 << bit_width)) as u32; + } + + let mut packed = vec![0; 1024 * bit_width / 32]; + for lane in 0..u32::LANES { + // Always loop over lanes first. This is what the compiler vectorizes. + pack!(u32, bit_width, packed, lane, |$pos| { + values[$pos] + }); + } + + let mut unpacked: [u32; 1024] = [0; 1024]; + for lane in 0..u32::LANES { + // Always loop over lanes first. This is what the compiler vectorizes. + unpack!(u32, bit_width, packed, lane, |$idx, $elem| { + unpacked[$idx] = $elem; + }); + } + + assert_eq!(values, unpacked); + } + + fn pack_unpack_u64(bit_width: usize) { + let mut values: [u64; 1024] = [0; 1024]; + let mut rng = XorShift::new(123456789); + if bit_width == 64 { + for value in &mut values { + *value = rng.next(); + } + } else { + for value in &mut values { + *value = rng.next() % (1 << bit_width); + } + } + + let mut packed = vec![0; 1024 * bit_width / 64]; + for lane in 0..u64::LANES { + // Always loop over lanes first. This is what the compiler vectorizes. + pack!(u64, bit_width, packed, lane, |$pos| { + values[$pos] + }); + } + + let mut unpacked: [u64; 1024] = [0; 1024]; + for lane in 0..u64::LANES { + // Always loop over lanes first. This is what the compiler vectorizes. + unpack!(u64, bit_width, packed, lane, |$idx, $elem| { + unpacked[$idx] = $elem; + }); + } + + assert_eq!(values, unpacked); + } + + #[test] + fn test_pack() { + pack_unpack_u8(0); + pack_unpack_u8(1); + pack_unpack_u8(2); + pack_unpack_u8(3); + pack_unpack_u8(4); + pack_unpack_u8(5); + pack_unpack_u8(6); + pack_unpack_u8(7); + pack_unpack_u8(8); + + pack_unpack_u16(0); + pack_unpack_u16(1); + pack_unpack_u16(2); + pack_unpack_u16(3); + pack_unpack_u16(4); + pack_unpack_u16(5); + pack_unpack_u16(6); + pack_unpack_u16(7); + pack_unpack_u16(8); + pack_unpack_u16(9); + pack_unpack_u16(10); + pack_unpack_u16(11); + pack_unpack_u16(12); + pack_unpack_u16(13); + pack_unpack_u16(14); + pack_unpack_u16(15); + pack_unpack_u16(16); + + pack_unpack_u32(0); + pack_unpack_u32(1); + pack_unpack_u32(2); + pack_unpack_u32(3); + pack_unpack_u32(4); + pack_unpack_u32(5); + pack_unpack_u32(6); + pack_unpack_u32(7); + pack_unpack_u32(8); + pack_unpack_u32(9); + pack_unpack_u32(10); + pack_unpack_u32(11); + pack_unpack_u32(12); + pack_unpack_u32(13); + pack_unpack_u32(14); + pack_unpack_u32(15); + pack_unpack_u32(16); + pack_unpack_u32(17); + pack_unpack_u32(18); + pack_unpack_u32(19); + pack_unpack_u32(20); + pack_unpack_u32(21); + pack_unpack_u32(22); + pack_unpack_u32(23); + pack_unpack_u32(24); + pack_unpack_u32(25); + pack_unpack_u32(26); + pack_unpack_u32(27); + pack_unpack_u32(28); + pack_unpack_u32(29); + pack_unpack_u32(30); + pack_unpack_u32(31); + pack_unpack_u32(32); + + pack_unpack_u64(0); + pack_unpack_u64(1); + pack_unpack_u64(2); + pack_unpack_u64(3); + pack_unpack_u64(4); + pack_unpack_u64(5); + pack_unpack_u64(6); + pack_unpack_u64(7); + pack_unpack_u64(8); + pack_unpack_u64(9); + pack_unpack_u64(10); + pack_unpack_u64(11); + pack_unpack_u64(12); + pack_unpack_u64(13); + 
pack_unpack_u64(14); + pack_unpack_u64(15); + pack_unpack_u64(16); + pack_unpack_u64(17); + pack_unpack_u64(18); + pack_unpack_u64(19); + pack_unpack_u64(20); + pack_unpack_u64(21); + pack_unpack_u64(22); + pack_unpack_u64(23); + pack_unpack_u64(24); + pack_unpack_u64(25); + pack_unpack_u64(26); + pack_unpack_u64(27); + pack_unpack_u64(28); + pack_unpack_u64(29); + pack_unpack_u64(30); + pack_unpack_u64(31); + pack_unpack_u64(32); + pack_unpack_u64(33); + pack_unpack_u64(34); + pack_unpack_u64(35); + pack_unpack_u64(36); + pack_unpack_u64(37); + pack_unpack_u64(38); + pack_unpack_u64(39); + pack_unpack_u64(40); + pack_unpack_u64(41); + pack_unpack_u64(42); + pack_unpack_u64(43); + pack_unpack_u64(44); + pack_unpack_u64(45); + pack_unpack_u64(46); + pack_unpack_u64(47); + pack_unpack_u64(48); + pack_unpack_u64(49); + pack_unpack_u64(50); + pack_unpack_u64(51); + pack_unpack_u64(52); + pack_unpack_u64(53); + pack_unpack_u64(54); + pack_unpack_u64(55); + pack_unpack_u64(56); + pack_unpack_u64(57); + pack_unpack_u64(58); + pack_unpack_u64(59); + pack_unpack_u64(60); + pack_unpack_u64(61); + pack_unpack_u64(62); + pack_unpack_u64(63); + pack_unpack_u64(64); + } + + fn unchecked_pack_unpack_u8(bit_width: usize) { + let mut values = [0u8; 1024]; + let mut rng = XorShift::new(123456789); + for value in &mut values { + *value = (rng.next() % (1 << bit_width)) as u8; + } + let mut packed = vec![0; 1024 * bit_width / 8]; + unsafe { + BitPacking::unchecked_pack(bit_width, &values, &mut packed); + } + let mut output = [0; 1024]; + unsafe { BitPacking::unchecked_unpack(bit_width, &packed, &mut output) }; + assert_eq!(values, output); + } + + fn unchecked_pack_unpack_u16(bit_width: usize) { + let mut values = [0u16; 1024]; + let mut rng = XorShift::new(123456789); + for value in &mut values { + *value = (rng.next() % (1 << bit_width)) as u16; + } + let mut packed = vec![0; 1024 * bit_width / u16::T]; + unsafe { + BitPacking::unchecked_pack(bit_width, &values, &mut packed); + } + let mut output = [0; 1024]; + unsafe { BitPacking::unchecked_unpack(bit_width, &packed, &mut output) }; + assert_eq!(values, output); + } + + fn unchecked_pack_unpack_u32(bit_width: usize) { + let mut values = [0u32; 1024]; + let mut rng = XorShift::new(123456789); + for value in &mut values { + *value = (rng.next() % (1 << bit_width)) as u32; + } + let mut packed = vec![0; 1024 * bit_width / u32::T]; + unsafe { + BitPacking::unchecked_pack(bit_width, &values, &mut packed); + } + let mut output = [0; 1024]; + unsafe { BitPacking::unchecked_unpack(bit_width, &packed, &mut output) }; + assert_eq!(values, output); + } + + fn unchecked_pack_unpack_u64(bit_width: usize) { + let mut values = [0u64; 1024]; + let mut rng = XorShift::new(123456789); + if bit_width == 64 { + for value in &mut values { + *value = rng.next(); + } + } + let mut packed = vec![0; 1024 * bit_width / u64::T]; + unsafe { + BitPacking::unchecked_pack(bit_width, &values, &mut packed); + } + let mut output = [0; 1024]; + unsafe { BitPacking::unchecked_unpack(bit_width, &packed, &mut output) }; + assert_eq!(values, output); + } + + #[test] + fn test_unchecked_pack() { + let input = array::from_fn(|i| i as u32); + let mut packed = [0; 320]; + unsafe { BitPacking::unchecked_pack(10, &input, &mut packed) }; + let mut output = [0; 1024]; + unsafe { BitPacking::unchecked_unpack(10, &packed, &mut output) }; + assert_eq!(input, output); + + unchecked_pack_unpack_u8(1); + unchecked_pack_unpack_u8(2); + unchecked_pack_unpack_u8(3); + unchecked_pack_unpack_u8(4); + 
unchecked_pack_unpack_u8(5); + unchecked_pack_unpack_u8(6); + unchecked_pack_unpack_u8(7); + unchecked_pack_unpack_u8(8); + + unchecked_pack_unpack_u16(1); + unchecked_pack_unpack_u16(2); + unchecked_pack_unpack_u16(3); + unchecked_pack_unpack_u16(4); + unchecked_pack_unpack_u16(5); + unchecked_pack_unpack_u16(6); + unchecked_pack_unpack_u16(7); + unchecked_pack_unpack_u16(8); + unchecked_pack_unpack_u16(9); + unchecked_pack_unpack_u16(10); + unchecked_pack_unpack_u16(11); + unchecked_pack_unpack_u16(12); + unchecked_pack_unpack_u16(13); + unchecked_pack_unpack_u16(14); + unchecked_pack_unpack_u16(15); + unchecked_pack_unpack_u16(16); + + unchecked_pack_unpack_u32(1); + unchecked_pack_unpack_u32(2); + unchecked_pack_unpack_u32(3); + unchecked_pack_unpack_u32(4); + unchecked_pack_unpack_u32(5); + unchecked_pack_unpack_u32(6); + unchecked_pack_unpack_u32(7); + unchecked_pack_unpack_u32(8); + unchecked_pack_unpack_u32(9); + unchecked_pack_unpack_u32(10); + unchecked_pack_unpack_u32(11); + unchecked_pack_unpack_u32(12); + unchecked_pack_unpack_u32(13); + unchecked_pack_unpack_u32(14); + unchecked_pack_unpack_u32(15); + unchecked_pack_unpack_u32(16); + unchecked_pack_unpack_u32(17); + unchecked_pack_unpack_u32(18); + unchecked_pack_unpack_u32(19); + unchecked_pack_unpack_u32(20); + unchecked_pack_unpack_u32(21); + unchecked_pack_unpack_u32(22); + unchecked_pack_unpack_u32(23); + unchecked_pack_unpack_u32(24); + unchecked_pack_unpack_u32(25); + unchecked_pack_unpack_u32(26); + unchecked_pack_unpack_u32(27); + unchecked_pack_unpack_u32(28); + unchecked_pack_unpack_u32(29); + unchecked_pack_unpack_u32(30); + unchecked_pack_unpack_u32(31); + unchecked_pack_unpack_u32(32); + + unchecked_pack_unpack_u64(1); + unchecked_pack_unpack_u64(2); + unchecked_pack_unpack_u64(3); + unchecked_pack_unpack_u64(4); + unchecked_pack_unpack_u64(5); + unchecked_pack_unpack_u64(6); + unchecked_pack_unpack_u64(7); + unchecked_pack_unpack_u64(8); + unchecked_pack_unpack_u64(9); + unchecked_pack_unpack_u64(10); + unchecked_pack_unpack_u64(11); + unchecked_pack_unpack_u64(12); + unchecked_pack_unpack_u64(13); + unchecked_pack_unpack_u64(14); + unchecked_pack_unpack_u64(15); + unchecked_pack_unpack_u64(16); + unchecked_pack_unpack_u64(17); + unchecked_pack_unpack_u64(18); + unchecked_pack_unpack_u64(19); + unchecked_pack_unpack_u64(20); + unchecked_pack_unpack_u64(21); + unchecked_pack_unpack_u64(22); + unchecked_pack_unpack_u64(23); + unchecked_pack_unpack_u64(24); + unchecked_pack_unpack_u64(25); + unchecked_pack_unpack_u64(26); + unchecked_pack_unpack_u64(27); + unchecked_pack_unpack_u64(28); + unchecked_pack_unpack_u64(29); + unchecked_pack_unpack_u64(30); + unchecked_pack_unpack_u64(31); + unchecked_pack_unpack_u64(32); + unchecked_pack_unpack_u64(33); + unchecked_pack_unpack_u64(34); + unchecked_pack_unpack_u64(35); + unchecked_pack_unpack_u64(36); + unchecked_pack_unpack_u64(37); + unchecked_pack_unpack_u64(38); + unchecked_pack_unpack_u64(39); + unchecked_pack_unpack_u64(40); + unchecked_pack_unpack_u64(41); + unchecked_pack_unpack_u64(42); + unchecked_pack_unpack_u64(43); + unchecked_pack_unpack_u64(44); + unchecked_pack_unpack_u64(45); + unchecked_pack_unpack_u64(46); + unchecked_pack_unpack_u64(47); + unchecked_pack_unpack_u64(48); + unchecked_pack_unpack_u64(49); + unchecked_pack_unpack_u64(50); + unchecked_pack_unpack_u64(51); + unchecked_pack_unpack_u64(52); + unchecked_pack_unpack_u64(53); + unchecked_pack_unpack_u64(54); + unchecked_pack_unpack_u64(55); + unchecked_pack_unpack_u64(56); + 
unchecked_pack_unpack_u64(57); + unchecked_pack_unpack_u64(58); + unchecked_pack_unpack_u64(59); + unchecked_pack_unpack_u64(60); + unchecked_pack_unpack_u64(61); + unchecked_pack_unpack_u64(62); + unchecked_pack_unpack_u64(63); + unchecked_pack_unpack_u64(64); + } +} diff --git a/rust/lance-encoding/src/compression_algo/mod.rs b/rust/lance-encoding/src/compression_algo/mod.rs new file mode 100644 index 0000000000..4b133f003e --- /dev/null +++ b/rust/lance-encoding/src/compression_algo/mod.rs @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +pub mod fastlanes; diff --git a/rust/lance-encoding/src/encoder.rs b/rust/lance-encoding/src/encoder.rs index c2afe88eaa..eb264c8fac 100644 --- a/rust/lance-encoding/src/encoder.rs +++ b/rust/lance-encoding/src/encoder.rs @@ -3,7 +3,7 @@ use std::{collections::HashMap, env, sync::Arc}; use arrow::array::AsArray; -use arrow_array::{Array, ArrayRef, RecordBatch}; +use arrow_array::{Array, ArrayRef, RecordBatch, UInt8Array}; use arrow_schema::DataType; use bytes::{Bytes, BytesMut}; use futures::future::BoxFuture; @@ -14,6 +14,8 @@ use snafu::{location, Location}; use crate::buffer::LanceBuffer; use crate::data::DataBlock; use crate::encodings::logical::r#struct::StructFieldEncoder; +use crate::encodings::physical::bitpack_fastlanes::compute_compressed_bit_width_for_non_neg; +use crate::encodings::physical::bitpack_fastlanes::BitpackedForNonNegArrayEncoder; use crate::encodings::physical::block_compress::CompressionScheme; use crate::encodings::physical::dictionary::AlreadyDictionaryEncoder; use crate::encodings::physical::fsst::FsstArrayEncoder; @@ -277,7 +279,11 @@ impl CoreArrayEncodingStrategy { DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary => { if use_dict_encoding { let dict_indices_encoder = Self::choose_array_encoder( - arrays, + // We need to pass arrays to this method to figure out what kind of compression to + // use but we haven't actually calculated the indices yet. For now, we just assume + // worst case and use the full range. In the future maybe we can pass in statistics + // instead of the actual data + &[Arc::new(UInt8Array::from_iter_values(0_u8..255_u8))], &DataType::UInt8, data_size, false, @@ -343,6 +349,36 @@ impl CoreArrayEncodingStrategy { Ok(Box::new(PackedStructEncoder::new(inner_encoders))) } + DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => { + if version >= LanceFileVersion::V2_1 && arrays[0].data_type() == data_type { + let compressed_bit_width = compute_compressed_bit_width_for_non_neg(arrays); + Ok(Box::new(BitpackedForNonNegArrayEncoder::new( + compressed_bit_width as usize, + data_type.clone(), + ))) + } else { + Ok(Box::new(BasicEncoder::new(Box::new( + ValueEncoder::default(), + )))) + } + } + + // TODO: for signed integers, I intend to make it a cascaded encoding, a sparse array for the negative values and very wide(bit-width) values, + // then a bitpacked array for the narrow(bit-width) values, I need `BitpackedForNeg` to be merged first, I am + // thinking about putting this sparse array in the metadata so bitpacking remain using one page buffer only. 
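+            // Until that cascaded encoding lands, signed arrays take the same non-negative bitpacking path below: + // any negative value sets the sign bit, so compute_compressed_bit_width_for_non_neg falls back to the + // full type width (e.g. 64 bits for Int64) and the values still round-trip, just without any space savings.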
+ DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => { + if version >= LanceFileVersion::V2_1 && arrays[0].data_type() == data_type { + let compressed_bit_width = compute_compressed_bit_width_for_non_neg(arrays); + Ok(Box::new(BitpackedForNonNegArrayEncoder::new( + compressed_bit_width as usize, + data_type.clone(), + ))) + } else { + Ok(Box::new(BasicEncoder::new(Box::new( + ValueEncoder::default(), + )))) + } + } _ => Ok(Box::new(BasicEncoder::new(Box::new( ValueEncoder::default(), )))), diff --git a/rust/lance-encoding/src/encodings/physical.rs b/rust/lance-encoding/src/encodings/physical.rs index 7340305843..8d61885969 100644 --- a/rust/lance-encoding/src/encodings/physical.rs +++ b/rust/lance-encoding/src/encodings/physical.rs @@ -19,6 +19,7 @@ pub mod basic; pub mod binary; pub mod bitmap; pub mod bitpack; +pub mod bitpack_fastlanes; pub mod block_compress; pub mod dictionary; pub mod fixed_size_binary; @@ -109,6 +110,19 @@ fn get_bitpacked_buffer_decoder( )) } +fn get_bitpacked_for_non_neg_buffer_decoder( + encoding: &pb::BitpackedForNonNeg, + buffers: &PageBuffers, +) -> Box { + let (buffer_offset, _buffer_size) = get_buffer(encoding.buffer.as_ref().unwrap(), buffers); + + Box::new(bitpack_fastlanes::BitpackedForNonNegScheduler::new( + encoding.compressed_bits_per_value, + encoding.uncompressed_bits_per_value, + buffer_offset, + )) +} + /// Convert a protobuf array encoding into a physical page scheduler pub fn decoder_from_array_encoding( encoding: &pb::ArrayEncoding, @@ -252,6 +266,9 @@ pub fn decoder_from_array_encoding( buffer_offset, )) } + pb::array_encoding::ArrayEncoding::BitpackedForNonNeg(bitpacked) => { + get_bitpacked_for_non_neg_buffer_decoder(bitpacked, buffers) + } // Currently there is no way to encode struct nullability and structs are encoded with a "header" column // (that has no data). We never actually decode that column and so this branch is never actually encountered. 
// diff --git a/rust/lance-encoding/src/encodings/physical/bitpack_fastlanes.rs b/rust/lance-encoding/src/encodings/physical/bitpack_fastlanes.rs new file mode 100644 index 0000000000..c04a8b3e0a --- /dev/null +++ b/rust/lance-encoding/src/encodings/physical/bitpack_fastlanes.rs @@ -0,0 +1,1555 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::sync::Arc; + +use arrow::datatypes::{ + Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type, +}; +use arrow_array::{Array, PrimitiveArray}; +use arrow_schema::DataType; +use bytes::Bytes; +use futures::future::{BoxFuture, FutureExt}; +use log::trace; +use snafu::{location, Location}; + +use lance_arrow::DataTypeExt; +use lance_core::{Error, Result}; + +use crate::buffer::LanceBuffer; +use crate::compression_algo::fastlanes::BitPacking; +use crate::data::{DataBlock, FixedWidthDataBlock, NullableDataBlock}; +use crate::decoder::{PageScheduler, PrimitivePageDecoder}; +use crate::encoder::{ArrayEncoder, EncodedArray}; +use crate::format::ProtobufUtils; +use arrow::array::ArrayRef; +use bytemuck::cast_slice; + +const ELEMS_PER_CHUNK: u64 = 1024; + +// Compute the compressed_bit_width for a given array of integers +// todo: compute all statistics before encoding +// todo: see how to use rust macro to rewrite this function +pub fn compute_compressed_bit_width_for_non_neg(arrays: &[ArrayRef]) -> u64 { + debug_assert!(!arrays.is_empty()); + + let res; + + match arrays[0].data_type() { + DataType::UInt8 => { + let mut global_max: u8 = 0; + for array in arrays { + let primitive_array = array + .as_any() + .downcast_ref::<PrimitiveArray<UInt8Type>>() + .unwrap(); + let array_max = arrow::compute::bit_or(primitive_array); + global_max = global_max.max(array_max.unwrap_or(0)); + } + let num_bits = + arrays[0].data_type().byte_width() as u64 * 8 - global_max.leading_zeros() as u64; + // we will have constant encoding later + if num_bits == 0 { + res = 1; + } else { + res = num_bits; + } + } + + DataType::Int8 => { + let mut global_max_width: u64 = 0; + for array in arrays { + let primitive_array = array + .as_any() + .downcast_ref::<PrimitiveArray<Int8Type>>() + .unwrap(); + let array_max_width = arrow::compute::bit_or(primitive_array).unwrap_or(0); + global_max_width = global_max_width.max(8 - array_max_width.leading_zeros() as u64); + } + if global_max_width == 0 { + res = 1; + } else { + res = global_max_width; + } + } + + DataType::UInt16 => { + let mut global_max: u16 = 0; + for array in arrays { + let primitive_array = array + .as_any() + .downcast_ref::<PrimitiveArray<UInt16Type>>() + .unwrap(); + let array_max = arrow::compute::bit_or(primitive_array).unwrap_or(0); + global_max = global_max.max(array_max); + } + let num_bits = + arrays[0].data_type().byte_width() as u64 * 8 - global_max.leading_zeros() as u64; + if num_bits == 0 { + res = 1; + } else { + res = num_bits; + } + } + + DataType::Int16 => { + let mut global_max_width: u64 = 0; + for array in arrays { + let primitive_array = array + .as_any() + .downcast_ref::<PrimitiveArray<Int16Type>>() + .unwrap(); + let array_max_width = arrow::compute::bit_or(primitive_array).unwrap_or(0); + global_max_width = + global_max_width.max(16 - array_max_width.leading_zeros() as u64); + } + if global_max_width == 0 { + res = 1; + } else { + res = global_max_width; + } + } + + DataType::UInt32 => { + let mut global_max: u32 = 0; + for array in arrays { + let primitive_array = array + .as_any() + .downcast_ref::<PrimitiveArray<UInt32Type>>() + .unwrap(); + let array_max = arrow::compute::bit_or(primitive_array).unwrap_or(0); + global_max =
global_max.max(array_max); + } + let num_bits = + arrays[0].data_type().byte_width() as u64 * 8 - global_max.leading_zeros() as u64; + if num_bits == 0 { + res = 1; + } else { + res = num_bits; + } + } + + DataType::Int32 => { + let mut global_max_width: u64 = 0; + for array in arrays { + let primitive_array = array + .as_any() + .downcast_ref::<PrimitiveArray<Int32Type>>() + .unwrap(); + let array_max_width = arrow::compute::bit_or(primitive_array).unwrap_or(0); + global_max_width = + global_max_width.max(32 - array_max_width.leading_zeros() as u64); + } + if global_max_width == 0 { + res = 1; + } else { + res = global_max_width; + } + } + + DataType::UInt64 => { + let mut global_max: u64 = 0; + for array in arrays { + let primitive_array = array + .as_any() + .downcast_ref::<PrimitiveArray<UInt64Type>>() + .unwrap(); + let array_max = arrow::compute::bit_or(primitive_array).unwrap_or(0); + global_max = global_max.max(array_max); + } + let num_bits = + arrays[0].data_type().byte_width() as u64 * 8 - global_max.leading_zeros() as u64; + if num_bits == 0 { + res = 1; + } else { + res = num_bits; + } + } + + DataType::Int64 => { + let mut global_max_width: u64 = 0; + for array in arrays { + let primitive_array = array + .as_any() + .downcast_ref::<PrimitiveArray<Int64Type>>() + .unwrap(); + let array_max_width = arrow::compute::bit_or(primitive_array).unwrap_or(0); + global_max_width = + global_max_width.max(64 - array_max_width.leading_zeros() as u64); + } + if global_max_width == 0 { + res = 1; + } else { + res = global_max_width; + } + } + _ => { + panic!("BitpackedForNonNegArrayEncoder only supports data types of UInt8, Int8, UInt16, Int16, UInt32, Int32, UInt64, Int64"); + } + }; + res +} + +// Bitpack integers using the fastlanes algorithm. The input is sliced into chunks of 1024 integers and bitpacked +// chunk by chunk. When the input is not a multiple of 1024, the last chunk is padded with zeros; this is fine because +// we also know the number of rows we have. +// Here `self` is a borrow of BitpackedForNonNegArrayEncoder, `unpacked` is a mutable borrow of FixedWidthDataBlock, and +// `data_type` can be one of u8, u16, u32, or u64. +// `buffer_index` is a mutable borrow of u32, indicating the buffer index of the output EncodedArray. +// The macro outputs a fastlanes-bitpacked EncodedArray. +macro_rules! encode_fixed_width { + ($self:expr, $unpacked:expr, $data_type:ty, $buffer_index:expr) => {{ + let num_chunks = ($unpacked.num_values + ELEMS_PER_CHUNK - 1) / ELEMS_PER_CHUNK; + let num_full_chunks = $unpacked.num_values / ELEMS_PER_CHUNK; + let uncompressed_bit_width = std::mem::size_of::<$data_type>() as u64 * 8; + + // the output vector type is the same as the input type, for example, when input is u16, output is Vec<u16> + let packed_chunk_size = 1024 * $self.compressed_bit_width as usize / uncompressed_bit_width as usize; + + let input_slice = $unpacked.data.borrow_to_typed_slice::<$data_type>(); + let input = input_slice.as_ref(); + + let mut output = Vec::with_capacity(num_chunks as usize * packed_chunk_size); + + // Loop over all but the last chunk.
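+        // For example, with u32 values and a compressed bit width of 10, each chunk of 1024 values packs into + // 1024 * 10 / 32 = 320 u32 words (the same 320-element buffer used by `test_unchecked_pack` in fastlanes.rs). + // The `set_len` calls below extend `output` into the capacity reserved above so `unchecked_pack` can write + // the packed words in place.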
+ (0..num_full_chunks).for_each(|i| { + let start_elem = (i * ELEMS_PER_CHUNK) as usize; + + let output_len = output.len(); + unsafe { + output.set_len(output_len + packed_chunk_size); + BitPacking::unchecked_pack( + $self.compressed_bit_width, + &input[start_elem..][..ELEMS_PER_CHUNK as usize], + &mut output[output_len..][..packed_chunk_size], + ); + } + }); + + if num_chunks != num_full_chunks { + let last_chunk_elem_num = $unpacked.num_values % ELEMS_PER_CHUNK; + let mut last_chunk = vec![0 as $data_type; ELEMS_PER_CHUNK as usize]; + last_chunk[..last_chunk_elem_num as usize].clone_from_slice( + &input[$unpacked.num_values as usize - last_chunk_elem_num as usize..], + ); + + let output_len = output.len(); + unsafe { + output.set_len(output_len + packed_chunk_size); + BitPacking::unchecked_pack( + $self.compressed_bit_width, + &last_chunk, + &mut output[output_len..][..packed_chunk_size], + ); + } + } + + let bitpacked_for_non_neg_buffer_index = *$buffer_index; + *$buffer_index += 1; + + let encoding = ProtobufUtils::bitpacked_for_non_neg_encoding( + $self.compressed_bit_width as u64, + uncompressed_bit_width, + bitpacked_for_non_neg_buffer_index, + ); + let packed = DataBlock::FixedWidth(FixedWidthDataBlock { + bits_per_value: $self.compressed_bit_width as u64, + data: LanceBuffer::reinterpret_vec(output), + num_values: $unpacked.num_values, + }); + + Result::Ok(EncodedArray { + data: packed, + encoding, + }) + }}; +} + +#[derive(Debug)] +pub struct BitpackedForNonNegArrayEncoder { + pub compressed_bit_width: usize, + pub original_data_type: DataType, +} + +impl BitpackedForNonNegArrayEncoder { + pub fn new(compressed_bit_width: usize, data_type: DataType) -> Self { + Self { + compressed_bit_width, + original_data_type: data_type, + } + } +} + +impl ArrayEncoder for BitpackedForNonNegArrayEncoder { + fn encode( + &self, + data: DataBlock, + data_type: &DataType, + buffer_index: &mut u32, + ) -> Result { + match data { + DataBlock::AllNull(_) => { + let encoding = ProtobufUtils::basic_all_null_encoding(); + Ok(EncodedArray { data, encoding }) + } + DataBlock::FixedWidth(mut unpacked) => { + match data_type { + DataType::UInt8 | DataType::Int8 => encode_fixed_width!(self, unpacked, u8, buffer_index), + DataType::UInt16 | DataType::Int16 => encode_fixed_width!(self, unpacked, u16, buffer_index), + DataType::UInt32 | DataType::Int32 => encode_fixed_width!(self, unpacked, u32, buffer_index), + DataType::UInt64 | DataType::Int64 => encode_fixed_width!(self, unpacked, u64, buffer_index), + _ => unreachable!("BitpackedForNonNegArrayEncoder only supports data types of UInt8, Int8, UInt16, Int16, UInt32, Int32, UInt64, Int64"), + } + } + DataBlock::Nullable(nullable) => { + let validity_buffer_index = *buffer_index; + *buffer_index += 1; + + let validity_desc = ProtobufUtils::flat_encoding( + 1, + validity_buffer_index, + /*compression=*/ None, + ); + let encoded_values: EncodedArray; + match *nullable.data { + DataBlock::FixedWidth(mut unpacked) => { + match data_type { + DataType::UInt8 | DataType::Int8 => encoded_values = encode_fixed_width!(self, unpacked, u8, buffer_index)?, + DataType::UInt16 | DataType::Int16 => encoded_values = encode_fixed_width!(self, unpacked, u16, buffer_index)?, + DataType::UInt32 | DataType::Int32 => encoded_values = encode_fixed_width!(self, unpacked, u32, buffer_index)?, + DataType::UInt64 | DataType::Int64 => encoded_values = encode_fixed_width!(self, unpacked, u64, buffer_index)?, + _ => unreachable!("BitpackedForNonNegArrayEncoder only supports data types of 
UInt8, Int8, UInt16, Int16, UInt32, Int32, UInt64, Int64"), + } + } + _ => { + return Err(Error::InvalidInput { + source: "Bitpacking only supports fixed width data blocks or a nullable data block with fixed width data block inside or an all-null data block".into(), + location: location!(), + }); + } + } + let encoding = + ProtobufUtils::basic_some_null_encoding(validity_desc, encoded_values.encoding); + let encoded = DataBlock::Nullable(NullableDataBlock { + data: Box::new(encoded_values.data), + nulls: nullable.nulls, + }); + Ok(EncodedArray { + data: encoded, + encoding, + }) + } + _ => { + Err(Error::InvalidInput { + source: "Bitpacking only supports fixed width data blocks or a nullable data block with fixed width data block inside or an all-null data block".into(), + location: location!(), + }) + } + } + } +} + +#[derive(Debug)] +pub struct BitpackedForNonNegScheduler { + compressed_bit_width: u64, + uncompressed_bits_per_value: u64, + buffer_offset: u64, +} + +impl BitpackedForNonNegScheduler { + pub fn new( + compressed_bit_width: u64, + uncompressed_bits_per_value: u64, + buffer_offset: u64, + ) -> Self { + Self { + compressed_bit_width, + uncompressed_bits_per_value, + buffer_offset, + } + } + + fn locate_chunk_start(&self, relative_row_num: u64) -> u64 { + let chunk_size = ELEMS_PER_CHUNK * self.compressed_bit_width / 8; + self.buffer_offset + (relative_row_num / ELEMS_PER_CHUNK * chunk_size) + } + + fn locate_chunk_end(&self, relative_row_num: u64) -> u64 { + let chunk_size = ELEMS_PER_CHUNK * self.compressed_bit_width / 8; + self.buffer_offset + (relative_row_num / ELEMS_PER_CHUNK * chunk_size) + chunk_size + } +} + +impl PageScheduler for BitpackedForNonNegScheduler { + fn schedule_ranges( + &self, + ranges: &[std::ops::Range<u64>], + scheduler: &Arc<dyn EncodingsIo>, + top_level_row: u64, + ) -> BoxFuture<'static, Result<Box<dyn PrimitivePageDecoder>>> { + assert!(!ranges.is_empty()); + + let mut byte_ranges = vec![]; + + // map each coalesced byte range to the row ranges it covers; every byte range corresponds to at least one row range + let mut bytes_idx_to_range_indices = vec![]; + let first_byte_range = std::ops::Range { + start: self.locate_chunk_start(ranges[0].start), + end: self.locate_chunk_end(ranges[0].end - 1), + }; // the ranges are half-open + byte_ranges.push(first_byte_range); + bytes_idx_to_range_indices.push(vec![ranges[0].clone()]); + + for (i, range) in ranges.iter().enumerate().skip(1) { + let this_start = self.locate_chunk_start(range.start); + let this_end = self.locate_chunk_end(range.end - 1); + + // if the current range starts in the same chunk where the previous range ends, we coalesce the two byte ranges; + // otherwise we start a new byte range + if this_start == self.locate_chunk_start(ranges[i - 1].end - 1) { + byte_ranges.last_mut().unwrap().end = this_end; + bytes_idx_to_range_indices + .last_mut() + .unwrap() + .push(range.clone()); + } else { + byte_ranges.push(this_start..this_end); + bytes_idx_to_range_indices.push(vec![range.clone()]); + } + } + + trace!( + "Scheduling I/O for {} ranges spread across byte range {}..{}", + byte_ranges.len(), + byte_ranges[0].start, + byte_ranges.last().unwrap().end + ); + + let bytes = scheduler.submit_request(byte_ranges.clone(), top_level_row); + + // copy the necessary data from `self` to move into the async block + let compressed_bit_width = self.compressed_bit_width; + let uncompressed_bits_per_value = self.uncompressed_bits_per_value; + let num_rows = ranges.iter().map(|range| range.end -
range.start).sum(); + + async move { + let bytes = bytes.await?; + let decompressed_output = bitpacked_for_non_neg_decode( + compressed_bit_width, + uncompressed_bits_per_value, + &bytes, + &bytes_idx_to_range_indices, + num_rows, + ); + Ok(Box::new(BitpackedForNonNegPageDecoder { + uncompressed_bits_per_value, + decompressed_buf: decompressed_output, + }) as Box<dyn PrimitivePageDecoder>) + } + .boxed() + } +} + +#[derive(Debug)] +struct BitpackedForNonNegPageDecoder { + // number of bits in the uncompressed value. E.g. this will be 32 for DataType::UInt32 + uncompressed_bits_per_value: u64, + + decompressed_buf: LanceBuffer, +} + +impl PrimitivePageDecoder for BitpackedForNonNegPageDecoder { + fn decode(&self, rows_to_skip: u64, num_rows: u64) -> Result<DataBlock> { + if ![8, 16, 32, 64].contains(&self.uncompressed_bits_per_value) { + return Err(Error::InvalidInput { + source: "BitpackedForNonNegPageDecoder should only have an uncompressed_bits_per_value of 8, 16, 32, or 64".into(), + location: location!(), + }); + } + + let elem_size_in_bytes = self.uncompressed_bits_per_value / 8; + + Ok(DataBlock::FixedWidth(FixedWidthDataBlock { + data: self.decompressed_buf.slice_with_length( + (rows_to_skip * elem_size_in_bytes) as usize, + (num_rows * elem_size_in_bytes) as usize, + ), + bits_per_value: self.uncompressed_bits_per_value, + num_values: num_rows, + })) + } +} + +macro_rules! bitpacked_decode { + ($uncompressed_type:ty, $compressed_bit_width:expr, $data:expr, $bytes_idx_to_range_indices:expr, $num_rows:expr) => {{ + let mut decompressed: Vec<$uncompressed_type> = Vec::with_capacity($num_rows as usize); + let packed_chunk_size_in_byte: usize = (ELEMS_PER_CHUNK * $compressed_bit_width) as usize / 8; + let mut decompress_chunk_buf = vec![0 as $uncompressed_type; ELEMS_PER_CHUNK as usize]; + + for (i, bytes) in $data.iter().enumerate() { + let mut ranges_idx = 0; + let mut curr_range_start = $bytes_idx_to_range_indices[i][0].start; + let mut chunk_num = 0; + + while chunk_num * packed_chunk_size_in_byte < bytes.len() { + // Copy for memory alignment + let chunk_in_u8: Vec<u8> = bytes[chunk_num * packed_chunk_size_in_byte..] + [..packed_chunk_size_in_byte] + .to_vec(); + chunk_num += 1; + let chunk = cast_slice(&chunk_in_u8); + unsafe { + BitPacking::unchecked_unpack( + $compressed_bit_width as usize, + chunk, + &mut decompress_chunk_buf, + ); + } + + loop { + // Case 1: All the elements after (curr_range_start % ELEMS_PER_CHUNK) inside this chunk are needed. + let elems_after_curr_range_start_in_this_chunk = + ELEMS_PER_CHUNK - curr_range_start % ELEMS_PER_CHUNK; + if curr_range_start + elems_after_curr_range_start_in_this_chunk + <= $bytes_idx_to_range_indices[i][ranges_idx].end + { + decompressed.extend_from_slice( + &decompress_chunk_buf[(curr_range_start % ELEMS_PER_CHUNK) as usize..], + ); + curr_range_start += elems_after_curr_range_start_in_this_chunk; + break; + } else { + // Case 2: Only part of the elements after (curr_range_start % ELEMS_PER_CHUNK) inside this chunk are needed. + let elems_this_range_needed_in_this_chunk = + ($bytes_idx_to_range_indices[i][ranges_idx].end - curr_range_start) + .min(ELEMS_PER_CHUNK - curr_range_start % ELEMS_PER_CHUNK); + decompressed.extend_from_slice( + &decompress_chunk_buf[(curr_range_start % ELEMS_PER_CHUNK) as usize..]
+ [..elems_this_range_needed_in_this_chunk as usize], + ); + if curr_range_start + elems_this_range_needed_in_this_chunk + == $bytes_idx_to_range_indices[i][ranges_idx].end + { + ranges_idx += 1; + if ranges_idx == $bytes_idx_to_range_indices[i].len() { + break; + } + curr_range_start = $bytes_idx_to_range_indices[i][ranges_idx].start; + } else { + curr_range_start += elems_this_range_needed_in_this_chunk; + } + } + } + } + } + + LanceBuffer::reinterpret_vec(decompressed) + }}; +} + +fn bitpacked_for_non_neg_decode( + compressed_bit_width: u64, + uncompressed_bits_per_value: u64, + data: &[Bytes], + bytes_idx_to_range_indices: &[Vec>], + num_rows: u64, +) -> LanceBuffer { + match uncompressed_bits_per_value { + 8 => bitpacked_decode!( + u8, + compressed_bit_width, + data, + bytes_idx_to_range_indices, + num_rows + ), + 16 => bitpacked_decode!( + u16, + compressed_bit_width, + data, + bytes_idx_to_range_indices, + num_rows + ), + 32 => bitpacked_decode!( + u32, + compressed_bit_width, + data, + bytes_idx_to_range_indices, + num_rows + ), + 64 => bitpacked_decode!( + u64, + compressed_bit_width, + data, + bytes_idx_to_range_indices, + num_rows + ), + _ => unreachable!( + "bitpacked_for_non_neg_decode only supports 8, 16, 32, 64 uncompressed_bits_per_value" + ), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::{ + Int16Array, Int32Array, Int64Array, Int8Array, UInt16Array, UInt32Array, UInt64Array, + UInt8Array, + }; + use arrow::datatypes::DataType; + + #[test_log::test(tokio::test)] + async fn test_compute_compressed_bit_width_for_non_neg() {} + + use std::collections::HashMap; + + use lance_datagen::RowCount; + + use crate::testing::{check_round_trip_encoding_of_data, TestCases}; + use crate::version::LanceFileVersion; + + async fn check_round_trip_bitpacked(array: Arc) { + let test_cases = TestCases::default().with_file_version(LanceFileVersion::V2_1); + check_round_trip_encoding_of_data(vec![array], &test_cases, HashMap::new()).await; + } + + #[test_log::test(tokio::test)] + async fn test_bitpack_fastlanes_u8() { + let values: Vec = vec![5; 1024]; + let array = UInt8Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![66; 1000]; + let array = UInt8Array::from(values); + let array: Arc = Arc::new(array); + + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![77; 2000]; + let array = UInt8Array::from(values); + let array: Arc = Arc::new(array); + + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![0; 10000]; + let array = UInt8Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![88; 10000]; + let array = UInt8Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) + .into_batch_rows(RowCount::from(1)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) + .into_batch_rows(RowCount::from(20)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) + .into_batch_rows(RowCount::from(50)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + 
.anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) + .into_batch_rows(RowCount::from(100)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) + .into_batch_rows(RowCount::from(1000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) + .into_batch_rows(RowCount::from(1024)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) + .into_batch_rows(RowCount::from(2000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt8)) + .into_batch_rows(RowCount::from(3000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + } + + #[test_log::test(tokio::test)] + async fn test_bitpack_fastlanes_u16() { + let values: Vec = vec![5; 1024]; + let array = UInt16Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![66; 1000]; + let array = UInt16Array::from(values); + let array: Arc = Arc::new(array); + + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![77; 2000]; + let array = UInt16Array::from(values); + let array: Arc = Arc::new(array); + + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![0; 10000]; + let array = UInt16Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![88; 10000]; + let array = UInt16Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![300; 100]; + let array = UInt16Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![800; 100]; + let array = UInt16Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) + .into_batch_rows(RowCount::from(1)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) + .into_batch_rows(RowCount::from(20)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) + .into_batch_rows(RowCount::from(100)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) + .into_batch_rows(RowCount::from(1000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) + .into_batch_rows(RowCount::from(1024)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) + .into_batch_rows(RowCount::from(2000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + 
let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt16)) + .into_batch_rows(RowCount::from(3000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + } + + #[test_log::test(tokio::test)] + async fn test_bitpack_fastlanes_u32() { + let values: Vec = vec![5; 1024]; + let array = UInt32Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![7; 2000]; + let array = UInt32Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![66; 1000]; + let array = UInt32Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![666; 1000]; + let array = UInt32Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![77; 2000]; + let array = UInt32Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![0; 10000]; + let array = UInt32Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![1; 10000]; + let array = UInt32Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![88; 10000]; + let array = UInt32Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![300; 100]; + let array = UInt32Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![3000; 100]; + let array = UInt32Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![800; 100]; + let array = UInt32Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![8000; 100]; + let array = UInt32Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![65536; 100]; + let array = UInt32Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![655360; 100]; + let array = UInt32Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) + .into_batch_rows(RowCount::from(1)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) + .into_batch_rows(RowCount::from(20)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) + .into_batch_rows(RowCount::from(50)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) + .into_batch_rows(RowCount::from(100)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) + .into_batch_rows(RowCount::from(1000)) + .unwrap() 
+ .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) + .into_batch_rows(RowCount::from(1024)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) + .into_batch_rows(RowCount::from(2000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt32)) + .into_batch_rows(RowCount::from(3000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + } + + #[test_log::test(tokio::test)] + async fn test_bitpack_fastlanes_u64() { + let values: Vec = vec![5; 1024]; + let array = UInt64Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![7; 2000]; + let array = UInt64Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![66; 1000]; + let array = UInt64Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![666; 1000]; + let array = UInt64Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![77; 2000]; + let array = UInt64Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![0; 10000]; + let array = UInt64Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![1; 10000]; + let array = UInt64Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![88; 10000]; + let array = UInt64Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![300; 100]; + let array = UInt64Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![3000; 100]; + let array = UInt64Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![800; 100]; + let array = UInt64Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![8000; 100]; + let array = UInt64Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![65536; 100]; + let array = UInt64Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![655360; 100]; + let array = UInt64Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) + .into_batch_rows(RowCount::from(1)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) + .into_batch_rows(RowCount::from(20)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + 
.anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) + .into_batch_rows(RowCount::from(50)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) + .into_batch_rows(RowCount::from(100)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) + .into_batch_rows(RowCount::from(1000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) + .into_batch_rows(RowCount::from(1024)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) + .into_batch_rows(RowCount::from(2000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::UInt64)) + .into_batch_rows(RowCount::from(3000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + } + + #[test_log::test(tokio::test)] + async fn test_bitpack_fastlanes_i8() { + let values: Vec = vec![-5; 1024]; + let array = Int8Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![66; 1000]; + let array = Int8Array::from(values); + let array: Arc = Arc::new(array); + + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![77; 2000]; + let array = Int8Array::from(values); + let array: Arc = Arc::new(array); + + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![0; 10000]; + let array = Int8Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![88; 10000]; + let array = Int8Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![-88; 10000]; + let array = Int8Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int8)) + .into_batch_rows(RowCount::from(1)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int8)) + .into_batch_rows(RowCount::from(20)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int8)) + .into_batch_rows(RowCount::from(50)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int8)) + .into_batch_rows(RowCount::from(100)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int8)) + .into_batch_rows(RowCount::from(1000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int8)) + .into_batch_rows(RowCount::from(1024)) + .unwrap() + .column(0) + .clone(); + 
check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int8)) + .into_batch_rows(RowCount::from(2000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int8)) + .into_batch_rows(RowCount::from(3000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + } + + #[test_log::test(tokio::test)] + async fn test_bitpack_fastlanes_i16() { + let values: Vec = vec![-5; 1024]; + let array = Int16Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![66; 1000]; + let array = Int16Array::from(values); + let array: Arc = Arc::new(array); + + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![77; 2000]; + let array = Int16Array::from(values); + let array: Arc = Arc::new(array); + + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![0; 10000]; + let array = Int16Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![88; 10000]; + let array = Int16Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![300; 100]; + let array = Int16Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![800; 100]; + let array = Int16Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int16)) + .into_batch_rows(RowCount::from(1)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int16)) + .into_batch_rows(RowCount::from(20)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int16)) + .into_batch_rows(RowCount::from(50)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int16)) + .into_batch_rows(RowCount::from(100)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int16)) + .into_batch_rows(RowCount::from(1000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int16)) + .into_batch_rows(RowCount::from(1024)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int16)) + .into_batch_rows(RowCount::from(2000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int16)) + .into_batch_rows(RowCount::from(3000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + } + + #[test_log::test(tokio::test)] + async fn test_bitpack_fastlanes_i32() { + let values: Vec = vec![-5; 1024]; + let array = Int32Array::from(values); + 
let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![66; 1000]; + let array = Int32Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![-66; 1000]; + let array = Int32Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![77; 2000]; + let array = Int32Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![-77; 2000]; + let array = Int32Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![0; 10000]; + let array = Int32Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![88; 10000]; + let array = Int32Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![-88; 10000]; + let array = Int32Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![300; 100]; + let array = Int32Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![-300; 100]; + let array = Int32Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![800; 100]; + let array = Int32Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![-800; 100]; + let array = Int32Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![65536; 100]; + let array = Int32Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![-65536; 100]; + let array = Int32Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int32)) + .into_batch_rows(RowCount::from(1)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int32)) + .into_batch_rows(RowCount::from(20)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int32)) + .into_batch_rows(RowCount::from(50)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int32)) + .into_batch_rows(RowCount::from(100)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int32)) + .into_batch_rows(RowCount::from(1000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int32)) + .into_batch_rows(RowCount::from(1024)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int32)) + 
.into_batch_rows(RowCount::from(2000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int32)) + .into_batch_rows(RowCount::from(3000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + } + + #[test_log::test(tokio::test)] + async fn test_bitpack_fastlanes_i64() { + let values: Vec = vec![-5; 1024]; + let array = Int64Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![66; 1000]; + let array = Int64Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![-66; 1000]; + let array = Int64Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![77; 2000]; + let array = Int64Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![-77; 2000]; + let array = Int64Array::from(values); + let array: Arc = Arc::new(array); + check_round_trip_bitpacked(array).await; + + let values: Vec = vec![0; 10000]; + let array = Int64Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![88; 10000]; + let array = Int64Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![-88; 10000]; + let array = Int64Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![300; 100]; + let array = Int64Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![-300; 100]; + let array = Int64Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![800; 100]; + let array = Int64Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![-800; 100]; + let array = Int64Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![65536; 100]; + let array = Int64Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let values: Vec = vec![-65536; 100]; + let array = Int64Array::from(values); + let arr = Arc::new(array) as ArrayRef; + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int64)) + .into_batch_rows(RowCount::from(1)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int64)) + .into_batch_rows(RowCount::from(20)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int64)) + .into_batch_rows(RowCount::from(50)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int64)) + .into_batch_rows(RowCount::from(100)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + 
.anon_col(lance_datagen::array::rand_type(&DataType::Int64)) + .into_batch_rows(RowCount::from(1000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int64)) + .into_batch_rows(RowCount::from(1024)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int64)) + .into_batch_rows(RowCount::from(2000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + + let arr = lance_datagen::gen() + .anon_col(lance_datagen::array::rand_type(&DataType::Int64)) + .into_batch_rows(RowCount::from(3000)) + .unwrap() + .column(0) + .clone(); + check_round_trip_bitpacked(arr).await; + } +} diff --git a/rust/lance-encoding/src/format.rs b/rust/lance-encoding/src/format.rs index ff16706768..117985751b 100644 --- a/rust/lance-encoding/src/format.rs +++ b/rust/lance-encoding/src/format.rs @@ -18,8 +18,8 @@ use pb::{ array_encoding::ArrayEncoding as ArrayEncodingEnum, buffer::BufferType, nullable::{AllNull, NoNull, Nullability, SomeNull}, - ArrayEncoding, Binary, Bitpacked, Dictionary, FixedSizeBinary, FixedSizeList, Flat, Fsst, - Nullable, PackedStruct, + ArrayEncoding, Binary, Bitpacked, BitpackedForNonNeg, Dictionary, FixedSizeBinary, + FixedSizeList, Flat, Fsst, Nullable, PackedStruct, }; use crate::encodings::physical::block_compress::CompressionScheme; @@ -98,6 +98,23 @@ impl ProtobufUtils { } } + pub fn bitpacked_for_non_neg_encoding( + compressed_bits_per_value: u64, + uncompressed_bits_per_value: u64, + buffer_index: u32, + ) -> ArrayEncoding { + ArrayEncoding { + array_encoding: Some(ArrayEncodingEnum::BitpackedForNonNeg(BitpackedForNonNeg { + compressed_bits_per_value, + buffer: Some(pb::Buffer { + buffer_index, + buffer_type: BufferType::Page as i32, + }), + uncompressed_bits_per_value, + })), + } + } + pub fn packed_struct( child_encodings: Vec, packed_buffer_index: u32, diff --git a/rust/lance-encoding/src/lib.rs b/rust/lance-encoding/src/lib.rs index 17a897df9f..7d0d45bbc8 100644 --- a/rust/lance-encoding/src/lib.rs +++ b/rust/lance-encoding/src/lib.rs @@ -9,6 +9,7 @@ use futures::{future::BoxFuture, FutureExt, TryFutureExt}; use lance_core::Result; pub mod buffer; +pub mod compression_algo; pub mod data; pub mod decoder; pub mod encoder; diff --git a/rust/lance-encoding/src/testing.rs b/rust/lance-encoding/src/testing.rs index 4089291f2b..2d27087414 100644 --- a/rust/lance-encoding/src/testing.rs +++ b/rust/lance-encoding/src/testing.rs @@ -269,6 +269,7 @@ pub struct TestCases { batch_size: u32, skip_validation: bool, max_page_size: Option, + file_version: LanceFileVersion, } impl Default for TestCases { @@ -279,6 +280,7 @@ impl Default for TestCases { indices: Vec::new(), skip_validation: false, max_page_size: None, + file_version: LanceFileVersion::default(), } } } @@ -312,6 +314,11 @@ impl TestCases { fn get_max_page_size(&self) -> u64 { self.max_page_size.unwrap_or(MAX_PAGE_BYTES) } + + pub fn with_file_version(mut self, version: LanceFileVersion) -> Self { + self.file_version = version; + self + } } /// Given specific data and test cases we check round trip encoding and decoding @@ -330,7 +337,12 @@ pub async fn check_round_trip_encoding_of_data( field = field.with_metadata(metadata); let lance_field = lance_core::datatypes::Field::try_from(&field).unwrap(); for page_size in [4096, 1024 * 1024] { - let encoding_strategy = 
CoreFieldEncodingStrategy::default(); + let encoding_strategy = CoreFieldEncodingStrategy { + array_encoding_strategy: Arc::new(CoreArrayEncodingStrategy { + version: test_cases.file_version, + }), + version: test_cases.file_version, + }; let mut column_index_seq = ColumnIndexSequence::default(); let encoding_options = EncodingOptions { cache_bytes_per_column: page_size,
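+            // per the `page_size` loop above, the round trip is exercised with both a 4 KiB and a 1 MiB cache_bytes_per_column budget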