diff --git a/README.md b/README.md index dc692f8b..2d0c259a 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,8 @@ Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you shall be dual licensed as above, without any additional terms or conditions. -[github-actions-badge]: https://img.shields.io/github/workflow/status/RoaringBitmap/roaring-rs/Continuous%20integration.svg?style=flat-square +[github-actions-badge]: +https://github.com/RoaringBitmap/roaring-rs/actions/workflows/test.yml/badge.svg [github-actions]: https://github.com/RoaringBitmap/roaring-rs/actions [release-badge]: https://img.shields.io/github/release/RoaringBitmap/roaring-rs.svg?style=flat-square [cargo]: https://crates.io/crates/roaring diff --git a/src/bitmap/container.rs b/src/bitmap/container.rs index e6d0cf84..9cbab634 100644 --- a/src/bitmap/container.rs +++ b/src/bitmap/container.rs @@ -6,7 +6,7 @@ use std::ops::{ use super::store::{self, Store}; use super::util; -const ARRAY_LIMIT: u64 = 4096; +pub const ARRAY_LIMIT: u64 = 4096; #[derive(PartialEq, Clone)] pub struct Container { diff --git a/src/bitmap/serialization.rs b/src/bitmap/serialization.rs index c2871c4a..6ae84df4 100644 --- a/src/bitmap/serialization.rs +++ b/src/bitmap/serialization.rs @@ -3,15 +3,19 @@ use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use std::convert::{Infallible, TryFrom}; use std::error::Error; use std::io; +use std::ops::RangeInclusive; -use super::container::Container; -use crate::bitmap::store::{ArrayStore, BitmapStore, Store}; +use crate::bitmap::container::{Container, ARRAY_LIMIT}; +use crate::bitmap::store::{ArrayStore, BitmapStore, Store, BITMAP_LENGTH}; use crate::RoaringBitmap; const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346; const SERIAL_COOKIE: u16 = 12347; -// TODO: Need this once run containers are supported -// const NO_OFFSET_THRESHOLD: u8 = 4; +const NO_OFFSET_THRESHOLD: usize = 4; + +// Sizes of header structures +const DESCRIPTION_BYTES: usize = 4; +const OFFSET_BYTES: usize = 4; impl RoaringBitmap { /// Return the size in bytes of the serialized output. @@ -163,49 +167,81 @@ impl RoaringBitmap { B: Fn(u64, Box<[u64; 1024]>) -> Result, BErr: Error + Send + Sync + 'static, { - let (size, has_offsets) = { + // First read the cookie to determine which version of the format we are reading + let (size, has_offsets, has_run_containers) = { let cookie = reader.read_u32::()?; if cookie == SERIAL_COOKIE_NO_RUNCONTAINER { - (reader.read_u32::()? as usize, true) + (reader.read_u32::()? as usize, true, false) } else if (cookie as u16) == SERIAL_COOKIE { - return Err(io::Error::new(io::ErrorKind::Other, "run containers are unsupported")); + let size = ((cookie >> 16) + 1) as usize; + (size, size >= NO_OFFSET_THRESHOLD, true) } else { return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value")); } }; + // Read the run container bitmap if necessary + let run_container_bitmap = if has_run_containers { + let mut bitmap = vec![0u8; (size + 7) / 8]; + reader.read_exact(&mut bitmap)?; + Some(bitmap) + } else { + None + }; + if size > u16::MAX as usize + 1 { return Err(io::Error::new(io::ErrorKind::Other, "size is greater than supported")); } - let mut description_bytes = vec![0u8; size * 4]; + // Read the container descriptions + let mut description_bytes = vec![0u8; size * DESCRIPTION_BYTES]; reader.read_exact(&mut description_bytes)?; let mut description_bytes = &description_bytes[..]; if has_offsets { - let mut offsets = vec![0u8; size * 4]; + let mut offsets = vec![0u8; size * OFFSET_BYTES]; reader.read_exact(&mut offsets)?; drop(offsets); // Not useful when deserializing into memory } let mut containers = Vec::with_capacity(size); - for _ in 0..size { + // Read each container + for i in 0..size { let key = description_bytes.read_u16::()?; - let len = u64::from(description_bytes.read_u16::()?) + 1; + let cardinality = u64::from(description_bytes.read_u16::()?) + 1; + + // If the run container bitmap is present, check if this container is a run container + let is_run_container = + run_container_bitmap.as_ref().map_or(false, |bm| bm[i / 8] & (1 << (i % 8)) != 0); + + let store = if is_run_container { + let runs = reader.read_u16::()?; + let mut intervals = vec![[0, 0]; runs as usize]; + reader.read_exact(cast_slice_mut(&mut intervals))?; + intervals.iter_mut().for_each(|[s, len]| { + *s = u16::from_le(*s); + *len = u16::from_le(*len); + }); - let store = if len <= 4096 { - let mut values = vec![0; len as usize]; + let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum(); + let mut store = Store::with_capacity(cardinality); + intervals.into_iter().for_each(|[s, len]| { + store.insert_range(RangeInclusive::new(s, s + len)); + }); + store + } else if cardinality <= ARRAY_LIMIT { + let mut values = vec![0; cardinality as usize]; reader.read_exact(cast_slice_mut(&mut values))?; values.iter_mut().for_each(|n| *n = u16::from_le(*n)); let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; Store::Array(array) } else { - let mut values = Box::new([0; 1024]); + let mut values = Box::new([0; BITMAP_LENGTH]); reader.read_exact(cast_slice_mut(&mut values[..]))?; values.iter_mut().for_each(|n| *n = u64::from_le(*n)); - let bitmap = - b(len, values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + let bitmap = b(cardinality, values) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; Store::Bitmap(bitmap) }; diff --git a/src/bitmap/store/array_store/mod.rs b/src/bitmap/store/array_store/mod.rs index ca6ee206..543663df 100644 --- a/src/bitmap/store/array_store/mod.rs +++ b/src/bitmap/store/array_store/mod.rs @@ -21,6 +21,10 @@ impl ArrayStore { ArrayStore { vec: vec![] } } + pub fn with_capacity(capacity: usize) -> ArrayStore { + ArrayStore { vec: Vec::with_capacity(capacity) } + } + /// /// Create a new SortedU16Vec from a given vec /// It is up to the caller to ensure the vec is sorted and deduplicated diff --git a/src/bitmap/store/mod.rs b/src/bitmap/store/mod.rs index 653e14fe..9172a01a 100644 --- a/src/bitmap/store/mod.rs +++ b/src/bitmap/store/mod.rs @@ -7,12 +7,14 @@ use std::ops::{ }; use std::{slice, vec}; -use self::bitmap_store::BITMAP_LENGTH; +pub use self::bitmap_store::BITMAP_LENGTH; use self::Store::{Array, Bitmap}; pub use self::array_store::ArrayStore; pub use self::bitmap_store::{BitmapIter, BitmapStore}; +use crate::bitmap::container::ARRAY_LIMIT; + #[derive(Clone)] pub enum Store { Array(ArrayStore), @@ -31,6 +33,14 @@ impl Store { Store::Array(ArrayStore::new()) } + pub fn with_capacity(capacity: usize) -> Store { + if capacity <= ARRAY_LIMIT as usize { + Store::Array(ArrayStore::with_capacity(capacity)) + } else { + Store::Bitmap(BitmapStore::new()) + } + } + pub fn full() -> Store { Store::Bitmap(BitmapStore::full()) } diff --git a/tests/bitmapwithruns.bin b/tests/bitmapwithruns.bin new file mode 100644 index 00000000..5ed24375 Binary files /dev/null and b/tests/bitmapwithruns.bin differ diff --git a/tests/serialization.rs b/tests/serialization.rs index 0b31b060..42efdd43 100644 --- a/tests/serialization.rs +++ b/tests/serialization.rs @@ -4,6 +4,7 @@ use roaring::RoaringBitmap; // Test data from https://github.com/RoaringBitmap/RoaringFormatSpec/tree/master/testdata static BITMAP_WITHOUT_RUNS: &[u8] = include_bytes!("bitmapwithoutruns.bin"); +static BITMAP_WITH_RUNS: &[u8] = include_bytes!("bitmapwithruns.bin"); fn test_data_bitmap() -> RoaringBitmap { (0..100) @@ -21,10 +22,18 @@ fn serialize_and_deserialize(bitmap: &RoaringBitmap) -> RoaringBitmap { } #[test] -fn test_deserialize_from_provided_data() { +fn test_deserialize_without_runs_from_provided_data() { assert_eq!(RoaringBitmap::deserialize_from(BITMAP_WITHOUT_RUNS).unwrap(), test_data_bitmap()); } +#[test] +fn test_deserialize_with_runs_from_provided_data() { + assert_eq!( + RoaringBitmap::deserialize_from(&mut &BITMAP_WITH_RUNS[..]).unwrap(), + test_data_bitmap() + ); +} + #[test] fn test_serialize_into_provided_data() { let bitmap = test_data_bitmap();