Small refactoring estimate.

quickwit-oss · Aug 27, 2022 · 52bb503 · 52bb503
1 parent 43a4c82
commit 52bb503
Show file tree

Hide file tree

Showing 5 changed files with 54 additions and 60 deletions.
diff --git a/fastfield_codecs/src/bitpacked.rs b/fastfield_codecs/src/bitpacked.rs
@@ -141,14 +141,12 @@ impl FastFieldCodec for BitpackedCodec {
 
         Ok(())
     }
-    fn is_applicable(_fastfield_accessor: &impl FastFieldDataAccess) -> bool {
-        true
-    }
-    fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {
+
+    fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option<f32> {
         let amplitude = fastfield_accessor.max_value() - fastfield_accessor.min_value();
         let num_bits = compute_num_bits(amplitude);
         let num_bits_uncompressed = 64;
-        num_bits as f32 / num_bits_uncompressed as f32
+        Some(num_bits as f32 / num_bits_uncompressed as f32)
     }
 }
 

diff --git a/fastfield_codecs/src/blockwise_linear.rs b/fastfield_codecs/src/blockwise_linear.rs
@@ -289,9 +289,12 @@ impl FastFieldCodec for BlockwiseLinearCodec {
         Ok(())
     }
 
-    fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool {
+    /// estimation for linear interpolation is hard because, you don't know
+    /// where the local maxima are for the deviation of the calculated value and
+    /// the offset is also unknown.
+    fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option<f32> {
         if fastfield_accessor.num_vals() < 5_000 {
-            return false;
+            return None;
         }
         // On serialization the offset is added to the actual value.
         // We need to make sure this won't run into overflow calculation issues.
@@ -304,14 +307,9 @@ impl FastFieldCodec for BlockwiseLinearCodec {
             .checked_add(theorethical_maximum_offset)
             .is_none()
         {
-            return false;
+            return None;
         }
-        true
-    }
-    /// estimation for linear interpolation is hard because, you don't know
-    /// where the local maxima are for the deviation of the calculated value and
-    /// the offset is also unknown.
-    fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {
+
         let first_val_in_first_block = fastfield_accessor.get_val(0);
         let last_elem_in_first_chunk = CHUNK_SIZE.min(fastfield_accessor.num_vals());
         let last_val_in_first_block =
@@ -350,7 +348,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
             // function metadata per block
             + 29 * (fastfield_accessor.num_vals() / CHUNK_SIZE);
         let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
-        num_bits as f32 / num_bits_uncompressed as f32
+        Some(num_bits as f32 / num_bits_uncompressed as f32)
     }
 }
 
@@ -368,7 +366,7 @@ mod tests {
     use crate::tests::get_codec_test_data_sets;
 
     fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
-        crate::tests::create_and_validate::<BlockwiseLinearCodec, BlockwiseLinearReader>(data, name)
+        crate::tests::create_and_validate::<BlockwiseLinearCodec>(data, name).unwrap()
     }
 
     const HIGHEST_BIT: u64 = 1 << 63;

diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs
@@ -82,15 +82,14 @@ pub trait FastFieldCodec {
         fastfield_accessor: &dyn FastFieldDataAccess,
     ) -> io::Result<()>;
 
-    /// Check if the Codec is able to compress the data
-    fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool;
-
     /// Returns an estimate of the compression ratio.
+    /// If the codec is not applicable, returns `None`.
+    ///
     /// The baseline is uncompressed 64bit data.
     ///
     /// It could make sense to also return a value representing
     /// computational complexity.
-    fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32;
+    fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option<f32>;
 }
 
 #[derive(Debug, Clone)]
@@ -152,28 +151,28 @@ mod tests {
     use crate::blockwise_linear::BlockwiseLinearCodec;
     use crate::linear::LinearCodec;
 
-    pub fn create_and_validate<Codec: FastFieldCodec>(data: &[u64], name: &str) -> (f32, f32) {
-        if !Codec::is_applicable(&data) {
-            return (f32::MAX, 0.0);
-        }
-        let estimation = Codec::estimate(&data);
+    pub fn create_and_validate<Codec: FastFieldCodec>(
+        data: &[u64],
+        name: &str,
+    ) -> Option<(f32, f32)> {
+        let estimation = Codec::estimate(&data)?;
+
         let mut out: Vec<u8> = Vec::new();
         Codec::serialize(&mut out, &data).unwrap();
 
         let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
 
         let reader = Codec::open_from_bytes(OwnedBytes::new(out)).unwrap();
         assert_eq!(reader.num_vals(), data.len() as u64);
-        for (doc, orig_val) in data.iter().enumerate() {
+        for (doc, orig_val) in data.iter().copied().enumerate() {
             let val = reader.get_val(doc as u64);
-            if val != *orig_val {
-                panic!(
-                    "val {val:?} does not match orig_val {orig_val:?}, in data set {name}, data \
-                     {data:?}",
-                );
-            }
+            assert_eq!(
+                val, orig_val,
+                "val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data \
+                 `{data:?}`",
+            );
         }
-        (estimation, actual_compression)
+        Some((estimation, actual_compression))
     }
 
     proptest! {
@@ -212,11 +211,12 @@ mod tests {
     fn test_codec<C: FastFieldCodec>() {
         let codec_name = format!("{:?}", C::CODEC_TYPE);
         for (data, dataset_name) in get_codec_test_data_sets() {
-            let (estimate, actual) = crate::tests::create_and_validate::<C>(&data, dataset_name);
-            let result = if estimate == f32::MAX {
-                "Disabled".to_string()
-            } else {
+            let estimate_actual_opt: Option<(f32, f32)> =
+                crate::tests::create_and_validate::<C>(&data, dataset_name);
+            let result = if let Some((estimate, actual)) = estimate_actual_opt {
                 format!("Estimate `{estimate}` Actual `{actual}`")
+            } else {
+                "Disabled".to_string()
             };
             println!("Codec {codec_name}, DataSet {dataset_name}, {result}");
         }
@@ -240,37 +240,37 @@ mod tests {
     fn estimation_good_interpolation_case() {
         let data = (10..=20000_u64).collect::<Vec<_>>();
 
-        let linear_interpol_estimation = LinearCodec::estimate(&data);
+        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
         assert_le!(linear_interpol_estimation, 0.01);
 
-        let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data);
+        let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data).unwrap();
         assert_le!(multi_linear_interpol_estimation, 0.2);
         assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);
 
-        let bitpacked_estimation = BitpackedCodec::estimate(&data);
+        let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
         assert_le!(linear_interpol_estimation, bitpacked_estimation);
     }
     #[test]
     fn estimation_test_bad_interpolation_case() {
         let data = vec![200, 10, 10, 10, 10, 1000, 20];
 
-        let linear_interpol_estimation = LinearCodec::estimate(&data);
+        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
         assert_le!(linear_interpol_estimation, 0.32);
 
-        let bitpacked_estimation = BitpackedCodec::estimate(&data);
+        let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
         assert_le!(bitpacked_estimation, linear_interpol_estimation);
     }
     #[test]
     fn estimation_test_bad_interpolation_case_monotonically_increasing() {
-        let mut data = (200..=20000_u64).collect::<Vec<_>>();
+        let mut data: Vec<u64> = (200..=20000_u64).collect();
         data.push(1_000_000);
 
         // in this case the linear interpolation cannot in fact be worse than bitpacking,
         // but the estimator adds some threshold, which leads to estimated worse behavior
-        let linear_interpol_estimation = LinearCodec::estimate(&data);
+        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
         assert_le!(linear_interpol_estimation, 0.35);
 
-        let bitpacked_estimation = BitpackedCodec::estimate(&data);
+        let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
         assert_le!(bitpacked_estimation, 0.32);
         assert_le!(bitpacked_estimation, linear_interpol_estimation);
     }

diff --git a/fastfield_codecs/src/linear.rs b/fastfield_codecs/src/linear.rs
@@ -192,10 +192,15 @@ impl FastFieldCodec for LinearCodec {
         footer.serialize(write)?;
         Ok(())
     }
-    fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool {
+
+    /// estimation for linear interpolation is hard because, you don't know
+    /// where the local maxima for the deviation of the calculated value are and
+    /// the offset to shift all values to >=0 is also unknown.
+    fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option<f32> {
         if fastfield_accessor.num_vals() < 3 {
-            return false; // disable compressor for this case
+            return None; // disable compressor for this case
         }
+
         // On serialisation the offset is added to the actual value.
         // We need to make sure this won't run into overflow calculation issues.
         // For this we take the maximum theoretical offset and add this to the max value.
@@ -207,14 +212,9 @@ impl FastFieldCodec for LinearCodec {
             .checked_add(theorethical_maximum_offset)
             .is_none()
         {
-            return false;
+            return None;
         }
-        true
-    }
-    /// estimation for linear interpolation is hard because, you don't know
-    /// where the local maxima for the deviation of the calculated value are and
-    /// the offset to shift all values to >=0 is also unknown.
-    fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {
+
         let first_val = fastfield_accessor.get_val(0);
         let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1);
         let slope = get_slope(first_val, last_val, fastfield_accessor.num_vals());
@@ -246,7 +246,7 @@ impl FastFieldCodec for LinearCodec {
             * fastfield_accessor.num_vals()
             + LinearFooter::SIZE_IN_BYTES as u64;
         let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
-        num_bits as f32 / num_bits_uncompressed as f32
+        Some(num_bits as f32 / num_bits_uncompressed as f32)
     }
 }
 
@@ -265,7 +265,7 @@ mod tests {
     use crate::tests::get_codec_test_data_sets;
 
     fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
-        crate::tests::create_and_validate::<LinearCodec, LinearReader>(data, name)
+        crate::tests::create_and_validate::<LinearCodec>(data, name).unwrap()
     }
 
     #[test]

diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs
@@ -68,11 +68,9 @@ fn codec_estimation<C: FastFieldCodec>(
     fastfield_accessor: &impl FastFieldDataAccess,
     estimations: &mut Vec<(f32, FastFieldCodecType)>,
 ) {
-    if !C::is_applicable(fastfield_accessor) {
-        return;
+    if let Some(ratio) = C::estimate(fastfield_accessor) {
+        estimations.push((ratio, C::CODEC_TYPE));
     }
-    let ratio = C::estimate(fastfield_accessor);
-    estimations.push((ratio, C::CODEC_TYPE));
 }
 
 impl CompositeFastFieldSerializer {