Skip to content

Commit

Permalink
Small refactoring estimate.
Browse files Browse the repository at this point in the history
  • Loading branch information
fulmicoton committed Aug 27, 2022
1 parent 43a4c82 commit 52bb503
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 60 deletions.
8 changes: 3 additions & 5 deletions fastfield_codecs/src/bitpacked.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,14 +141,12 @@ impl FastFieldCodec for BitpackedCodec {

Ok(())
}
fn is_applicable(_fastfield_accessor: &impl FastFieldDataAccess) -> bool {
true
}
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {

fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option<f32> {
let amplitude = fastfield_accessor.max_value() - fastfield_accessor.min_value();
let num_bits = compute_num_bits(amplitude);
let num_bits_uncompressed = 64;
num_bits as f32 / num_bits_uncompressed as f32
Some(num_bits as f32 / num_bits_uncompressed as f32)
}
}

Expand Down
20 changes: 9 additions & 11 deletions fastfield_codecs/src/blockwise_linear.rs
Original file line number Diff line number Diff line change
Expand Up @@ -289,9 +289,12 @@ impl FastFieldCodec for BlockwiseLinearCodec {
Ok(())
}

fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool {
/// Estimation for linear interpolation is hard because you don't know
/// where the local maxima of the deviation of the calculated value are, and
/// the offset is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option<f32> {
if fastfield_accessor.num_vals() < 5_000 {
return false;
return None;
}
// On serialization the offset is added to the actual value.
// We need to make sure this won't run into overflow calculation issues.
Expand All @@ -304,14 +307,9 @@ impl FastFieldCodec for BlockwiseLinearCodec {
.checked_add(theorethical_maximum_offset)
.is_none()
{
return false;
return None;
}
true
}
/// estimation for linear interpolation is hard because, you don't know
/// where the local maxima are for the deviation of the calculated value and
/// the offset is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {

let first_val_in_first_block = fastfield_accessor.get_val(0);
let last_elem_in_first_chunk = CHUNK_SIZE.min(fastfield_accessor.num_vals());
let last_val_in_first_block =
Expand Down Expand Up @@ -350,7 +348,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
// function metadata per block
+ 29 * (fastfield_accessor.num_vals() / CHUNK_SIZE);
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
num_bits as f32 / num_bits_uncompressed as f32
Some(num_bits as f32 / num_bits_uncompressed as f32)
}
}

Expand All @@ -368,7 +366,7 @@ mod tests {
use crate::tests::get_codec_test_data_sets;

fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<BlockwiseLinearCodec, BlockwiseLinearReader>(data, name)
crate::tests::create_and_validate::<BlockwiseLinearCodec>(data, name).unwrap()
}

const HIGHEST_BIT: u64 = 1 << 63;
Expand Down
58 changes: 29 additions & 29 deletions fastfield_codecs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,15 +82,14 @@ pub trait FastFieldCodec {
fastfield_accessor: &dyn FastFieldDataAccess,
) -> io::Result<()>;

/// Check if the Codec is able to compress the data
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool;

/// Returns an estimate of the compression ratio.
/// If the codec is not applicable, returns `None`.
///
/// The baseline is uncompressed 64bit data.
///
/// It could make sense to also return a value representing
/// computational complexity.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32;
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option<f32>;
}

#[derive(Debug, Clone)]
Expand Down Expand Up @@ -152,28 +151,28 @@ mod tests {
use crate::blockwise_linear::BlockwiseLinearCodec;
use crate::linear::LinearCodec;

pub fn create_and_validate<Codec: FastFieldCodec>(data: &[u64], name: &str) -> (f32, f32) {
if !Codec::is_applicable(&data) {
return (f32::MAX, 0.0);
}
let estimation = Codec::estimate(&data);
pub fn create_and_validate<Codec: FastFieldCodec>(
data: &[u64],
name: &str,
) -> Option<(f32, f32)> {
let estimation = Codec::estimate(&data)?;

let mut out: Vec<u8> = Vec::new();
Codec::serialize(&mut out, &data).unwrap();

let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);

let reader = Codec::open_from_bytes(OwnedBytes::new(out)).unwrap();
assert_eq!(reader.num_vals(), data.len() as u64);
for (doc, orig_val) in data.iter().enumerate() {
for (doc, orig_val) in data.iter().copied().enumerate() {
let val = reader.get_val(doc as u64);
if val != *orig_val {
panic!(
"val {val:?} does not match orig_val {orig_val:?}, in data set {name}, data \
{data:?}",
);
}
assert_eq!(
val, orig_val,
"val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data \
`{data:?}`",
);
}
(estimation, actual_compression)
Some((estimation, actual_compression))
}

proptest! {
Expand Down Expand Up @@ -212,11 +211,12 @@ mod tests {
fn test_codec<C: FastFieldCodec>() {
let codec_name = format!("{:?}", C::CODEC_TYPE);
for (data, dataset_name) in get_codec_test_data_sets() {
let (estimate, actual) = crate::tests::create_and_validate::<C>(&data, dataset_name);
let result = if estimate == f32::MAX {
"Disabled".to_string()
} else {
let estimate_actual_opt: Option<(f32, f32)> =
crate::tests::create_and_validate::<C>(&data, dataset_name);
let result = if let Some((estimate, actual)) = estimate_actual_opt {
format!("Estimate `{estimate}` Actual `{actual}`")
} else {
"Disabled".to_string()
};
println!("Codec {codec_name}, DataSet {dataset_name}, {result}");
}
Expand All @@ -240,37 +240,37 @@ mod tests {
fn estimation_good_interpolation_case() {
let data = (10..=20000_u64).collect::<Vec<_>>();

let linear_interpol_estimation = LinearCodec::estimate(&data);
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
assert_le!(linear_interpol_estimation, 0.01);

let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data);
let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data).unwrap();
assert_le!(multi_linear_interpol_estimation, 0.2);
assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);

let bitpacked_estimation = BitpackedCodec::estimate(&data);
let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
assert_le!(linear_interpol_estimation, bitpacked_estimation);
}
#[test]
fn estimation_test_bad_interpolation_case() {
let data = vec![200, 10, 10, 10, 10, 1000, 20];

let linear_interpol_estimation = LinearCodec::estimate(&data);
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
assert_le!(linear_interpol_estimation, 0.32);

let bitpacked_estimation = BitpackedCodec::estimate(&data);
let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
assert_le!(bitpacked_estimation, linear_interpol_estimation);
}
#[test]
fn estimation_test_bad_interpolation_case_monotonically_increasing() {
let mut data = (200..=20000_u64).collect::<Vec<_>>();
let mut data: Vec<u64> = (200..=20000_u64).collect();
data.push(1_000_000);

// In this case linear interpolation cannot actually be worse than bitpacking,
// but the estimator adds some threshold, which leads to a worse estimate.
let linear_interpol_estimation = LinearCodec::estimate(&data);
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
assert_le!(linear_interpol_estimation, 0.35);

let bitpacked_estimation = BitpackedCodec::estimate(&data);
let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
assert_le!(bitpacked_estimation, 0.32);
assert_le!(bitpacked_estimation, linear_interpol_estimation);
}
Expand Down
22 changes: 11 additions & 11 deletions fastfield_codecs/src/linear.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,10 +192,15 @@ impl FastFieldCodec for LinearCodec {
footer.serialize(write)?;
Ok(())
}
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool {

/// Estimation for linear interpolation is hard because you don't know
/// where the local maxima of the deviation of the calculated value are, and
/// the offset to shift all values to >=0 is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option<f32> {
if fastfield_accessor.num_vals() < 3 {
return false; // disable compressor for this case
return None; // disable compressor for this case
}

// On serialisation the offset is added to the actual value.
// We need to make sure this won't run into overflow calculation issues.
// For this we take the maximum theoretical offset and add this to the max value.
Expand All @@ -207,14 +212,9 @@ impl FastFieldCodec for LinearCodec {
.checked_add(theorethical_maximum_offset)
.is_none()
{
return false;
return None;
}
true
}
/// estimation for linear interpolation is hard because, you don't know
/// where the local maxima for the deviation of the calculated value are and
/// the offset to shift all values to >=0 is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {

let first_val = fastfield_accessor.get_val(0);
let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1);
let slope = get_slope(first_val, last_val, fastfield_accessor.num_vals());
Expand Down Expand Up @@ -246,7 +246,7 @@ impl FastFieldCodec for LinearCodec {
* fastfield_accessor.num_vals()
+ LinearFooter::SIZE_IN_BYTES as u64;
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
num_bits as f32 / num_bits_uncompressed as f32
Some(num_bits as f32 / num_bits_uncompressed as f32)
}
}

Expand All @@ -265,7 +265,7 @@ mod tests {
use crate::tests::get_codec_test_data_sets;

fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<LinearCodec, LinearReader>(data, name)
crate::tests::create_and_validate::<LinearCodec>(data, name).unwrap()
}

#[test]
Expand Down
6 changes: 2 additions & 4 deletions src/fastfield/serializer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,9 @@ fn codec_estimation<C: FastFieldCodec>(
fastfield_accessor: &impl FastFieldDataAccess,
estimations: &mut Vec<(f32, FastFieldCodecType)>,
) {
if !C::is_applicable(fastfield_accessor) {
return;
if let Some(ratio) = C::estimate(fastfield_accessor) {
estimations.push((ratio, C::CODEC_TYPE));
}
let ratio = C::estimate(fastfield_accessor);
estimations.push((ratio, C::CODEC_TYPE));
}

impl CompositeFastFieldSerializer {
Expand Down

0 comments on commit 52bb503

Please sign in to comment.