Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bitround Codec #299

Merged
merged 26 commits into from
May 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docs/bitround.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Bitround
========
.. automodule:: numcodecs.bitround

.. autoclass:: BitRound

.. autoattribute:: codec_id
.. automethod:: encode
.. automethod:: decode
.. automethod:: get_config
.. automethod:: from_config
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ Contents
delta
fixedscaleoffset
quantize
bitround
packbits
categorize
checksum32
Expand Down
3 changes: 3 additions & 0 deletions docs/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ Unreleased
By :user:`Dimitri Papadopoulos Orfanos <DimitriPapadopoulos>`,
:issue:`295`, :issue:`294`, :issue:`293`, and :issue:`292`.

* Add bitround codec
By :user:`Ryan Abernathy <rabernat>` and :user:`Martin Durant <martindurant>`, :issue:`298`.

* Drop Python 3.6
By :user:`Josh Moore <joshmoore>`, :issue:`318`.

Expand Down
80 changes: 80 additions & 0 deletions numcodecs/bitround.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import numpy as np


from .abc import Codec
from .compat import ensure_ndarray_like, ndarray_copy

# The size in bits of the mantissa/significand for the various floating types
# You cannot keep more bits of data than you have available
# https://en.wikipedia.org/wiki/IEEE_754
max_bits = {
"float16": 10,
"float32": 23,
"float64": 52,
}
martindurant marked this conversation as resolved.
Show resolved Hide resolved


class BitRound(Codec):
"""Floating-point bit rounding codec

Drops a specified number of bits from the floating point mantissa,
leaving an array more amenable to compression. The number of bits to keep should
be determined by an information analysis of the data to be compressed.
The approach is based on the paper by Klöwer et al. 2021
(https://www.nature.com/articles/s43588-021-00156-2). See
https://github.com/zarr-developers/numcodecs/issues/298 for discussion
and the original implementation in Julia referred to at
https://github.com/milankl/BitInformation.jl

Parameters
----------

keepbits: int
The number of bits of the mantissa to keep. The range allowed
depends on the dtype input data. If keepbits is
equal to the maximum allowed for the data type, this is equivalent
to no transform.
"""

codec_id = 'bitround'

def __init__(self, keepbits: int):
if keepbits < 0:
raise ValueError("keepbits must be zero or positive")
martindurant marked this conversation as resolved.
Show resolved Hide resolved
self.keepbits = keepbits

def encode(self, buf):
"""Create int array by rounding floating-point data

The itemsize will be preserved, but the output should be much more
compressible.
"""
a = ensure_ndarray_like(buf)
if not a.dtype.kind == "f" or a.dtype.itemsize > 8:
raise TypeError("Only float arrays (16-64bit) can be bit-rounded")
bits = max_bits[str(a.dtype)]
# cast float to int type of same width (preserve endianness)
a_int_dtype = np.dtype(a.dtype.str.replace("f", "i"))
all_set = np.array(-1, dtype=a_int_dtype)
if self.keepbits == bits:
return a
if self.keepbits > bits:
raise ValueError("Keepbits too large for given dtype")
b = a.view(a_int_dtype)
maskbits = bits - self.keepbits
mask = (all_set >> maskbits) << maskbits
half_quantum1 = (1 << (maskbits - 1)) - 1
b += ((b >> maskbits) & 1) + half_quantum1
b &= mask
return b

def decode(self, buf, out=None):
"""Remake floats from ints

As with ``encode``, preserves itemsize.
"""
buf = ensure_ndarray_like(buf)
# Cast back from `int` to `float` type (noop if a `float`ing type buffer is provided)
dt = np.dtype(buf.dtype.str.replace("i", "f"))
data = buf.view(dt)
return ndarray_copy(data, out)
80 changes: 80 additions & 0 deletions numcodecs/tests/test_bitround.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import numpy as np

import pytest

from numcodecs.bitround import BitRound, max_bits

# adapted from https://github.com/milankl/BitInformation.jl/blob/main/test/round_nearest.jl


# TODO: add other dtypes
@pytest.fixture(params=["float32", "float64"])
def dtype(request):
return request.param


def round(data, keepbits):
codec = BitRound(keepbits=keepbits)
data = data.copy() # otherwise overwrites the input
encoded = codec.encode(data)
return codec.decode(encoded)


def test_round_zero_to_zero(dtype):
a = np.zeros((3, 2), dtype=dtype)
# Don't understand Milan's original test:
# How is it possible to have negative keepbits?
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You just end up rounding the exponent bits, see other comment

# for k in range(-5, 50):
for k in range(0, max_bits[dtype]):
ar = round(a, k)
np.testing.assert_equal(a, ar)


def test_round_one_to_one(dtype):
a = np.ones((3, 2), dtype=dtype)
for k in range(0, max_bits[dtype]):
ar = round(a, k)
np.testing.assert_equal(a, ar)


def test_round_minus_one_to_minus_one(dtype):
a = -np.ones((3, 2), dtype=dtype)
for k in range(0, max_bits[dtype]):
ar = round(a, k)
np.testing.assert_equal(a, ar)


def test_no_rounding(dtype):
a = np.random.random_sample((300, 200)).astype(dtype)
keepbits = max_bits[dtype]
ar = round(a, keepbits)
np.testing.assert_equal(a, ar)


APPROX_KEEPBITS = {"float32": 11, "float64": 18}


def test_approx_equal(dtype):
a = np.random.random_sample((300, 200)).astype(dtype)
ar = round(a, APPROX_KEEPBITS[dtype])
# Mimic julia behavior - https://docs.julialang.org/en/v1/base/math/#Base.isapprox
rtol = np.sqrt(np.finfo(np.float32).eps)
# This gets us much closer but still failing for ~6% of the array
# It does pass if we add 1 to keepbits (11 instead of 10)
# Is there an off-by-one issue here?
np.testing.assert_allclose(a, ar, rtol=rtol)
Comment on lines +57 to +65
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The fact that this test passes if we use keepbits=11 instead of 10 makes me think we are dealing with a off-by-one issue, perhaps related to julia vs. python indexing

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be honest, I just guessed the tolerances here. The motivation for this test was less to test exactness, but just to flag immediately if rounding is completely off.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok then I will just bump keepbits to 11 for this test.



def test_idempotence(dtype):
a = np.random.random_sample((300, 200)).astype(dtype)
for k in range(20):
ar = round(a, k)
ar2 = round(a, k)
np.testing.assert_equal(ar, ar2)


def test_errors():
with pytest.raises(ValueError):
BitRound(keepbits=99).encode(np.array([0], dtype="float32"))
with pytest.raises(TypeError):
BitRound(keepbits=10).encode(np.array([0]))