zarr-developers · rabernat · Jan 15, 2023 · Dec 20, 2022 · Dec 20, 2022 · Dec 20, 2022
diff --git a/numcodecs/__init__.py b/numcodecs/__init__.py
@@ -111,3 +111,6 @@
     register_codec(VLenUTF8)
     register_codec(VLenBytes)
     register_codec(VLenArray)
+# NOTE(review): unguarded import of a compiled extension; confirm no-extension builds still import.
+from numcodecs.fletcher32 import Fletcher32
+register_codec(Fletcher32)
diff --git a/numcodecs/_fletcher.c b/numcodecs/_fletcher.c
@@ -0,0 +1,43 @@
+#include <stdint.h>
+#include <stddef.h>
+
+// https://github.com/Unidata/netcdf-c/blob/8eb71290eb9360dcfd4955ba94759ba8d02c40a9/plugins/H5checksum.c
+
+
+uint32_t H5_checksum_fletcher32(const void *_data, size_t _len)
+{
+    /* Fletcher-32 over _len bytes: the input is consumed as big-endian
+     * 16-bit words feeding two running sums; returns (sum2 << 16) | sum1. */
+    const uint8_t *p = (const uint8_t *)_data;  /* Read cursor */
+    size_t words = _len >> 1;   /* Whole 16-bit words left to consume */
+    uint32_t lo = 0, hi = 0;    /* lo: sum of words; hi: sum of the lo values */
+
+    /* Consume the words in runs of at most 360 (the largest number of sums
+     * that can be performed without numeric overflow), folding after each. */
+    while (words != 0) {
+        size_t run = (words < 360) ? words : 360;
+        size_t i;
+        words -= run;
+        for (i = 0; i < run; i++) {
+            lo += ((uint32_t)p[0] << 8) | (uint32_t)p[1];
+            hi += lo;
+            p += 2;
+        }
+        lo = (lo >> 16) + (lo & 0xffff);
+        hi = (hi >> 16) + (hi & 0xffff);
+    }
+
+    /* A trailing odd byte is treated as the high byte of one last word */
+    if (_len & 1) {
+        lo += (uint32_t)*p << 8;
+        hi += lo;
+        lo = (lo >> 16) + (lo & 0xffff);
+        hi = (hi >> 16) + (hi & 0xffff);
+    }
+
+    /* Final fold so that each sum fits in 16 bits */
+    lo = (lo >> 16) + (lo & 0xffff);
+    hi = (hi >> 16) + (hi & 0xffff);
+
+    return (hi << 16) | lo;
+} /* end H5_checksum_fletcher32() */
diff --git a/numcodecs/fletcher32.pyx b/numcodecs/fletcher32.pyx
@@ -0,0 +1,45 @@
+import struct
+
+from numcodecs.abc import Codec
+from numcodecs.compat import ensure_contiguous_ndarray
+
+from libc.stdint cimport uint8_t, uint16_t, uint32_t
+
+cdef extern from "_fletcher.c":
+    uint32_t H5_checksum_fletcher32(const void *_data, size_t _len)
+
+
+class Fletcher32(Codec):
+    """Fletcher-32 checksum codec (16-bit words, 32-bit checksum).
+
+    ``encode`` appends the checksum as four little-endian bytes; ``decode``
+    recomputes it over the rest and raises ValueError on a mismatch.
+    """
+
+    codec_id = "fletcher32"
+
+    def encode(self, buf):
+        """Return the bytes of ``buf`` plus their 4-byte checksum (0 if empty)."""
+        buf = ensure_contiguous_ndarray(buf).ravel().view('uint8')
+        cdef const uint8_t[::1] b_ptr = buf
+        val = H5_checksum_fletcher32(&b_ptr[0], buf.nbytes) if buf.nbytes else 0
+        return buf.tobytes() + struct.pack("<I", val)
+
+    def decode(self, buf, out=None):
+        """Verify the trailing checksum; return the payload, or fill ``out``."""
+        b = ensure_contiguous_ndarray(buf).view('uint8')
+        cdef const uint8_t[::1] b_ptr = b
+        if b.nbytes < 4:  # the checksum trailer itself is missing
+            raise ValueError("fletcher32: buffer is shorter than its 4-byte checksum")
+        val = H5_checksum_fletcher32(&b_ptr[0], b.nbytes - 4)
+        found = b[-4:].view("<u4")[0]
+        if val != found:
+            raise ValueError(
+                f"The fletcher32 checksum of the data ({val}) did not"
+                f" match the expected checksum ({found}).\n"
+                "This could be a sign that the data has been corrupted."
+            )
+        if out is not None:  # `if out:` would test the array's truth value
+            out.view("uint8")[:] = b[:-4]
+            return out
+        return memoryview(b[:-4])
diff --git a/numcodecs/tests/test_fletcher32.py b/numcodecs/tests/test_fletcher32.py
@@ -0,0 +1,42 @@
+import numpy as np
+import pytest
+
+from numcodecs.fletcher32 import Fletcher32
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    ["uint8", "int32", "float32"]
+)
+def test_with_data(dtype):
+    data = np.arange(100, dtype=dtype)
+    f = Fletcher32()
+    arr = np.frombuffer(f.decode(f.encode(data)), dtype=dtype)
+    assert (arr == data).all()
+
+
+def test_error():
+    data = np.arange(100)
+    f = Fletcher32()
+    enc = f.encode(data)
+    enc2 = bytearray(enc)
+    enc2[0] += 1
+    with pytest.raises(ValueError) as e:
+        f.decode(enc2)
+    assert "fletcher32 checksum" in str(e.value)
+
+
+def test_known():
+    data = (
+        b'\xf04\xfe\x1a\x03\xb2\xb1?^\x99j\xf3\xd6f\xef?\xbbm\x04n'
+        b'\x9a\xdf\xeb?x\x9eIL\xdeW\xc8?A\xef\x88\xa8&\xad\xef?'
+        b'\xf2\xc6a\x01a\xb8\xe8?#&\x96\xabY\xf2\xe7?\xe2Pw\xba\xd0w\xea?'
+        b'\x80\xc5\xf8M@0\x9a?\x98H+\xb4\x03\xfa\xc6?\xb9P\x1e1'
+    )
+    data3 = Fletcher32().decode(data)
+    outarr = np.frombuffer(data3, dtype="<f8")
+    expected = [
+        0.0691225, 0.98130367, 0.87104532, 0.19018153, 0.9898866,
+        0.77250719, 0.74833377, 0.8271259, 0.02557469, 0.17950484
+    ]
+    assert np.allclose(outarr, expected)
diff --git a/setup.py b/setup.py
@@ -198,6 +198,31 @@ def vlen_extension():
     return extensions
 
 
+def fletcher_extension():
+    info('setting up fletcher32 extension')
+
+    extra_compile_args = base_compile_args.copy()
+    define_macros = []
+
+    # setup sources
+    include_dirs = ['numcodecs']
+    # define_macros += [('CYTHON_TRACE', '1')]
+
+    sources = ['numcodecs/fletcher32.pyx']
+
+    # define extension module
+    extensions = [
+        Extension('numcodecs.fletcher32',
+                  sources=sources,
+                  include_dirs=include_dirs,
+                  define_macros=define_macros,
+                  extra_compile_args=extra_compile_args,
+                  ),
+    ]
+
+    return extensions
+
+
 def compat_extension():
     info('setting up compat extension')
 
@@ -265,7 +290,8 @@ def run_setup(with_extensions):
 
     if with_extensions:
         ext_modules = (blosc_extension() + zstd_extension() + lz4_extension() +
-                       compat_extension() + shuffle_extension() + vlen_extension())
+                       compat_extension() + shuffle_extension() + vlen_extension() +
+                       fletcher_extension())
 
         cmdclass = dict(build_ext=ve_build_ext)
     else: