tlambert03 · tlambert03 · Nov 1, 2022 · Oct 30, 2022 · Oct 30, 2022 · Oct 31, 2022
diff --git a/README.md b/README.md
@@ -70,14 +70,16 @@ f.rois              # Dict[int, nd2.structures.ROI]
 f.voxel_size()      # VoxelSize(x=0.65, y=0.65, z=1.0)
 f.text_info         # dict of misc info
 
+f.binary_data       # any binary masks stored in the file.  See below.
+f.custom_data       # bits of unstructured metadata that start with CustomData
+f.recorded_data     # returns a dict of lists (passable to pandas.DataFrame) that
+                    # matches the tabular "Recorded Data" view in NIS Elements/Viewer,
+                    # with info for each frame in the experiment.
+
 # allll the metadata we can find...
 # no attempt made to standardize or parse it
 # look in here if you're searching for metdata that isn't exposed in the above
 f.unstructured_metadata()
-f.custom_data       # bits of unstructured metadata that start with CustomData
-f.recorded_data   # returns a dict of lists (passable to pandas.DataFrame) that
-                    # the tabular "Recorded Data" view from in NIS Elements/Viewer
-                    # with info for each frame in the experiment.
 
 f.close()           # don't forget to close when done!
 f.closed            # boolean, whether the file is closed
@@ -575,7 +577,7 @@ No attempt is made to parse this data.  It will vary from file to file, but you
 
 <summary><code>recorded_data</code></summary>
 
-This method returns a `dict` of equal-length sequences.
+This property returns a `dict` of equal-length sequences.
 It matches the tabular data reported in the `Image Properties > Recorded Data` tab of the NIS Viewer.
 
 (There will be a column for each tag in the `CustomDataV2_0` section of `custom_data` above.)
@@ -627,7 +629,49 @@ Out[13]:
 14  12.665469       2.0               100.0              0              0       31452.2       -1801.6        556.68           556.68
 
 ```
+
+</details>
+
+<details>
+
+<summary><code>binary_data</code></summary>
+
+This property returns an `nd2.BinaryLayers` object representing all of the
+binary masks in the nd2 file.
+
+An `nd2.BinaryLayers` object is a sequence of individual `nd2.BinaryLayer`
+objects (one for each binary layer found in the file).  Each `BinaryLayer` in
+the sequence is a named tuple that has, among other things, a `name` attribute,
+and a `data` attribute that is a list with one entry per frame in the
+experiment: a numpy array, or `None` if the binary layer had no data in that frame.
+
+The most common use case will be to cast either the entire `BinaryLayers` object
+or an individual `BinaryLayer` to a `numpy.ndarray`:
+
+```python
+>>> import nd2
+>>> import numpy as np
+>>> nd2file = nd2.ND2File('path/to/file.nd2')
+>>> binary_layers = nd2file.binary_data
+
+# The output array will have shape
+# (n_binary_layers, *coord_shape, *frame_shape).
+>>> np.asarray(binary_layers)
+```
+
+For example, if the data in the nd2 file has shape `(nT, nZ, nC, nY, nX)`, and
+there are 4 binary layers, then the output of `np.asarray(nd2file.binary_data)` will
+have shape `(4, nT, nZ, nY, nX)`.  (Note that the `nC` dimension is not present
+in the output array, and the binary layers are always in the first axis).
+
+You can also cast an individual `BinaryLayer` to a numpy array:
+
+```python
+>>> binary_layer = binary_layers[0]
+>>> np.asarray(binary_layer)
+```
+
 </details>
+
 ## alternatives
 
 - [pims_nd2](https://github.com/soft-matter/pims_nd2) - *pims-based reader. ctypes wrapper around the v9.00 (2015) SDK*

diff --git a/scripts/download_samples.py b/scripts/download_samples.py
@@ -1,12 +1,13 @@
+import shutil
 import sys
 from io import BytesIO
 from pathlib import Path
 from zipfile import ZipFile
 
 import requests
 
-TEST_DATA = str(Path(__file__).parent.parent / "tests" / "data")
-URL = "https://www.dropbox.com/s/q57orjfzzagzull/nd2_test_data.zip?dl=1"
+TEST_DATA = Path(__file__).parent.parent / "tests" / "data"
+URL = "https://www.dropbox.com/s/heo9ss4tcsi15x5/nd2_test_data.zip?dl=1"
 
 
 def main():
@@ -26,13 +27,8 @@ def main():
             sys.stdout.write(f'\r[{"=" * done}{" " * (50 - done)}]')
             sys.stdout.flush()
     with ZipFile(f) as zf:
-        zf.extractall(TEST_DATA)
-
-
-# def main(dest: str = TEST_DATA):
-#     with request.urlopen(URL) as resp:
-#         with ZipFile(BytesIO(resp.read())) as zf:
-#             zf.extractall(dest)
+        zf.extractall(str(TEST_DATA))
+    shutil.rmtree(TEST_DATA / "__MACOSX")
 
 
 if __name__ == "__main__":

diff --git a/src/nd2/__init__.py b/src/nd2/__init__.py
@@ -5,17 +5,20 @@
 __author__ = "Talley Lambert"
 __email__ = "talley.lambert@gmail.com"
 __all__ = [
-    "ND2File",
-    "imread",
-    "structures",
     "AXIS",
+    "BinaryLayer",
+    "BinaryLayers",
+    "imread",
     "is_supported_file",
+    "ND2File",
     "read_chunkmap",
     "rescue_nd2",
+    "structures",
 ]
 
 
 from . import structures
+from ._binary import BinaryLayer, BinaryLayers
 from ._chunkmap import read_chunkmap, rescue_nd2
 from ._util import AXIS, is_supported_file
 from .nd2file import ND2File, imread
diff --git a/src/nd2/_binary.py b/src/nd2/_binary.py
@@ -0,0 +1,248 @@
+"""Utilities for binary layers in ND2 files."""
+from __future__ import annotations
+
+import io
+import struct
+import warnings
+from typing import (
+    TYPE_CHECKING,
+    Iterator,
+    List,
+    NamedTuple,
+    Sequence,
+    Tuple,
+    cast,
+    overload,
+)
+
+import numpy as np
+
+if TYPE_CHECKING:
+    from ._sdk.latest import ND2Reader as LatestSDKReader
+    from .nd2file import ND2File
+
+# Pre-compiled layouts of 7, 9 and 2 consecutive little-endian ("<") unsigned
+# 32-bit integers ("I"); used below to read the headers and run-length records
+# of a decompressed binary mask.
+I7 = struct.Struct("<" + "I" * 7)
+I9 = struct.Struct("<" + "I" * 9)
+I2 = struct.Struct("<" + "I" * 2)
+
+
+class BinaryLayer(NamedTuple):
+    """Wrapper for data from a single binary layer in an ND2 file.
+
+    `data` will have length of num_sequences, with `None` for any frames
+    that lack binary data.
+
+    Parameters
+    ----------
+    data : list of numpy.ndarray or None
+        The data for each frame. If a frame has no binary data, the value
+        will be None.  Data will have the same length as the number of sequences
+        in the file.
+    name: str
+        The name of the binary layer.
+    comp_name: str
+        The name of the associated component, if any.
+    comp_order: int
+        The order of the associated component, if any.
+    color: int
+        The color of the binary layer.
+    color_mode: int
+        The color mode of the binary layer.  I believe this is related to how colors
+        are chosen in NIS-Elements software.  Where "0" is direct color (i.e. use
+        the color value), "8" is color by 3D ... and I'm not sure about the rest :)
+    state: int
+        The state of the binary layer. (meaning still unclear)
+    file_tag: str
+        The key for the binary layer in the CustomData metadata,
+        e.g. `RleZipBinarySequence_1_v1`
+    layer_id: int
+        The ID of the binary layer.
+    coordinate_shape: tuple of int
+        The shape of the coordinates for the associated nd2 file.  `asarray`
+        uses it to reshape the stacked frames to
+        `coordinate_shape + frame_shape`.
+    """
+
+    data: List[np.ndarray | None]
+    name: str
+    comp_name: str
+    comp_order: int
+    color: int
+    color_mode: int
+    state: int
+    file_tag: str
+    layer_id: int
+    coordinate_shape: Tuple[int, ...]
+
+    @property
+    def frame_shape(self) -> Tuple[int, ...]:
+        """Shape (Y, X) of each mask in `data`.
+
+        Taken from the first frame that has data; `(0, 0)` if no frame has any.
+        """
+        return next((s.shape for s in self.data if s is not None), (0, 0))
+
+    def __array__(self) -> np.ndarray:
+        """Return the data as a numpy array.
+
+        Returns an empty (zero-length) uint16 array when no frame has any data.
+        """
+        ary = self.asarray()
+        # NOTE: `np.ndarray([])` interprets `[]` as a *shape* and yields an
+        # uninitialized 0-d array, so build a genuinely empty array instead.
+        return ary if ary is not None else np.array([], dtype="uint16")
+
+    def asarray(self) -> np.ndarray | None:
+        """Stack all the frames into a single array.
+
+        Frames without binary data are filled with zeros.  The result has shape
+        `coordinate_shape + frame_shape`.
+
+        If there are no frames with data, returns None.
+        """
+        frame_shape = self.frame_shape
+        if frame_shape == (0, 0):
+            return None
+
+        # TODO: this is a bit of a hack (takes up memory), but it works for now
+        # could do something with dask
+        d = [
+            i if i is not None else np.zeros(frame_shape, dtype="uint16")
+            for i in self.data
+        ]
+        return np.stack(d).reshape(self.coordinate_shape + frame_shape)
+
+    def __repr__(self) -> str:
+        """Return a nicely formatted string (omitting the bulky `data` field)."""
+        field_names = (f for f in self._fields if f != "data")
+        repr_fmt = "(" + ", ".join(f"{name}=%r" for name in field_names) + ")"
+        # `data` is the first field, so `self[1:]` lines up with `field_names`
+        return self.__class__.__name__ + repr_fmt % self[1:]
+
+
+class BinaryLayers(Sequence[BinaryLayer]):
+    """Sequence of Binary Layers found in an ND2 file.
+
+    This object is a sequence of `BinaryLayer` objects, one for each binary layer
+    in the file.  Each layer has a `name` attribute, and a `data` attribute that is
+    a list with one entry per frame in the experiment: a numpy array, or None if
+    the layer was not present in that frame.
+
+    The wrapper can be cast to a numpy array (with `BinaryLayers.asarray()` or
+    np.asarray(BinaryLayers)) to stack all the layers into a single array.  The output
+    array will have shape (n_layers, *coord_shape, *frame_shape).
+    """
+
+    def __init__(self, data: list[BinaryLayer]) -> None:
+        self._data = data
+
+    @overload
+    def __getitem__(self, key: int) -> BinaryLayer:
+        ...
+
+    @overload
+    def __getitem__(self, key: slice) -> List[BinaryLayer]:
+        ...
+
+    def __getitem__(self, key: int | slice) -> BinaryLayer | List[BinaryLayer]:
+        return self._data[key]
+
+    def __iter__(self) -> Iterator[BinaryLayer]:
+        return iter(self._data)
+
+    def __len__(self) -> int:
+        return len(self._data)
+
+    def __repr__(self) -> str:
+        return f"<{type(self).__name__} with {len(self)} layers>"
+
+    def __array__(self) -> np.ndarray:
+        """Compatibility with np.asarray(BinaryLayers)."""
+        return self.asarray()
+
+    def asarray(self) -> np.ndarray:
+        """Stack all the layers/frames into a single array.
+
+        The output array will have shape (n_layers, *coord_shape, *frame_shape).
+        Layers that have no data in any frame are skipped, so the first axis may
+        be shorter than `len(self)`.  If no layer has data, an empty uint16 array
+        is returned.
+        """
+        stacked = (layer.asarray() for layer in self._data)
+        out = [d for d in stacked if d is not None]
+        if not out:
+            # np.stack raises ValueError on an empty list
+            return np.array([], dtype="uint16")
+        return np.stack(out)
+
+    @staticmethod
+    def _chunk_sort_key(chunk_name: str) -> Tuple[str, int]:
+        """Sort key ordering chunk names by prefix, then by numeric frame index.
+
+        Chunk names look like `CustomDataSeq|RleZipBinarySequence_1_v1|12`.  A
+        plain string sort would place frame 10 before frame 2, scrambling the
+        frame order for files with more than 10 frames.
+        """
+        prefix, _, index = chunk_name.rpartition("|")
+        try:
+            # tolerate a trailing "!" terminator, should one be present
+            return (prefix, int(index.rstrip("!")))
+        except ValueError:
+            # unexpected name format: fall back to plain lexicographic ordering
+            return (chunk_name, -1)
+
+    @classmethod
+    def from_nd2file(cls, nd2file: ND2File) -> BinaryLayers | None:
+        """Extract binary layers from an ND2 file.
+
+        Returns None (after emitting a warning where appropriate) if the file is
+        a legacy file, has no `BinaryMetadata_v1` entry in its custom data, or
+        lacks the expected `BinaryItem` tag.
+        """
+        if nd2file.is_legacy:
+            warnings.warn(
+                "`binary_data` is not supported for legacy ND2 files", UserWarning
+            )
+            return None
+        rdr = cast("LatestSDKReader", nd2file._rdr)
+
+        binary_meta = nd2file.custom_data.get("BinaryMetadata_v1")
+        if binary_meta is None:
+            return None
+        try:
+            items: List[dict] = binary_meta["BinaryMetadata_v1"]["BinaryItem"]
+        except KeyError:
+            warnings.warn(
+                "Could not find 'BinaryMetadata_v1->BinaryItem' tag, please open an "
+                "issue with this file at https://github.com/tlambert03/nd2/issues/new",
+            )
+            return None
+        # a file with a single binary layer stores a lone dict rather than a list
+        if isinstance(items, dict):
+            items = [items]
+
+        # order chunks by *numeric* frame index so that `data[i]` is frame `i`
+        binseqs = sorted(
+            (x for x in rdr._meta_map if "RleZipBinarySequence" in x),
+            key=cls._chunk_sort_key,
+        )
+        mask_items = []
+        for item in items:
+            key = item["FileTag"]
+            _masks: List[np.ndarray | None] = []
+            for bs in binseqs:
+                if key in bs:
+                    # the first 4 bytes of the chunk are not part of the payload
+                    data = rdr._get_meta_chunk(bs)[4:]
+                    _masks.append(_decode_binary_mask(data) if data else None)
+            mask_items.append(
+                BinaryLayer(
+                    data=_masks,
+                    name=item["Name"],
+                    comp_name=item["CompName"],
+                    comp_order=item["CompOrder"],
+                    color_mode=item["ColorMode"],
+                    state=item["State"],
+                    color=item["Color"],
+                    file_tag=key,
+                    layer_id=item["BinLayerID"],
+                    coordinate_shape=nd2file._coord_shape,
+                )
+            )
+
+        return cls(mask_items)
+
+
+def _unpack(stream: io.BufferedIOBase, strct: struct.Struct):
+    """Read `strct.size` bytes from `stream` and decode them with `strct`.
+
+    Returns the tuple of unpacked values; the stream position advances past
+    the bytes that were consumed.
+    """
+    raw = stream.read(strct.size)
+    return strct.unpack(raw)
+
+
+def _decode_binary_mask(data: bytes, dtype="uint16") -> np.ndarray:
+    """Decode one zlib-compressed, run-length-encoded binary mask.
+
+    Parameters
+    ----------
+    data : bytes
+        zlib-compressed payload of a `CustomDataSeq|RleZipBinarySequence...`
+        chunk, with the leading 4 bytes of the chunk already removed.
+    dtype : numpy dtype-like
+        dtype of the returned array.
+
+    Returns
+    -------
+    numpy.ndarray
+        Array of shape (nrows, ncols).  Pixels covered by a run hold the id of
+        the ROI the run belongs to; all other pixels are 0.
+    """
+    # this receives data as would be extracted from a
+    # `CustomDataSeq|RleZipBinarySequence...` section in the metadata
+    # data = f._rdr._get_meta_chunk('CustomDataSeq|RleZipBinarySequence_1_v1|0')[4:]
+
+    # NOTE it is up to ND2File to strip the first 4 bytes... and not call this if there
+    # is no data (i.e. if the chunk is just '\x00')
+    import zlib
+
+    decomp = zlib.decompress(data)
+    stream = io.BytesIO(decomp)
+
+    # file header: 7 unsigned 32-bit ints
+    # still not sure what _q is
+    # tot_bytes should be length of the stream remaining after this
+    (v, ncols, nrows, nmasks, tot_bytes, _q, _zero) = _unpack(stream, I7)
+    if v != 3:
+        warnings.warn(
+            f"Expected first byte to be 3 but got {v}. "
+            "Please submit this file :) https://github.com/tlambert03/nd2/issues/."
+        )
+
+    output = np.zeros((nrows, ncols), dtype=dtype)
+    for _m in range(nmasks):
+        # per-mask header: 9 unsigned 32-bit ints; example values:
+        # (1,     1,  0, 15, 11,       412,      12, 396, 0)
+        (roi_id, c0, r0, c1, r1, roi_bytes, maskrows, _y, _zero) = _unpack(stream, I9)
+        for _r in range(maskrows):
+            # each encoded row: its row index and the number of runs that follow
+            (row, nruns) = _unpack(stream, I2)
+            for _s in range(nruns):
+                # each run: starting column and length; paint it with the ROI id
+                (col, n) = _unpack(stream, I2)
+                output[row, col : col + n] = roi_id  # noqa: E203
+
+    return output