From 1858e08b5aa089033a9f0facdecc6e9cac30469d Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Tue, 1 Oct 2024 14:44:52 +0200 Subject: [PATCH] Add better support for cparams in Proxy and C2Array instances --- doc/reference/c2array.rst | 2 ++ doc/reference/ndarray.rst | 4 +++ doc/reference/proxy.rst | 4 +++ examples/ndarray/proxy-carray.py | 54 +++++++++++++++++++++++++++++ examples/ndarray/proxy-ndarray.py | 56 +++++++++++++++++++++++++++++++ src/blosc2/c2array.py | 14 ++++++++ src/blosc2/ndarray.py | 20 +++++++++++ src/blosc2/proxy.py | 20 ++++++++++- src/blosc2/storage.py | 4 +++ 9 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 examples/ndarray/proxy-carray.py create mode 100644 examples/ndarray/proxy-ndarray.py diff --git a/doc/reference/c2array.rst b/doc/reference/c2array.rst index fa9fb13f..152573df 100644 --- a/doc/reference/c2array.rst +++ b/doc/reference/c2array.rst @@ -28,6 +28,8 @@ Attributes chunks blocks dtype + cparams + urlpath .. _URLPath: diff --git a/doc/reference/ndarray.rst b/doc/reference/ndarray.rst index 961f0b0f..a6952372 100644 --- a/doc/reference/ndarray.rst +++ b/doc/reference/ndarray.rst @@ -45,6 +45,10 @@ Attributes info schunk size + cparams + dparams + urlpath + vlmeta .. currentmodule:: blosc2 diff --git a/doc/reference/proxy.rst b/doc/reference/proxy.rst index 323ee722..76e57959 100644 --- a/doc/reference/proxy.rst +++ b/doc/reference/proxy.rst @@ -29,6 +29,10 @@ Attributes :toctree: autofiles/proxy shape + chunks + blocks dtype + cparams + info fields vlmeta diff --git a/examples/ndarray/proxy-carray.py b/examples/ndarray/proxy-carray.py new file mode 100644 index 00000000..e7b3282d --- /dev/null +++ b/examples/ndarray/proxy-carray.py @@ -0,0 +1,54 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# This source code is licensed under a BSD-style license (found in the +# LICENSE file in the root directory of this source tree) +####################################################################### + +# Shows how you can make a proxy of a remote array (served with Caterva2) on disk +# Note that, for running this example, you will need the blosc2-grok package. + +import os +from time import time + +import blosc2 + +urlbase = "https://demo.caterva2.net/" +path = "example/lung-jpeg2000_10x.b2nd" +a = blosc2.C2Array(path, urlbase=urlbase) +blosc2.remove_urlpath("proxy.b2nd") +b = blosc2.Proxy(a, urlpath="proxy.b2nd") + +# Check metadata (note that all should be the same) +print("*** Metadata ***") +print(f"Codec in 'a': {a.cparams.codec}") +print(f"Codec in 'b': {b.cparams.codec}") +print(f"Filters in 'a': {a.cparams.filters}") +print(f"Filters in 'b': {b.cparams.filters}") + +# Check array properties +print("*** Array properties ***") +print(f"Shape in 'a': {a.shape}") +print(f"Shape in 'b': {b.shape}") +print(f"Type in 'a': {a.dtype}") +print(f"Type in 'b': {b.dtype}") + +print("*** Fetching data ***") +t0 = time() +print(f"Data in 'a': {a[0, 0, 0:10]}") +print(f"Time to fetch data in 'a': {time() - t0:.3f}s") +t0 = time() +print(f"Data in 'b': {b[0, 0, 0:10]}") +print(f"Time to fetch data in 'b': {time() - t0:.3f}s") +t0 = time() +print(f"Data in 'b': {b[0, 0, 0:10]}") +print(f"Time to fetch data in 'b' (cached): {time() - t0:.3f}s") + +# Check sizes. Note that the proxy will only have the 'touched' chunks (only 1 in this case) +print("*** Sizes ***") +print(f"Size in 'a': {a.meta['schunk']['cbytes']}") +print(f"Size in 'b': {b.schunk.cbytes}") +# Check sizes on disk +print("*** Disk sizes ***") +print(f"Size 'b' (disk): {os.stat(b.urlpath).st_size}") diff --git a/examples/ndarray/proxy-ndarray.py b/examples/ndarray/proxy-ndarray.py new file mode 100644 index 00000000..58aee9b0 --- /dev/null +++ b/examples/ndarray/proxy-ndarray.py @@ -0,0 +1,56 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# This source code is licensed under a BSD-style license (found in the +# LICENSE file in the root directory of this source tree) +####################################################################### + +# Shows how you can make a proxy of a local array on disk. + +import os + +import blosc2 + +cparams = blosc2.CParams( + clevel=5, codec=blosc2.Codec.LZ4, filters=[blosc2.Filter.BITSHUFFLE], filters_meta=[0] +) +cwd = os.getcwd() +a = blosc2.full((128, 128), 1, dtype="float64", urlpath=f"{cwd}/a.b2nd", mode="w", cparams=cparams) +blosc2.remove_urlpath(f"{cwd}/proxy.b2nd") +b = blosc2.Proxy(a, urlpath=f"{cwd}/proxy.b2nd") + +# Check metadata +print("*** Metadata ***") +print(f"Codec in 'a': {a.cparams.codec}") +print(f"Codec in 'b': {b.cparams.codec}") +print(f"Clevel in 'a': {a.cparams.clevel}") +print(f"Clevel in 'b': {b.cparams.clevel}") +print(f"Filters in 'a': {a.cparams.filters}") +print(f"Filters in 'b': {b.cparams.filters}") + +# Check array properties +print("*** Array properties ***") +print(f"Shape in 'a': {a.shape}") +print(f"Shape in 'b': {b.shape}") +print(f"Type in 'a': {a.dtype}") +print(f"Type in 'b': {b.dtype}") + +# Check data +print("*** Fetching data ***") +print(f"Data in 'a': {a[0, 0:10]}") +print(f"Data in 'b': {b[0, 0:10]}") + +# Check sizes. Note that the proxy will only have the 'touched' chunks (only 1 in this case) +print("*** Sizes ***") +print(f"Size in 'a': {a.schunk.cbytes}") +print(f"Size in 'b': {b.schunk.cbytes}") +# Check sizes on disk +print("*** Disk sizes ***") +print(f"Size 'a' (disk): {os.stat(a.urlpath).st_size}") +print(f"Size 'b' (disk): {os.stat(b.urlpath).st_size}") + +# Check vlmeta +print("*** VLmeta ***") +print(f"VLmeta in 'a': {list(a.vlmeta)}") +print(f"VLmeta in 'b': {list(b.vlmeta)}") diff --git a/src/blosc2/c2array.py b/src/blosc2/c2array.py index bf088881..5cca9132 100644 --- a/src/blosc2/c2array.py +++ b/src/blosc2/c2array.py @@ -235,6 +235,10 @@ def __init__(self, path: str, /, urlbase: str = None, auth_token: str = None): self.meta = info(self.path, self.urlbase, auth_token=self.auth_token) except httpx.HTTPStatusError as err: raise FileNotFoundError(f"Remote path not found: {path}.\nError was: {err}") from err + cparams = self.meta["schunk"]["cparams"] + # Remove "filters, meta" from cparams; this is an artifact from the server + cparams.pop("filters, meta", None) + self._cparams = blosc2.CParams(**cparams) def __getitem__(self, slice_: int | slice | Sequence[slice]) -> np.ndarray: """ @@ -322,6 +326,16 @@ def dtype(self) -> np.dtype: """The dtype of the remote array""" return np.dtype(self.meta["dtype"]) + @property + def cparams(self) -> blosc2.CParams: + """The compression parameters of the remote array""" + return self._cparams + + @property + def urlpath(self) -> str: + """The URL path of the remote array""" + return self.path + class URLPath: def __init__(self, path: str, /, urlbase: str = None, auth_token: str = None): diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index 8d503076..e81aa230 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -791,6 +791,26 @@ def __init__(self, **kwargs): for field in self.dtype.fields: self._fields[field] = NDField(self, field) + @property + def cparams(self) -> dict: + """The compression parameters used by the array.""" + return self.schunk.cparams + + @property + def dparams(self) -> dict: + """The decompression parameters used by the array.""" + return self.schunk.dparams + + @property + def urlpath(self) -> str: + """The URL path of the array.""" + return self.schunk.urlpath + + @property + def vlmeta(self) -> dict: + """The variable-length metadata of the array.""" + return self.schunk.vlmeta + @property def fields(self) -> dict: """ diff --git a/src/blosc2/proxy.py b/src/blosc2/proxy.py index 0d34269e..ab7b2ba4 100644 --- a/src/blosc2/proxy.py +++ b/src/blosc2/proxy.py @@ -209,14 +209,15 @@ def __init__(self, src: ProxySource or ProxyNDSource, urlpath: str = None, **kwa self.src.dtype, chunks=self.src.chunks, blocks=self.src.blocks, + cparams=self.src.cparams, urlpath=urlpath, meta=meta, ) else: self._cache = blosc2.SChunk( chunksize=self.src.chunksize, + cparams=self.src.cparams, urlpath=urlpath, - cparams={"typesize": self.src.typesize}, meta=meta, ) self._cache.fill_special(self.src.nbytes // self.src.typesize, blosc2.SpecialValue.UNINIT) @@ -408,6 +409,23 @@ def shape(self) -> tuple[int]: """The shape of :paramref:`self`""" return self._cache.shape if isinstance(self._cache, blosc2.NDArray) else len(self._cache) + @property + def schunk(self) -> blosc2.schunk.SChunk: + """The :ref:`SChunk` of the cache""" + return self._schunk_cache + + @property + def cparams(self) -> blosc2.CParams: + """The compression parameters of the cache""" + return self._cache.cparams + + @property + def info(self) -> str: + """The info of the cache""" + if isinstance(self._cache, blosc2.NDArray): + return self._cache.info + raise NotImplementedError("info is only available if the source is a NDArray") + def __str__(self): return f"Proxy({self.src}, urlpath={self.urlpath})" diff --git a/src/blosc2/storage.py b/src/blosc2/storage.py index 8bd4d90b..1653c331 100644 --- a/src/blosc2/storage.py +++ b/src/blosc2/storage.py @@ -82,6 +82,8 @@ class CParams: tuner: blosc2.Tuner = blosc2.Tuner.STUNE def __post_init__(self): + if not isinstance(self.codec, blosc2.Codec): + self.codec = blosc2.Codec(self.codec) if len(self.filters) > 6: raise ValueError("Number of filters exceeds 6") if len(self.filters) < len(self.filters_meta): @@ -91,6 +93,8 @@ def __post_init__(self): raise ValueError("Number of filters cannot exceed number of filters meta") for i in range(len(self.filters)): + if not isinstance(self.filters[i], blosc2.Filter): + self.filters[i] = blosc2.Filter(self.filters[i]) if self.filters_meta[i] == 0 and self.filters[i] == blosc2.Filter.BYTEDELTA: self.filters_meta[i] = self.typesize