From 97c4d4c9b4e4ba0fbab4c4c0499e0528ab6696dc Mon Sep 17 00:00:00 2001 From: oumaima-ech-chdig Date: Wed, 18 Sep 2024 13:37:17 +0200 Subject: [PATCH 01/18] Proxy examples --- src/blosc2/proxy.py | 123 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 109 insertions(+), 14 deletions(-) diff --git a/src/blosc2/proxy.py b/src/blosc2/proxy.py index 2e1d5cd2..9905bcda 100644 --- a/src/blosc2/proxy.py +++ b/src/blosc2/proxy.py @@ -8,7 +8,6 @@ from abc import ABC, abstractmethod import blosc2 -import numpy as np class ProxySource(ABC): @@ -25,7 +24,7 @@ class ProxySource(ABC): """ @abstractmethod - def get_chunk(self, nchunk: int) -> bytes: + def get_chunk(self, nchunk): """ Return the compressed chunk in :paramref:`self`. @@ -49,7 +48,7 @@ class Proxy(blosc2.Operand): which follows the :ref:`ProxySource` interface in an urlpath. """ - def __init__(self, src: ProxySource, urlpath: str = None, **kwargs: dict): + def __init__(self, src, urlpath=None, **kwargs): """ Create a new :ref:`Proxy` to serve like a cache to save accessed chunks locally. @@ -57,7 +56,7 @@ def __init__(self, src: ProxySource, urlpath: str = None, **kwargs: dict): Parameters ---------- src: :ref:`ProxySource` - The original container. + The original container urlpath: str, optional The urlpath where to save the container that will work as a cache. @@ -116,7 +115,7 @@ def __init__(self, src: ProxySource, urlpath: str = None, **kwargs: dict): for key in vlmeta: self._schunk_cache.vlmeta[key] = vlmeta[key] - def fetch(self, item: slice | list[slice] = None) -> blosc2.NDArray | blosc2.schunk.SChunk: + def fetch(self, item=None): """ Get the container used as cache with the requested data updated. 
@@ -166,7 +165,7 @@ def fetch(self, item: slice | list[slice] = None) -> blosc2.NDArray | blosc2.sch return self._cache - async def afetch(self, item: slice | list[slice] = None) -> blosc2.NDArray | blosc2.schunk.SChunk: + async def afetch(self, item=None): """ Get the container used as cache with the requested data updated in an asynchronous way. @@ -186,6 +185,50 @@ async def afetch(self, item: slice | list[slice] = None) -> blosc2.NDArray | blo ----- This method is only available if the :ref:`ProxySource` has an async `aget_chunk` method. + + Examples + -------- + >>> import numpy as np + >>> import blosc2 + >>> import asyncio + >>> class MyProxySource: + >>> def __init__(self, data): + >>> # If the source is multidimensional, it must have the attributes: + >>> self.data = data + >>> self.shape = data.shape + >>> self.chunks = [data.shape[i] for i in range(data.ndim)] + >>> self.blocks = [data.shape[i] for i in range(data.ndim)] + >>> self.dtype = data.dtype + >>> f"Data shape: {self.shape}, Chunks: {self.chunks}" + Data shape: (4, 5), Chunks: [4, 5] + >>> f"Blocks: {self.blocks}, Dtype: {self.dtype}" + Blocks: [4, 5], Dtype: int64 + >>> async def aget_chunk(self, nchunk): + >>> await asyncio.sleep(0.1) + >>> return self.data[nchunk] + >>> # Class that inherits from blosc2.Proxy + >>> class MyProxy(blosc2.Proxy): + >>> def __init__(self, source): + >>> super().__init__(source) + >>> self._cache = source.data + >>> # Asynchronous method to get the cache data + >>> async def afetch(self, slice_=None): + >>> return self._cache if slice_ is None else self._cache[slice_] + >>> data = np.arange(20).reshape(4, 5) + >>> source = MyProxySource(data) + >>> proxy = MyProxy(source) + >>> async def fetch_data(): + >>> full_data = await proxy.afetch() + >>> f"Full data cache: {full_data[:]}" + Full data cache: [[ 0 1 2 3 4] + [ 5 6 7 8 9] + [10 11 12 13 14] + [15 16 17 18 19]] + >>> slice_data = await proxy.afetch(slice(0, 2)) + >>> f"Slice data cache: {slice_data[:]}" + 
Slice data cache: [[0 1 2 3 4] + [5 6 7 8 9]] + >>> asyncio.run(fetch_data()) """ if not callable(getattr(self.src, "aget_chunk", None)): raise NotImplementedError("afetch is only available if the source has an aget_chunk method") @@ -205,7 +248,7 @@ async def afetch(self, item: slice | list[slice] = None) -> blosc2.NDArray | blo return self._cache - def __getitem__(self, item: slice | list[slice]) -> np.ndarray: + def __getitem__(self, item): """ Get a slice as a numpy.ndarray using the :ref:`Proxy`. @@ -244,31 +287,63 @@ def __getitem__(self, item: slice | list[slice]) -> np.ndarray: return self._cache[item] @property - def dtype(self) -> np.dtype: - """The dtype of :paramref:`self` or None if the data is unidimensional""" + def dtype(self): + """The dtype of :paramref:`self` or None if the data is unidimensional + + Examples + -------- + >>> import numpy as np + >>> import blosc2 + >>> data = np.arange(100).reshape(10, 10) + >>> ndarray = blosc2.asarray(data) + >>> proxy = blosc2.Proxy(ndarray) + >>> proxy.dtype + dtype('int64') + """ return self._cache.dtype if isinstance(self._cache, blosc2.NDArray) else None @property - def shape(self) -> tuple[int]: - """The shape of :paramref:`self`""" + def shape(self): + """The shape of :paramref:`self` + + Examples + -------- + >>> import numpy as np + >>> import blosc2 + >>> data = np.arange(100).reshape(10, 10) + >>> ndarray = blosc2.asarray(data) + >>> proxy = blosc2.Proxy(ndarray) + >>> proxy.shape + (10, 10) + """ return self._cache.shape if isinstance(self._cache, blosc2.NDArray) else len(self._cache) def __str__(self): return f"Proxy({self.src}, urlpath={self.urlpath})" @property - def vlmeta(self) -> blosc2.schunk.vlmeta: + def vlmeta(self): """ Get the vlmeta of the cache. 
See Also -------- :ref:`SChunk.vlmeta` + + Examples + -------- + >>> import numpy as np + >>> import blosc2 + >>> data = np.arange(100).reshape(10, 10) + >>> ndarray = blosc2.asarray(data) + >>> proxy = blosc2.Proxy(ndarray) + >>> f"VLMeta data: {proxy.vlmeta}" + VLMeta data: """ return self._schunk_cache.vlmeta @property - def fields(self) -> dict: + def fields(self): """ Dictionary with the fields of :paramref:`self`. @@ -280,6 +355,26 @@ def fields(self) -> dict: See Also -------- :ref:`NDField` + + Examples + -------- + >>> import numpy as np + >>> import blosc2 + >>> data = np.zeros(16, dtype=[('field1', 'i4'), ('field2', 'f4')]).reshape(4, 4) + >>> ndarray = blosc2.asarray(data) + >>> proxy = blosc2.Proxy(ndarray) + >>> # Get a dictionary of fields from the proxy, where each field can be accessed individually + >>> fields_dict = proxy.fields + >>> for field_name, field_proxy in fields_dict.items(): + >>> f"Field name: {field_name}, Field data: {field_proxy}" + Field name: field1, Field data: + Field name: field2, Field data: + >>> field1_data = fields_dict['field1'][:] + >>> field1_data + [[0 0 0 0] + [0 0 0 0] + [0 0 0 0] + [0 0 0 0]] """ _fields = getattr(self._cache, "fields", None) if _fields is None: @@ -294,7 +389,7 @@ def __init__(self, proxy: Proxy, field: str): self.shape = proxy.shape self.dtype = proxy.dtype - def __getitem__(self, item: slice | list[slice]) -> np.ndarray: + def __getitem__(self, item: slice): """ Get a slice as a numpy.ndarray using the `field` in `proxy`. From 6ef1029b4c0639f25f4e96a1d65920c90a0b411a Mon Sep 17 00:00:00 2001 From: oumaima-ech-chdig Date: Thu, 19 Sep 2024 13:24:16 +0200 Subject: [PATCH 02/18] Proxy examples: first corrections. 
--- src/blosc2/proxy.py | 123 +++++++++++++++++--------------------------- 1 file changed, 48 insertions(+), 75 deletions(-) diff --git a/src/blosc2/proxy.py b/src/blosc2/proxy.py index 9905bcda..9f566bc1 100644 --- a/src/blosc2/proxy.py +++ b/src/blosc2/proxy.py @@ -137,17 +137,12 @@ def fetch(self, item=None): >>> data = np.arange(20).reshape(10, 2) >>> ndarray = blosc2.asarray(data) >>> proxy = blosc2.Proxy(ndarray) - >>> full_data = proxy.fetch() - >>> f"Full data cache: {full_data[:]}" - Full data cache: - [[ 0 1][ 2 3][ 4 5] - [ 6 7][ 8 9][10 11] - [12 13][14 15][16 17] - [18 19]] - >>> slice_data = proxy[0:2, :] - >>> f"Slice data cache: {slice_data}" + >>> slice_data = proxy.fetch((slice(0, 3), slice(0, 2))) + >>> f"Slice data cache: {slice_data[:3, :2]}" Slice data cache: - [[0 1][2 3]] + [[0 1] + [2 3] + [4 5]] """ if item is None: # Full realization @@ -192,42 +187,50 @@ async def afetch(self, item=None): >>> import blosc2 >>> import asyncio >>> class MyProxySource: - >>> def __init__(self, data): - >>> # If the source is multidimensional, it must have the attributes: - >>> self.data = data - >>> self.shape = data.shape - >>> self.chunks = [data.shape[i] for i in range(data.ndim)] - >>> self.blocks = [data.shape[i] for i in range(data.ndim)] - >>> self.dtype = data.dtype - >>> f"Data shape: {self.shape}, Chunks: {self.chunks}" - Data shape: (4, 5), Chunks: [4, 5] - >>> f"Blocks: {self.blocks}, Dtype: {self.dtype}" - Blocks: [4, 5], Dtype: int64 - >>> async def aget_chunk(self, nchunk): - >>> await asyncio.sleep(0.1) - >>> return self.data[nchunk] - >>> # Class that inherits from blosc2.Proxy + >>> def __init__(self, data, chunk, block): + >>> # If the next source is multidimensional, it must have the attributes: + >>> self.data = data + >>> self.shape = data.shape + >>> self.chunks = chunk + >>> self.blocks = block + >>> self.dtype = data.dtype + >>> f"Data shape: {self.shape}, Chunks: {self.chunks}" + Data shape: (4, 5), Chunks: [2, 2] + >>> f"Blocks: 
{self.blocks}, Dtype: {self.dtype}" + Blocks: [2, 2], Dtype: int64 + >>> # This method must be present. + >>> async def aget_chunk(self, nchunk): + >>> await asyncio.sleep(0.1) + >>> return self.data[nchunk] >>> class MyProxy(blosc2.Proxy): - >>> def __init__(self, source): - >>> super().__init__(source) - >>> self._cache = source.data - >>> # Asynchronous method to get the cache data - >>> async def afetch(self, slice_=None): - >>> return self._cache if slice_ is None else self._cache[slice_] + >>> def __init__(self, source): + >>> super().__init__(source) + >>> self._cache = source.data + >>> # Asynchronous method to get the cache data + >>> async def afetch(self, slice_=None): + >>> return self._cache if slice_ is None else self._cache[slice_] >>> data = np.arange(20).reshape(4, 5) - >>> source = MyProxySource(data) + >>> chunk = [2, 2] + >>> block = [2, 2] + >>> source = MyProxySource(data, chunk, block) >>> proxy = MyProxy(source) >>> async def fetch_data(): - >>> full_data = await proxy.afetch() - >>> f"Full data cache: {full_data[:]}" - Full data cache: [[ 0 1 2 3 4] - [ 5 6 7 8 9] - [10 11 12 13 14] - [15 16 17 18 19]] + >>> # Fetch a slice of the data from the proxy asynchronously >>> slice_data = await proxy.afetch(slice(0, 2)) >>> f"Slice data cache: {slice_data[:]}" - Slice data cache: [[0 1 2 3 4] + Slice data cache: + [[0 1 2 3 4] [5 6 7 8 9]] + >>> # Note: It is not possible to print the intermediate state of the proxy + >>> # because accessing the proxy causes the entire data to be copied. 
+ >>> # Fetch the full data from the proxy asynchronously + >>> full_data = await proxy.afetch() + >>> f"Full data cache: {full_data[:]}" + Full data cache: + [[ 0 1 2 3 4] + [ 5 6 7 8 9] + [10 11 12 13 14] + [15 16 17 18 19]] >>> asyncio.run(fetch_data()) """ if not callable(getattr(self.src, "aget_chunk", None)): @@ -266,21 +269,21 @@ def __getitem__(self, item): -------- >>> import numpy as np >>> import blosc2 - >>> data = np.arange(100).reshape(10, 10) + >>> data = np.arange(25).reshape(5, 5) >>> ndarray = blosc2.asarray(data) >>> proxy = blosc2.Proxy(ndarray) >>> slice_1 = proxy[0:3, 0:3] >>> f"Slice 1: {slice_1}" Slice 1: [[ 0 1 2] + [ 5 6 7] [10 11 12] - [20 21 22]] - >>> slice_2 = proxy[5:8, 2:5] + >>> slice_2 = proxy[2:5, 2:5] >>> f"Slice 2: {slice_2}" Slice 2: - [[52 53 54] - [62 63 64] - [72 73 74]] + [[12 13 14] + [17 18 19] + [22 23 24]] """ # Populate the cache self.fetch(item) @@ -289,32 +292,12 @@ def __getitem__(self, item): @property def dtype(self): """The dtype of :paramref:`self` or None if the data is unidimensional - - Examples - -------- - >>> import numpy as np - >>> import blosc2 - >>> data = np.arange(100).reshape(10, 10) - >>> ndarray = blosc2.asarray(data) - >>> proxy = blosc2.Proxy(ndarray) - >>> proxy.dtype - dtype('int64') """ return self._cache.dtype if isinstance(self._cache, blosc2.NDArray) else None @property def shape(self): """The shape of :paramref:`self` - - Examples - -------- - >>> import numpy as np - >>> import blosc2 - >>> data = np.arange(100).reshape(10, 10) - >>> ndarray = blosc2.asarray(data) - >>> proxy = blosc2.Proxy(ndarray) - >>> proxy.shape - (10, 10) """ return self._cache.shape if isinstance(self._cache, blosc2.NDArray) else len(self._cache) @@ -329,16 +312,6 @@ def vlmeta(self): See Also -------- :ref:`SChunk.vlmeta` - - Examples - -------- - >>> import numpy as np - >>> import blosc2 - >>> data = np.arange(100).reshape(10, 10) - >>> ndarray = blosc2.asarray(data) - >>> proxy = blosc2.Proxy(ndarray) - >>> 
f"VLMeta data: {proxy.vlmeta}" - VLMeta data: """ return self._schunk_cache.vlmeta From f016ee887927805f85406068d56c71ed497e8f30 Mon Sep 17 00:00:00 2001 From: oumaima-ech-chdig Date: Mon, 23 Sep 2024 13:57:25 +0200 Subject: [PATCH 03/18] Proxy examples: second corrections. --- src/blosc2/c2array.py | 48 +++++++++++++++++++++++++++ src/blosc2/proxy.py | 77 +++++++++++++++++++++++-------------------- 2 files changed, 89 insertions(+), 36 deletions(-) diff --git a/src/blosc2/c2array.py b/src/blosc2/c2array.py index 23a68def..17e74c80 100644 --- a/src/blosc2/c2array.py +++ b/src/blosc2/c2array.py @@ -198,6 +198,24 @@ def __init__(self, path: str, /, urlbase: str = None, auth_token: str = None): ------- out: C2Array + Examples + -------- + >>> import blosc2 + >>> import pathlib + >>> host = "https://demo.caterva2.net/" + >>> root = "b2tests" + >>> dir = "expr/" + >>> name = "ds-0-10-linspace-float64-(True, True)-a1-(60, 60)d.b2nd" + >>> path = pathlib.Path(f"{root}/{dir + name}").as_posix() + >>> remote_array = blosc2.C2Array(path, urlbase=host) + >>> f"Shape of the remote array: {remote_array.shape}" + >>> f"Chunks of the remote array: {remote_array.chunks}" + >>> f"Blocks of the remote array: {remote_array.blocks}" + >>> f"Dtype of the remote array: {remote_array.dtype}" + Shape of the remote array: (60, 60) + Chunks of the remote array: (30, 60) + Blocks of the remote array: (10, 60) + Dtype of the remote array: float64 """ if path.startswith("/"): raise ValueError("The path should start with a root name, not a slash") @@ -252,6 +270,36 @@ def get_chunk(self, nchunk: int) -> bytes: ------- out: bytes The requested compressed chunk. 
+ + Examples + -------- + >>> import pathlib + >>> import numpy as np + >>> import blosc2 + >>> host = "https://demo.caterva2.net/" + >>> root = "b2tests" + >>> dir = "expr/" + >>> root = "b2tests" + >>> dir = "expr/" + >>> name1 = "ds-0-10-linspace-float64-(True, True)-a1-(60, 60)d.b2nd" + >>> name2 = "ds-0-10-linspace-float64-(True, True)-a2-(60, 60)d.b2nd" + >>> path1 = pathlib.Path(f"{root}/{dir + name1}").as_posix() + >>> path2 = pathlib.Path(f"{root}/{dir + name2}").as_posix() + >>> a = blosc2.C2Array(path1, host) + >>> b = blosc2.C2Array(path2, host) + >>> c = a + b + >>> # Get the compressed chunk from array 'a' for index 0 + >>> chunk_index = 0 + >>> compressed_chunk = c.get_chunk(chunk_index) + >>> f"Size of chunk {chunk_index} from a: {len(compressed_chunk)} bytes" + Size of chunk 0 from 'a': 8604 bytes + >>> # Decompress the chunk and convert it to a NumPy array + >>> decompressed_chunk = blosc2.decompress(compressed_chunk) + >>> chunk_np_array = np.frombuffer(decompressed_chunk, dtype=a.dtype) + >>> f"Content of chunk {chunk_index} as NumPy array:{chunk_np_array}" + Content of chunk 0 as NumPy array: + [0.00000000e+00 5.55709919e-03 1.11141984e-02 ... 9.98610725e+00 + 9.99166435e+00 9.99722145e+00] """ url = _sub_url(self.urlbase, f"api/chunk/{self.path}") params = {"nchunk": nchunk} diff --git a/src/blosc2/proxy.py b/src/blosc2/proxy.py index 7e92087d..828b6ab8 100644 --- a/src/blosc2/proxy.py +++ b/src/blosc2/proxy.py @@ -10,8 +10,6 @@ import blosc2 import numpy as np -import blosc2 - class ProxyNDSource(ABC): """ @@ -51,7 +49,7 @@ def dtype(self) -> np.dtype: pass @abstractmethod - def get_chunk(self, nchunk): + def get_chunk(self, nchunk: int) -> bytes: """ Return the compressed chunk in :paramref:`self`. 
@@ -227,7 +225,7 @@ def __init__(self, src: ProxySource or ProxyNDSource, urlpath: str = None, **kwa for key in vlmeta: self._schunk_cache.vlmeta[key] = vlmeta[key] - def fetch(self, item=None): + def fetch(self, item: slice | list[slice] = None) -> blosc2.NDArray | blosc2.schunk.SChunk: """ Get the container used as cache with the requested data updated. @@ -272,7 +270,7 @@ def fetch(self, item=None): return self._cache - async def afetch(self, item=None): + async def afetch(self, item: slice | list[slice] = None) -> blosc2.NDArray | blosc2.schunk.SChunk: """ Get the container used as cache with the requested data updated in an asynchronous way. @@ -299,44 +297,51 @@ async def afetch(self, item=None): >>> import blosc2 >>> import asyncio >>> class MyProxySource: - >>> def __init__(self, data, chunk, block): + >>> def __init__(self, data): >>> # If the next source is multidimensional, it must have the attributes: >>> self.data = data - >>> self.shape = data.shape - >>> self.chunks = chunk - >>> self.blocks = block - >>> self.dtype = data.dtype >>> f"Data shape: {self.shape}, Chunks: {self.chunks}" - Data shape: (4, 5), Chunks: [2, 2] + Data shape: (4, 5), Chunks: [2, 5] >>> f"Blocks: {self.blocks}, Dtype: {self.dtype}" - Blocks: [2, 2], Dtype: int64 - >>> # This method must be present. 
+ Blocks: [1, 5], Dtype: int64 + >>> @property + >>> def shape(self): + >>> return self.data.shape + >>> @property + >>> def chunks(self): + >>> return self.data.chunks + >>> @property + >>> def blocks(self): + >>> return self.data.blocks + >>> @property + >>> def dtype(self): + >>> return self.data.dtype + >>> # This method must be present + >>> def get_chunk(self, nchunk): + >>> return self.data.get_chunk(nchunk) + >>> # This method is optional >>> async def aget_chunk(self, nchunk): - >>> await asyncio.sleep(0.1) - >>> return self.data[nchunk] - >>> class MyProxy(blosc2.Proxy): - >>> def __init__(self, source): - >>> super().__init__(source) - >>> self._cache = source.data - >>> # Asynchronous method to get the cache data - >>> async def afetch(self, slice_=None): - >>> return self._cache if slice_ is None else self._cache[slice_] + >>> await asyncio.sleep(0.1) # Simulate an asynchronous operation + >>> return self.data.get_chunk(nchunk) >>> data = np.arange(20).reshape(4, 5) - >>> chunk = [2, 2] - >>> block = [2, 2] - >>> source = MyProxySource(data, chunk, block) - >>> proxy = MyProxy(source) + >>> chunks = [2, 5] + >>> blocks = [1, 5] + >>> data = blosc2.asarray(data, chunks=chunks, blocks=blocks) + >>> source = MyProxySource(data2) + >>> proxy = blosc2.Proxy(source) >>> async def fetch_data(): - >>> # Fetch a slice of the data from the proxy asynchronously - >>> slice_data = await proxy.afetch(slice(0, 2)) - >>> f"Slice data cache: {slice_data[:]}" + >>> # Fetch a slice of the data from the proxy asynchronously + >>> slice_data = await proxy.afetch(slice(0, 2)) + >>> # Note that only data fetched is shown, the rest is uninitialized + >>> f"Slice data cache: {slice_data[:]}" Slice data cache: [[0 1 2 3 4] - [5 6 7 8 9]] - >>> # Note: It is not possible to print the intermediate state of the proxy - >>> # because accessing the proxy causes the entire data to be copied. 
+ [5 6 7 8 9] + [0 0 0 0 0] + [0 0 0 0 0]] >>> # Fetch the full data from the proxy asynchronously >>> full_data = await proxy.afetch() + >>> # Now, all data is shown, meaning the full data has been fetched >>> f"Full data cache: {full_data[:]}" Full data cache: [[ 0 1 2 3 4] @@ -363,7 +368,7 @@ async def afetch(self, item=None): return self._cache - def __getitem__(self, item): + def __getitem__(self, item: slice | list[slice]) -> np.ndarray: """ Get a slice as a numpy.ndarray using the :ref:`Proxy`. @@ -417,7 +422,7 @@ def __str__(self): return f"Proxy({self.src}, urlpath={self.urlpath})" @property - def vlmeta(self): + def vlmeta(self) -> blosc2.schunk.vlmeta: """ Get the vlmeta of the cache. @@ -428,7 +433,7 @@ def vlmeta(self): return self._schunk_cache.vlmeta @property - def fields(self): + def fields(self)-> dict: """ Dictionary with the fields of :paramref:`self`. @@ -474,7 +479,7 @@ def __init__(self, proxy: Proxy, field: str): self.shape = proxy.shape self.dtype = proxy.dtype - def __getitem__(self, item: slice): + def __getitem__(self, item: slice | list[slice]) -> np.ndarray: """ Get a slice as a numpy.ndarray using the `field` in `proxy`. From 499d313344b34e7e327f22e2bf755f9e4de770a8 Mon Sep 17 00:00:00 2001 From: oumaima-ech-chdig Date: Tue, 24 Sep 2024 13:01:12 +0200 Subject: [PATCH 04/18] Examples of C2Array and Proxy. 
--- src/blosc2/c2array.py | 65 ++++++++++++++++++++++++------------------- src/blosc2/proxy.py | 46 ++++++++++++++++-------------- 2 files changed, 61 insertions(+), 50 deletions(-) diff --git a/src/blosc2/c2array.py b/src/blosc2/c2array.py index 17e74c80..73c6958e 100644 --- a/src/blosc2/c2array.py +++ b/src/blosc2/c2array.py @@ -202,20 +202,18 @@ def __init__(self, path: str, /, urlbase: str = None, auth_token: str = None): -------- >>> import blosc2 >>> import pathlib - >>> host = "https://demo.caterva2.net/" - >>> root = "b2tests" - >>> dir = "expr/" - >>> name = "ds-0-10-linspace-float64-(True, True)-a1-(60, 60)d.b2nd" - >>> path = pathlib.Path(f"{root}/{dir + name}").as_posix() - >>> remote_array = blosc2.C2Array(path, urlbase=host) - >>> f"Shape of the remote array: {remote_array.shape}" - >>> f"Chunks of the remote array: {remote_array.chunks}" - >>> f"Blocks of the remote array: {remote_array.blocks}" - >>> f"Dtype of the remote array: {remote_array.dtype}" - Shape of the remote array: (60, 60) - Chunks of the remote array: (30, 60) - Blocks of the remote array: (10, 60) - Dtype of the remote array: float64 + >>> urlbase = "https://demo.caterva2.net/" + >>> root = "example" + >>> path = pathlib.Path(f"{root}/dir1/ds-3d.b2nd").as_posix() + >>> remote_array = blosc2.C2Array(path, urlbase=urlbase) + >>> remote_array.shape + (3, 4, 5) + >>> remote_array.chunks + (2, 3, 4) + >>> remote_array.blocks + (2, 2, 2) + >>> remote_array.dtype + float32 """ if path.startswith("/"): raise ValueError("The path should start with a root name, not a slash") @@ -253,6 +251,21 @@ def __getitem__(self, slice_: int | slice | Sequence[slice]) -> np.ndarray: ------- out: numpy.ndarray A numpy.ndarray containing the data slice. 
+ + Examples + -------- + >>> import pathlib + >>> import blosc2 + >>> urlbase = "https://demo.caterva2.net/" + >>> root = "example" + >>> path = pathlib.Path(f"{root}/dir1/ds-2d.b2nd").as_posix() + >>> remote_array = blosc2.C2Array(path, urlbase=urlbase) + >>> data_slice = remote_array[3:5, 1:4] + >>> data_slice.shape + (2, 3) + >>> data_slice[:] + [[61 62 63] + [81 82 83]] """ slice_ = slice_to_string(slice_) return fetch_data(self.path, self.urlbase, {"slice_": slice_}, auth_token=self.auth_token) @@ -276,30 +289,24 @@ def get_chunk(self, nchunk: int) -> bytes: >>> import pathlib >>> import numpy as np >>> import blosc2 - >>> host = "https://demo.caterva2.net/" - >>> root = "b2tests" - >>> dir = "expr/" - >>> root = "b2tests" - >>> dir = "expr/" - >>> name1 = "ds-0-10-linspace-float64-(True, True)-a1-(60, 60)d.b2nd" - >>> name2 = "ds-0-10-linspace-float64-(True, True)-a2-(60, 60)d.b2nd" - >>> path1 = pathlib.Path(f"{root}/{dir + name1}").as_posix() - >>> path2 = pathlib.Path(f"{root}/{dir + name2}").as_posix() - >>> a = blosc2.C2Array(path1, host) - >>> b = blosc2.C2Array(path2, host) + >>> urlbase = "https://demo.caterva2.net/" + >>> root = "example" + >>> path = pathlib.Path(f"{root}/dir1/ds-3d.b2nd").as_posix() + >>> a = blosc2.C2Array(path, urlbase) + >>> b = blosc2.C2Array(path, urlbase) >>> c = a + b >>> # Get the compressed chunk from array 'a' for index 0 >>> chunk_index = 0 >>> compressed_chunk = c.get_chunk(chunk_index) >>> f"Size of chunk {chunk_index} from a: {len(compressed_chunk)} bytes" - Size of chunk 0 from 'a': 8604 bytes + Size of chunk 0 from a: 160 bytes >>> # Decompress the chunk and convert it to a NumPy array >>> decompressed_chunk = blosc2.decompress(compressed_chunk) >>> chunk_np_array = np.frombuffer(decompressed_chunk, dtype=a.dtype) - >>> f"Content of chunk {chunk_index} as NumPy array:{chunk_np_array}" + >>> chunk_np_array Content of chunk 0 as NumPy array: - [0.00000000e+00 5.55709919e-03 1.11141984e-02 ... 
9.98610725e+00 - 9.99166435e+00 9.99722145e+00] + [ 0. 2. 10. 12. 40. 42. 50. 52. 4. 6. 14. 16. 44. 46. 54. 56. 20. 22. + 0. 0. 60. 62. 0. 0. 24. 26. 0. 0. 64. 66. 0. 0.] """ url = _sub_url(self.urlbase, f"api/chunk/{self.path}") params = {"nchunk": nchunk} diff --git a/src/blosc2/proxy.py b/src/blosc2/proxy.py index 828b6ab8..43c6cada 100644 --- a/src/blosc2/proxy.py +++ b/src/blosc2/proxy.py @@ -296,38 +296,39 @@ async def afetch(self, item: slice | list[slice] = None) -> blosc2.NDArray | blo >>> import numpy as np >>> import blosc2 >>> import asyncio - >>> class MyProxySource: + >>> from blosc2 import ProxyNDSource + >>> class MyProxySource(ProxyNDSource): >>> def __init__(self, data): - >>> # If the next source is multidimensional, it must have the attributes: - >>> self.data = data - >>> f"Data shape: {self.shape}, Chunks: {self.chunks}" + >>> # If the next source is multidimensional, it must have the attributes: + >>> self.data = data + >>> f"Data shape: {self.shape}, Chunks: {self.chunks}" Data shape: (4, 5), Chunks: [2, 5] - >>> f"Blocks: {self.blocks}, Dtype: {self.dtype}" + >>> f"Blocks: {self.blocks}, Dtype: {self.dtype}" Blocks: [1, 5], Dtype: int64 >>> @property >>> def shape(self): - >>> return self.data.shape + >>> return self.data.shape >>> @property >>> def chunks(self): - >>> return self.data.chunks + >>> return self.data.chunks >>> @property >>> def blocks(self): - >>> return self.data.blocks + >>> return self.data.blocks >>> @property >>> def dtype(self): - >>> return self.data.dtype + >>> return self.data.dtype >>> # This method must be present >>> def get_chunk(self, nchunk): - >>> return self.data.get_chunk(nchunk) + >>> return self.data.get_chunk(nchunk) >>> # This method is optional >>> async def aget_chunk(self, nchunk): - >>> await asyncio.sleep(0.1) # Simulate an asynchronous operation - >>> return self.data.get_chunk(nchunk) + >>> await asyncio.sleep(0.1) # Simulate an asynchronous operation + >>> return self.data.get_chunk(nchunk) >>> 
data = np.arange(20).reshape(4, 5) >>> chunks = [2, 5] >>> blocks = [1, 5] >>> data = blosc2.asarray(data, chunks=chunks, blocks=blocks) - >>> source = MyProxySource(data2) + >>> source = MyProxySource(data) >>> proxy = blosc2.Proxy(source) >>> async def fetch_data(): >>> # Fetch a slice of the data from the proxy asynchronously @@ -335,20 +336,23 @@ async def afetch(self, item: slice | list[slice] = None) -> blosc2.NDArray | blo >>> # Note that only data fetched is shown, the rest is uninitialized >>> f"Slice data cache: {slice_data[:]}" Slice data cache: - [[0 1 2 3 4] - [5 6 7 8 9] - [0 0 0 0 0] - [0 0 0 0 0]] + [[0 1 2 3 4] + [5 6 7 8 9] + [0 0 0 0 0] + [0 0 0 0 0]] >>> # Fetch the full data from the proxy asynchronously >>> full_data = await proxy.afetch() >>> # Now, all data is shown, meaning the full data has been fetched >>> f"Full data cache: {full_data[:]}" Full data cache: - [[ 0 1 2 3 4] - [ 5 6 7 8 9] - [10 11 12 13 14] - [15 16 17 18 19]] + [[ 0 1 2 3 4] + [ 5 6 7 8 9] + [10 11 12 13 14] + [15 16 17 18 19]] >>> asyncio.run(fetch_data()) + >>> # Using getitem to get a slice of the data + >>> result = proxy[1:2, 1:3] + [[6 7]] """ if not callable(getattr(self.src, "aget_chunk", None)): raise NotImplementedError("afetch is only available if the source has an aget_chunk method") From 5f2d9e8c5116e4643f80c01ff23926fe7a4f0fc4 Mon Sep 17 00:00:00 2001 From: Marta Iborra Date: Wed, 18 Sep 2024 11:40:27 +0200 Subject: [PATCH 05/18] Unify cparams default values && add dataclass cparams --- doc/reference/index.rst | 1 + doc/reference/storage.rst | 17 +++++ src/blosc2/__init__.py | 5 +- src/blosc2/blosc2_ext.pyx | 4 +- src/blosc2/core.py | 112 ++++++++++++++++++++++--------- tests/ndarray/test_reductions.py | 2 +- tests/test_compress2.py | 16 ++--- 7 files changed, 113 insertions(+), 44 deletions(-) create mode 100644 doc/reference/storage.rst diff --git a/doc/reference/index.rst b/doc/reference/index.rst index d6cf97b4..2caef31c 100644 --- 
a/doc/reference/index.rst +++ b/doc/reference/index.rst @@ -4,6 +4,7 @@ API Reference .. toctree:: :maxdepth: 2 + storage top_level classes array_operations diff --git a/doc/reference/storage.rst b/doc/reference/storage.rst new file mode 100644 index 00000000..92338884 --- /dev/null +++ b/doc/reference/storage.rst @@ -0,0 +1,17 @@ +.. _Storage: + +Storage +======= + +This is a class for ....... + +.. currentmodule:: blosc2 + +CParams +------- + +.. autosummary:: + :toctree: autofiles/storage + :nosignatures: + + CParams diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index 489e8577..e7ffd979 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -155,6 +155,7 @@ class Tuner(Enum): compress2, compressor_list, compute_chunks_blocks, + CParams, decompress, decompress2, detect_number_of_cores, @@ -266,7 +267,6 @@ class Tuner(Enum): "nthreads": nthreads, "blocksize": 0, "splitmode": SplitMode.ALWAYS_SPLIT, - "schunk": None, "filters": [ Filter.NOFILTER, Filter.NOFILTER, @@ -276,10 +276,7 @@ class Tuner(Enum): Filter.SHUFFLE, ], "filters_meta": [0, 0, 0, 0, 0, 0], - "prefilter": None, - "preparams": None, "tuner": Tuner.STUNE, - "instr_codec": False, } """ Compression params defaults. diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx index 3f6f2d73..23f3eb00 100644 --- a/src/blosc2/blosc2_ext.pyx +++ b/src/blosc2/blosc2_ext.pyx @@ -716,7 +716,9 @@ cdef _check_dparams(blosc2_dparams* dparams, blosc2_cparams* cparams=NULL): cdef create_cparams_from_kwargs(blosc2_cparams *cparams, kwargs): if "compcode" in kwargs: - raise NameError("`compcode` has been renamed to `codec`. Please go update your code.") + raise NameError("`compcode` has been renamed to `codec`. Please go update your code.") + if "shuffle" in kwargs: + raise NameError("`shuffle` has been substituted by `filters`. 
Please go update your code.") codec = kwargs.get('codec', blosc2.cparams_dflts['codec']) cparams.compcode = codec if not isinstance(codec, blosc2.Codec) else codec.value cparams.compcode_meta = kwargs.get('codec_meta', blosc2.cparams_dflts['codec_meta']) diff --git a/src/blosc2/core.py b/src/blosc2/core.py index 977a0a81..0b192fe6 100644 --- a/src/blosc2/core.py +++ b/src/blosc2/core.py @@ -18,6 +18,7 @@ import platform import sys from collections.abc import Callable +from dataclasses import dataclass, field, asdict import cpuinfo import numpy as np @@ -53,12 +54,76 @@ def _check_codec(codec): raise ValueError(f"codec can only be one of: {codecs}, not '{codec}'") +def default_filters(): + return [blosc2.Filter.NOFILTER, + blosc2.Filter.NOFILTER, + blosc2.Filter.NOFILTER, + blosc2.Filter.NOFILTER, + blosc2.Filter.NOFILTER, + blosc2.Filter.SHUFFLE] + + +def default_filters_meta(): + return [0] * 6 + +@dataclass +class CParams: + """Dataclass for hosting the different compression parameters. + + Parameters + ---------- + codec: :class:`Codec` + The compressor code. Default is :py:obj:`Codec.ZSTD `. + codec_meta: int + The metadata for the compressor code, 0 by default. + clevel: int + The compression level from 0 (no compression) to 9 + (maximum compression). Default: 1. + use_dict: bool + Use dicts or not when compressing + (only for :py:obj:`blosc2.Codec.ZSTD `). Default: `False`. + typesize: int from 1 to 255 + The data type size. Default: 8. + nthreads: int + The number of threads to use internally. By default, blosc2 computes + a good guess. + blocksize: int + The requested size of the compressed blocks. If 0 (the default) + blosc2 chooses it automatically. + splitmode: :class:`SplitMode` + The split mode for the blocks. + The default value is :py:obj:`SplitMode.ALWAYS_SPLIT `. + filters: :class:`Filter` list + The sequence of filters. 
Default: [:py:obj:`Filter.NOFILTER `, + :py:obj:`Filter.NOFILTER `, :py:obj:`Filter.NOFILTER `, :py:obj:`Filter.NOFILTER `, + :py:obj:`Filter.NOFILTER `, :py:obj:`Filter.SHUFFLE `]. + filters_meta: list + The metadata for filters. Default: `[0, 0, 0, 0, 0, 0]`. + tuner: :class:`Tuner` + The tuner to use. Default: :py:obj:`Tuner.STUNE `. + """ + codec: blosc2.Codec = blosc2.Codec.ZSTD + codec_meta: int = 0 + clevel: int = 1 + use_dict: bool = False + typesize: int = 8 + nthreads: int = blosc2.nthreads + blocksize: int = 0 + splitmode: blosc2.SplitMode = blosc2.SplitMode.ALWAYS_SPLIT + filters: list[blosc2.Filter] = field(default_factory=default_filters) + filters_meta: list[int] = field(default_factory=default_filters_meta) + tuner: blosc2.Tuner = blosc2.Tuner.STUNE + + # def __post_init__(self): + # if len(self.filters) > 6: + + def compress( src: object, - typesize: int = None, - clevel: int = 9, + typesize: int = 8, + clevel: int = 1, filter: blosc2.Filter = blosc2.Filter.SHUFFLE, - codec: blosc2.Codec = blosc2.Codec.BLOSCLZ, + codec: blosc2.Codec = blosc2.Codec.ZSTD, _ignore_multiple_size: bool = False, ) -> str | bytes: """Compress src, with a given type size. @@ -1382,7 +1447,7 @@ def compute_chunks_blocks( return tuple(chunks), tuple(blocks) -def compress2(src: object, **kwargs: dict) -> str | bytes: +def compress2(src: object, **kwargs: CParams | dict) -> str | bytes: """Compress :paramref:`src` with the given compression params (if given) Parameters @@ -1393,34 +1458,15 @@ def compress2(src: object, **kwargs: dict) -> str | bytes: Other Parameters ---------------- kwargs: dict, optional + Compression parameters. The default values are in :ref:`blosc2.CParams`. Keyword arguments supported: - codec: :class:`Codec` - The compressor code. Default is :py:obj:`Codec.BLOSCLZ `. - codec_meta: int - The metadata for the compressor code, 0 by default. - clevel: int - The compression level from 0 (no compression) to 9 - (maximum compression). Default: 5. 
- use_dict: bool - Use dicts or not when compressing - (only for :py:obj:`blosc2.Codec.ZSTD `). By default `False`. - typesize: int from 1 to 255 - The data type size. Default: 8. - nthreads: int - The number of threads to use internally (1 by default). - blocksize: int - The requested size of the compressed blocks. If 0 (the default) - blosc2 chooses it automatically. - splitmode: :class:`SplitMode` - The split mode for the blocks. - The default value is :py:obj:`SplitMode.FORWARD_COMPAT_SPLIT `. - filters: :class:`Filter` list - The sequence of filters. Default: {0, 0, 0, 0, 0, :py:obj:`Filter.SHUFFLE `}. - filters_meta: list - The metadata for filters. Default: `{0, 0, 0, 0, 0, 0}`. - tuner: :class:`Tuner` - The tuner to use. Default: :py:obj:`Tuner.STUNE `. + cparams: :class:`CParams` + All the compression parameters that you want to use as + a :class:`CParams` instance. + others: Any + If `cparams` is not passed, all the parameters of a :class:`CParams` + can be passed as keyword arguments. Returns ------- @@ -1434,6 +1480,12 @@ def compress2(src: object, **kwargs: dict) -> str | bytes: If an internal error occurred, probably because some parameter is not a valid parameter. 
""" + if kwargs is not None: + if 'cparams' in kwargs: + if len(kwargs) > 1: + raise AttributeError("Cannot pass both cparams and other kwargs already included in CParams") + kwargs = asdict(kwargs.get('cparams')) + return blosc2_ext.compress2(src, **kwargs) diff --git a/tests/ndarray/test_reductions.py b/tests/ndarray/test_reductions.py index 3f56e544..7b456fbd 100644 --- a/tests/ndarray/test_reductions.py +++ b/tests/ndarray/test_reductions.py @@ -65,7 +65,7 @@ def test_reduce_bool(array_fixture, reduce_op): @pytest.mark.parametrize("axis", [0, 1, (0, 1), None]) @pytest.mark.parametrize("keepdims", [True, False]) @pytest.mark.parametrize("dtype_out", [np.int16, np.float64]) -@pytest.mark.parametrize("kwargs", [{}, {"cparams": dict(clevel=1, shuffle=blosc2.Filter.BITSHUFFLE)}]) +@pytest.mark.parametrize("kwargs", [{}, {"cparams": dict(clevel=1, filters=[blosc2.Filter.BITSHUFFLE], filters_meta=[0])}]) def test_reduce_params(array_fixture, axis, keepdims, dtype_out, reduce_op, kwargs): a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture if axis is not None and np.isscalar(axis) and len(a1.shape) >= axis: diff --git a/tests/test_compress2.py b/tests/test_compress2.py index f9b1851d..113169c4 100644 --- a/tests/test_compress2.py +++ b/tests/test_compress2.py @@ -19,15 +19,15 @@ @pytest.mark.parametrize( "obj, cparams, dparams", [ - (random.integers(0, 10, 10), {"codec": blosc2.Codec.LZ4, "clevel": 6}, {}), + (random.integers(0, 10, 10), {'cparams': blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=6)}, {}), ( np.arange(10, dtype="float32"), # Select an absolute precision of 10 bits in mantissa - { - "filters": [blosc2.Filter.TRUNC_PREC, blosc2.Filter.BITSHUFFLE], - "filters_meta": [10, 0], - "typesize": 4, - }, + {'cparams': blosc2.CParams( + filters=[blosc2.Filter.TRUNC_PREC, blosc2.Filter.BITSHUFFLE], + filters_meta=[10, 0], + typesize=4 + )}, {"nthreads": 4}, ), ( @@ -42,10 +42,10 @@ ), ( random.integers(0, 1000, 1000, endpoint=True), - {"splitmode": 
blosc2.SplitMode.ALWAYS_SPLIT, "nthreads": 5, "typesize": 4}, + {'cparams': blosc2.CParams(splitmode=blosc2.SplitMode.ALWAYS_SPLIT, nthreads=5, typesize=4)}, {}, ), - (np.arange(45, dtype=np.float64), {"codec": blosc2.Codec.LZ4HC, "typesize": 4}, {}), + (np.arange(45, dtype=np.float64), {'cparams': blosc2.CParams(codec=blosc2.Codec.LZ4HC, typesize=4)}, {}), (np.arange(50, dtype=np.int64), {"typesize": 4}, blosc2.dparams_dflts), ], ) From 8a6fc22d60bc89d30c2060c6e9cc6ae4b3e4ab64 Mon Sep 17 00:00:00 2001 From: Marta Iborra Date: Wed, 18 Sep 2024 12:45:45 +0200 Subject: [PATCH 06/18] Move params defaults to storage.py && fix tests for new defaults --- src/blosc2/__init__.py | 50 +-------------- src/blosc2/core.py | 74 ++-------------------- src/blosc2/storage.py | 126 +++++++++++++++++++++++++++++++++++++ tests/test_bytes_array.py | 2 +- tests/test_decompress.py | 5 +- tests/test_python_blosc.py | 4 +- 6 files changed, 140 insertions(+), 121 deletions(-) create mode 100644 src/blosc2/storage.py diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index e7ffd979..815fa6ce 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -155,7 +155,6 @@ class Tuner(Enum): compress2, compressor_list, compute_chunks_blocks, - CParams, decompress, decompress2, detect_number_of_cores, @@ -189,9 +188,6 @@ class Tuner(Enum): unpack_tensor, ) -# Get CPU info -cpu_info = get_cpu_info() - from .ndarray import ( # noqa: I001 NDArray, @@ -217,6 +213,8 @@ class Tuner(Enum): from .schunk import SChunk, open +from .storage import cpu_info, CParams, cparams_dflts, dparams_dflts, ncores, nthreads, storage_dflts + # Registry for postfilters postfilter_funcs = {} @@ -244,55 +242,13 @@ class Tuner(Enum): """ The blosc2 version + date. """ + # Internal Blosc threading -nthreads = ncores = cpu_info.get("count", 1) -"""Number of threads to be used in compression/decompression. 
-""" -# Protection against too many threads -nthreads = min(nthreads, 32) -# Experiments say that, when using a large number of threads, it is better to not use them all -nthreads -= nthreads // 8 set_nthreads(nthreads) # Set the number of threads for NumExpr numexpr.set_num_threads(nthreads) -# Defaults for compression params -cparams_dflts = { - "codec": Codec.ZSTD, - "codec_meta": 0, - "clevel": 1, - "use_dict": False, - "typesize": 8, - "nthreads": nthreads, - "blocksize": 0, - "splitmode": SplitMode.ALWAYS_SPLIT, - "filters": [ - Filter.NOFILTER, - Filter.NOFILTER, - Filter.NOFILTER, - Filter.NOFILTER, - Filter.NOFILTER, - Filter.SHUFFLE, - ], - "filters_meta": [0, 0, 0, 0, 0, 0], - "tuner": Tuner.STUNE, -} -""" -Compression params defaults. -""" - -# Defaults for decompression params -dparams_dflts = {"nthreads": nthreads, "schunk": None, "postfilter": None, "postparams": None} -""" -Decompression params defaults. -""" -# Default for storage -storage_dflts = {"contiguous": False, "urlpath": None, "cparams": None, "dparams": None, "io": None} -""" -Storage params defaults. This is meant only for :ref:`SChunk ` or :ref:`NDArray `. 
-""" - _disable_overloaded_equal = False # Delayed imports for avoiding overwriting of python builtins diff --git a/src/blosc2/core.py b/src/blosc2/core.py index 0b192fe6..b0d680d2 100644 --- a/src/blosc2/core.py +++ b/src/blosc2/core.py @@ -18,7 +18,7 @@ import platform import sys from collections.abc import Callable -from dataclasses import dataclass, field, asdict +from dataclasses import asdict import cpuinfo import numpy as np @@ -54,70 +54,6 @@ def _check_codec(codec): raise ValueError(f"codec can only be one of: {codecs}, not '{codec}'") -def default_filters(): - return [blosc2.Filter.NOFILTER, - blosc2.Filter.NOFILTER, - blosc2.Filter.NOFILTER, - blosc2.Filter.NOFILTER, - blosc2.Filter.NOFILTER, - blosc2.Filter.SHUFFLE] - - -def default_filters_meta(): - return [0] * 6 - -@dataclass -class CParams: - """Dataclass for hosting the different compression parameters. - - Parameters - ---------- - codec: :class:`Codec` - The compressor code. Default is :py:obj:`Codec.ZSTD `. - codec_meta: int - The metadata for the compressor code, 0 by default. - clevel: int - The compression level from 0 (no compression) to 9 - (maximum compression). Default: 1. - use_dict: bool - Use dicts or not when compressing - (only for :py:obj:`blosc2.Codec.ZSTD `). Default: `False`. - typesize: int from 1 to 255 - The data type size. Default: 8. - nthreads: int - The number of threads to use internally. By default, blosc2 computes - a good guess. - blocksize: int - The requested size of the compressed blocks. If 0 (the default) - blosc2 chooses it automatically. - splitmode: :class:`SplitMode` - The split mode for the blocks. - The default value is :py:obj:`SplitMode.ALWAYS_SPLIT `. - filters: :class:`Filter` list - The sequence of filters. Default: [:py:obj:`Filter.NOFILTER `, - :py:obj:`Filter.NOFILTER `, :py:obj:`Filter.NOFILTER `, :py:obj:`Filter.NOFILTER `, - :py:obj:`Filter.NOFILTER `, :py:obj:`Filter.SHUFFLE `]. - filters_meta: list - The metadata for filters. 
Default: `[0, 0, 0, 0, 0, 0]`. - tuner: :class:`Tuner` - The tuner to use. Default: :py:obj:`Tuner.STUNE `. - """ - codec: blosc2.Codec = blosc2.Codec.ZSTD - codec_meta: int = 0 - clevel: int = 1 - use_dict: bool = False - typesize: int = 8 - nthreads: int = blosc2.nthreads - blocksize: int = 0 - splitmode: blosc2.SplitMode = blosc2.SplitMode.ALWAYS_SPLIT - filters: list[blosc2.Filter] = field(default_factory=default_filters) - filters_meta: list[int] = field(default_factory=default_filters_meta) - tuner: blosc2.Tuner = blosc2.Tuner.STUNE - - # def __post_init__(self): - # if len(self.filters) > 6: - - def compress( src: object, typesize: int = 8, @@ -1447,7 +1383,7 @@ def compute_chunks_blocks( return tuple(chunks), tuple(blocks) -def compress2(src: object, **kwargs: CParams | dict) -> str | bytes: +def compress2(src: object, **kwargs: dict) -> str | bytes: """Compress :paramref:`src` with the given compression params (if given) Parameters @@ -1461,11 +1397,11 @@ def compress2(src: object, **kwargs: CParams | dict) -> str | bytes: Compression parameters. The default values are in :ref:`blosc2.CParams`. Keyword arguments supported: - cparams: :class:`CParams` + cparams: :class:`blosc2.CParams` All the compression parameters that you want to use as - a :class:`CParams` instance. + a :class:`blosc2.CParams` instance. others: Any - If `cparams` is not passed, all the parameters of a :class:`CParams` + If `cparams` is not passed, all the parameters of a :class:`blosc2.CParams` can be passed as keyword arguments. Returns diff --git a/src/blosc2/storage.py b/src/blosc2/storage.py new file mode 100644 index 00000000..171257f3 --- /dev/null +++ b/src/blosc2/storage.py @@ -0,0 +1,126 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# This source code is licensed under a BSD-style license (found in the +# LICENSE file in the root directory of this source tree) +####################################################################### + +from dataclasses import dataclass, field, asdict + +import blosc2 + + +# Internal Blosc threading +# Get CPU info +cpu_info = blosc2.get_cpu_info() +nthreads = ncores = cpu_info.get("count", 1) +"""Number of threads to be used in compression/decompression. +""" +# Protection against too many threads +nthreads = min(nthreads, 32) +# Experiments say that, when using a large number of threads, it is better to not use them all +nthreads -= nthreads // 8 + +# Defaults for compression params +cparams_dflts = { + "codec": blosc2.Codec.ZSTD, + "codec_meta": 0, + "clevel": 1, + "use_dict": False, + "typesize": 8, + "nthreads": nthreads, + "blocksize": 0, + "splitmode": blosc2.SplitMode.ALWAYS_SPLIT, + "filters": [ + blosc2.Filter.NOFILTER, + blosc2.Filter.NOFILTER, + blosc2.Filter.NOFILTER, + blosc2.Filter.NOFILTER, + blosc2.Filter.NOFILTER, + blosc2.Filter.SHUFFLE, + ], + "filters_meta": [0, 0, 0, 0, 0, 0], + "tuner": blosc2.Tuner.STUNE, +} +""" +Compression params defaults. +""" + +# Defaults for decompression params +dparams_dflts = {"nthreads": nthreads} +""" +Decompression params defaults. +""" +# Default for storage +storage_dflts = {"contiguous": False, "urlpath": None, "cparams": None, "dparams": None, "io": None} +""" +Storage params defaults. This is meant only for :ref:`SChunk ` or :ref:`NDArray `. +""" + + +def default_nthreads(): + return nthreads + +def default_filters(): + return [blosc2.Filter.NOFILTER, + blosc2.Filter.NOFILTER, + blosc2.Filter.NOFILTER, + blosc2.Filter.NOFILTER, + blosc2.Filter.NOFILTER, + blosc2.Filter.SHUFFLE] + + +def default_filters_meta(): + return [0] * 6 + +@dataclass +class CParams: + """Dataclass for hosting the different compression parameters. + + Parameters + ---------- + codec: :class:`Codec` + The compressor code. 
Default is :py:obj:`Codec.ZSTD `. + codec_meta: int + The metadata for the compressor code, 0 by default. + clevel: int + The compression level from 0 (no compression) to 9 + (maximum compression). Default: 1. + use_dict: bool + Use dicts or not when compressing + (only for :py:obj:`blosc2.Codec.ZSTD `). Default: `False`. + typesize: int from 1 to 255 + The data type size. Default: 8. + nthreads: int + The number of threads to use internally. By default, blosc2 computes + a good guess. + blocksize: int + The requested size of the compressed blocks. If 0 (the default) + blosc2 chooses it automatically. + splitmode: :class:`SplitMode` + The split mode for the blocks. + The default value is :py:obj:`SplitMode.ALWAYS_SPLIT `. + filters: :class:`Filter` list + The sequence of filters. Default: [:py:obj:`Filter.NOFILTER `, + :py:obj:`Filter.NOFILTER `, :py:obj:`Filter.NOFILTER `, :py:obj:`Filter.NOFILTER `, + :py:obj:`Filter.NOFILTER `, :py:obj:`Filter.SHUFFLE `]. + filters_meta: list + The metadata for filters. Default: `[0, 0, 0, 0, 0, 0]`. + tuner: :class:`Tuner` + The tuner to use. Default: :py:obj:`Tuner.STUNE `. 
+ """ + codec: blosc2.Codec = blosc2.Codec.ZSTD + codec_meta: int = 0 + clevel: int = 1 + use_dict: bool = False + typesize: int = 8 + nthreads: int = field(default_factory=default_nthreads) + blocksize: int = 0 + splitmode: blosc2.SplitMode = blosc2.SplitMode.ALWAYS_SPLIT + filters: list[blosc2.Filter] = field(default_factory=default_filters) + filters_meta: list[int] = field(default_factory=default_filters_meta) + tuner: blosc2.Tuner = blosc2.Tuner.STUNE + + # def __post_init__(self): + # if len(self.filters) > 6: \ No newline at end of file diff --git a/tests/test_bytes_array.py b/tests/test_bytes_array.py index 2b0025b7..8c50b2da 100644 --- a/tests/test_bytes_array.py +++ b/tests/test_bytes_array.py @@ -22,6 +22,6 @@ def test_bytes_array(arr, gil): @pytest.mark.parametrize("data", [bytearray(7241), bytearray(7241) * 7]) def test_bytearray(data): - cdata = blosc2.compress(data) + cdata = blosc2.compress(data, typesize=1) uncomp = blosc2.decompress(cdata) assert data == uncomp diff --git a/tests/test_decompress.py b/tests/test_decompress.py index 9560fba4..aa2c9f81 100644 --- a/tests/test_decompress.py +++ b/tests/test_decompress.py @@ -28,7 +28,8 @@ ) def test_decompress_numpy(object, codec, gil): blosc2.set_releasegil(gil) - c = blosc2.compress(object, codec=codec) + typesize = None if hasattr(object, "itemsize") else 1 + c = blosc2.compress(object, typesize=typesize, codec=codec) dest = bytearray(object) blosc2.decompress(c, dst=dest) @@ -59,7 +60,7 @@ def test_decompress_numpy(object, codec, gil): ], ) def test_decompress(object, codec): - c = blosc2.compress(object, codec=codec) + c = blosc2.compress(object, typesize=1, codec=codec) dest = bytearray(object) blosc2.decompress(c, dst=dest) diff --git a/tests/test_python_blosc.py b/tests/test_python_blosc.py index fe06eac3..6d1001c6 100644 --- a/tests/test_python_blosc.py +++ b/tests/test_python_blosc.py @@ -239,7 +239,7 @@ def test_bitshuffle_not_multiple(self): xx = x.tobytes() with pytest.raises(ValueError): 
blosc2.compress(xx, typesize=8, filter=blosc2.Filter.BITSHUFFLE) - zxx = blosc2.compress(xx, filter=blosc2.Filter.BITSHUFFLE) + zxx = blosc2.compress(xx, typesize=1, filter=blosc2.Filter.BITSHUFFLE) last_xx = blosc2.decompress(zxx)[-3:] assert last_xx == b"\x01\x01\x01" @@ -248,7 +248,7 @@ def test_bitshuffle_leftovers(self): buffer = b" " * 641091 # a buffer that is not divisible by 8 with pytest.raises(ValueError): blosc2.compress(buffer, typesize=8, filter=blosc2.Filter.BITSHUFFLE, clevel=1) - cbuffer = blosc2.compress(buffer, filter=blosc2.Filter.BITSHUFFLE, clevel=1) + cbuffer = blosc2.compress(buffer, typesize=1, filter=blosc2.Filter.BITSHUFFLE, clevel=1) dbuffer = blosc2.decompress(cbuffer) assert buffer == dbuffer From 68f6d4949bc31bc30829e01fc1041d6305455068 Mon Sep 17 00:00:00 2001 From: Marta Iborra Date: Wed, 18 Sep 2024 13:02:41 +0200 Subject: [PATCH 07/18] Add DParams dataclass && avoid rewriting the defaults --- doc/reference/storage.rst | 9 ++++++ src/blosc2/__init__.py | 2 +- src/blosc2/core.py | 15 +++++++-- src/blosc2/storage.py | 68 ++++++++++++++++++--------------------- tests/test_compress2.py | 4 +-- 5 files changed, 56 insertions(+), 42 deletions(-) diff --git a/doc/reference/storage.rst b/doc/reference/storage.rst index 92338884..2162191c 100644 --- a/doc/reference/storage.rst +++ b/doc/reference/storage.rst @@ -15,3 +15,12 @@ CParams :nosignatures: CParams + +DParams +------- + +.. 
autosummary:: + :toctree: autofiles/storage + :nosignatures: + + DParams diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index 815fa6ce..f075e7f5 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -213,7 +213,7 @@ class Tuner(Enum): from .schunk import SChunk, open -from .storage import cpu_info, CParams, cparams_dflts, dparams_dflts, ncores, nthreads, storage_dflts +from .storage import cpu_info, CParams, cparams_dflts, DParams, dparams_dflts, ncores, nthreads, storage_dflts # Registry for postfilters diff --git a/src/blosc2/core.py b/src/blosc2/core.py index b0d680d2..96dcc3ae 100644 --- a/src/blosc2/core.py +++ b/src/blosc2/core.py @@ -1444,10 +1444,15 @@ def decompress2(src: object, dst: object | bytearray = None, **kwargs: dict) -> Other Parameters ---------------- kwargs: dict, optional + Decompression parameters. The default values are in :ref:`blosc2.DParams`. Keyword arguments supported: - nthreads: int - The number of threads to use internally (1 by default). + cparams: :class:`blosc2.DParams` + All the decompression parameters that you want to use as + a :class:`blosc2.DParams` instance. + others: Any + If `dparams` is not passed, all the parameters of a :class:`blosc2.DParams` + can be passed as keyword arguments. Returns ------- @@ -1469,6 +1474,12 @@ def decompress2(src: object, dst: object | bytearray = None, **kwargs: dict) -> If the length of :paramref:`src` is smaller than the minimum. If :paramref:`dst` is not None and its length is 0. 
""" + if kwargs is not None: + if 'dparams' in kwargs: + if len(kwargs) > 1: + raise AttributeError("Cannot pass both dparams and other kwargs already included in DParams") + kwargs = asdict(kwargs.get('dparams')) + return blosc2_ext.decompress2(src, dst, **kwargs) diff --git a/src/blosc2/storage.py b/src/blosc2/storage.py index 171257f3..23cabb51 100644 --- a/src/blosc2/storage.py +++ b/src/blosc2/storage.py @@ -22,42 +22,6 @@ # Experiments say that, when using a large number of threads, it is better to not use them all nthreads -= nthreads // 8 -# Defaults for compression params -cparams_dflts = { - "codec": blosc2.Codec.ZSTD, - "codec_meta": 0, - "clevel": 1, - "use_dict": False, - "typesize": 8, - "nthreads": nthreads, - "blocksize": 0, - "splitmode": blosc2.SplitMode.ALWAYS_SPLIT, - "filters": [ - blosc2.Filter.NOFILTER, - blosc2.Filter.NOFILTER, - blosc2.Filter.NOFILTER, - blosc2.Filter.NOFILTER, - blosc2.Filter.NOFILTER, - blosc2.Filter.SHUFFLE, - ], - "filters_meta": [0, 0, 0, 0, 0, 0], - "tuner": blosc2.Tuner.STUNE, -} -""" -Compression params defaults. -""" - -# Defaults for decompression params -dparams_dflts = {"nthreads": nthreads} -""" -Decompression params defaults. -""" -# Default for storage -storage_dflts = {"contiguous": False, "urlpath": None, "cparams": None, "dparams": None, "io": None} -""" -Storage params defaults. This is meant only for :ref:`SChunk ` or :ref:`NDArray `. -""" - def default_nthreads(): return nthreads @@ -123,4 +87,34 @@ class CParams: tuner: blosc2.Tuner = blosc2.Tuner.STUNE # def __post_init__(self): - # if len(self.filters) > 6: \ No newline at end of file + # if len(self.filters) > 6: + + +@dataclass +class DParams: + """Dataclass for hosting the different decompression parameters. + + Parameters + ---------- + nthreads: int + The number of threads to use internally. By default, blosc2 computes + a good guess. 
+ """ + nthreads: int = field(default_factory=default_nthreads) + +# Defaults for compression params +cparams_dflts = asdict(CParams()) +""" +Compression params defaults. +""" + +# Defaults for decompression params +dparams_dflts = asdict(DParams()) +""" +Decompression params defaults. +""" +# Default for storage +storage_dflts = {"contiguous": False, "urlpath": None, "cparams": blosc2.CParams(), "dparams": blosc2.DParams} +""" +Storage params defaults. This is meant only for :ref:`SChunk ` or :ref:`NDArray `. +""" diff --git a/tests/test_compress2.py b/tests/test_compress2.py index 113169c4..257b8d5b 100644 --- a/tests/test_compress2.py +++ b/tests/test_compress2.py @@ -28,7 +28,7 @@ filters_meta=[10, 0], typesize=4 )}, - {"nthreads": 4}, + {'dparams': blosc2.DParams(nthreads=4)}, ), ( np.arange(10, dtype="float32"), @@ -43,7 +43,7 @@ ( random.integers(0, 1000, 1000, endpoint=True), {'cparams': blosc2.CParams(splitmode=blosc2.SplitMode.ALWAYS_SPLIT, nthreads=5, typesize=4)}, - {}, + {'dparams': blosc2.DParams()}, ), (np.arange(45, dtype=np.float64), {'cparams': blosc2.CParams(codec=blosc2.Codec.LZ4HC, typesize=4)}, {}), (np.arange(50, dtype=np.int64), {"typesize": 4}, blosc2.dparams_dflts), From f0cf2b6a91fcda40d1ff54841e08cf76529eda50 Mon Sep 17 00:00:00 2001 From: Marta Iborra Date: Thu, 19 Sep 2024 13:01:32 +0200 Subject: [PATCH 08/18] Add Storage dataclass --- doc/reference/storage.rst | 18 ++++-- src/blosc2/__init__.py | 16 +++++- src/blosc2/blosc2_ext.pyx | 5 +- src/blosc2/core.py | 4 +- src/blosc2/schunk.py | 114 ++++++++---------------------------- src/blosc2/storage.py | 118 +++++++++++++++++++++++++++++++++++++- tests/test_schunk.py | 48 +++++++++------- 7 files changed, 200 insertions(+), 123 deletions(-) diff --git a/doc/reference/storage.rst b/doc/reference/storage.rst index 2162191c..9bcbb9fa 100644 --- a/doc/reference/storage.rst +++ b/doc/reference/storage.rst @@ -1,9 +1,8 @@ -.. 
_Storage: +Dataclasses +=========== -Storage -======= - -This is a class for ....... +Dataclasses for setting the compression, decompression +and storage parameters. All their parameters are optional. .. currentmodule:: blosc2 @@ -24,3 +23,12 @@ DParams :nosignatures: DParams + +Storage +------- + +.. autosummary:: + :toctree: autofiles/storage + :nosignatures: + + Storage diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index f075e7f5..0672bb35 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -188,6 +188,18 @@ class Tuner(Enum): unpack_tensor, ) +# This import must be before ndarray and schunk +from .storage import ( + CParams, + cparams_dflts, + cpu_info, + DParams, + dparams_dflts, + ncores, + nthreads, + Storage, + storage_dflts, +) from .ndarray import ( # noqa: I001 NDArray, @@ -213,7 +225,6 @@ class Tuner(Enum): from .schunk import SChunk, open -from .storage import cpu_info, CParams, cparams_dflts, DParams, dparams_dflts, ncores, nthreads, storage_dflts # Registry for postfilters @@ -294,7 +305,9 @@ class Tuner(Enum): "__version__", "compress", "decompress", + "CParams", "cparams_dflts", + "DParams", "dparams_dflts", "storage_dflts", "set_compressor", @@ -326,6 +339,7 @@ class Tuner(Enum): "compress2", "decompress2", "SChunk", + "Storage", "open", "remove_urlpath", "nthreads", diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx index 23f3eb00..5211a659 100644 --- a/src/blosc2/blosc2_ext.pyx +++ b/src/blosc2/blosc2_ext.pyx @@ -929,7 +929,7 @@ cdef class SChunk: self._urlpath = urlpath.encode() if isinstance(urlpath, str) else urlpath kwargs["urlpath"] = self._urlpath - self.mode = kwargs.get("mode", "a") + self.mode = blosc2.Storage().mode if kwargs.get("mode", None) is None else kwargs.get("mode") self.mmap_mode = kwargs.get("mmap_mode") self.initial_mapping_size = kwargs.get("initial_mapping_size") if self.mmap_mode is not None: @@ -1077,7 +1077,8 @@ cdef class SChunk: "typesize": 
self.schunk.storage.cparams.typesize, "nthreads": self.schunk.storage.cparams.nthreads, "blocksize": self.schunk.storage.cparams.blocksize, - "splitmode": blosc2.SplitMode(self.schunk.storage.cparams.splitmode) + "splitmode": blosc2.SplitMode(self.schunk.storage.cparams.splitmode), + "tuner": blosc2.Tuner(self.schunk.storage.cparams.tuner_id), } filters = [0] * BLOSC2_MAX_FILTERS diff --git a/src/blosc2/core.py b/src/blosc2/core.py index 96dcc3ae..093a8264 100644 --- a/src/blosc2/core.py +++ b/src/blosc2/core.py @@ -1394,7 +1394,7 @@ def compress2(src: object, **kwargs: dict) -> str | bytes: Other Parameters ---------------- kwargs: dict, optional - Compression parameters. The default values are in :ref:`blosc2.CParams`. + Compression parameters. The default values are in :class:`blosc2.CParams`. Keyword arguments supported: cparams: :class:`blosc2.CParams` @@ -1444,7 +1444,7 @@ def decompress2(src: object, dst: object | bytearray = None, **kwargs: dict) -> Other Parameters ---------------- kwargs: dict, optional - Decompression parameters. The default values are in :ref:`blosc2.DParams`. + Decompression parameters. The default values are in :class:`blosc2.DParams`. Keyword arguments supported: cparams: :class:`blosc2.DParams` diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index c9f7c49f..8b107399 100644 --- a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -11,6 +11,7 @@ import pathlib from collections import namedtuple from collections.abc import Mapping, MutableMapping +from dataclasses import asdict from typing import Any, Iterator, NamedTuple import numpy as np @@ -156,95 +157,14 @@ def __init__(self, chunksize: int = None, data: object = None, **kwargs: dict): Other parameters ---------------- kwargs: dict, optional + Storage parameters. The default values are in :class:`blosc2.Storage`. Keyword arguments supported: - - contiguous: bool, optional - If the chunks are stored contiguously or not. 
- Default is True when :paramref:`urlpath` is not None; - False otherwise. - urlpath: str | pathlib.Path, optional - If the storage is persistent, the name of the file (when - `contiguous = True`) or the directory (if `contiguous = False`). - If the storage is in-memory, then this field is `None`. - mode: str, optional - Persistence mode: ‘r’ means read only (must exist); - ‘a’ means read/write (create if it doesn’t exist); - ‘w’ means create (overwrite if it exists). - mmap_mode: str, optional - If set, the file will be memory-mapped instead of using the default - I/O functions and the `mode` argument will be ignored. The memory-mapping - modes are similar as used by the - `numpy.memmap `_ - function, but it is possible to extend the file: - - .. list-table:: - :widths: 10 90 - :header-rows: 1 - - * - mode - - description - * - 'r' - - Open an existing file for reading only. - * - 'r+' - - Open an existing file for reading and writing. Use this mode if you want - to append data to an existing schunk file. - * - 'w+' - - Create or overwrite an existing file for reading and writing. Use this - mode if you want to create a new schunk. - * - 'c' - - Open an existing file in copy-on-write mode: all changes affect the data - in memory but changes are not saved to disk. The file on disk is - read-only. On Windows, the size of the mapping cannot change. - - Only contiguous storage can be memory-mapped. Hence, `urlpath` must point to a - file (and not a directory). - - .. note:: - Memory-mapped files are opened once and the file contents remain in (virtual) - memory for the lifetime of the schunk. Using memory-mapped I/O can be faster - than using the default I/O functions depending on the use case. Whereas - reading performance is generally better, writing performance may also be - slower in some cases on certain systems. In any case, memory-mapped files - can be especially beneficial when operating with network file systems - (like NFS). 
- - This is currently a beta feature (especially write operations) and we - recommend trying it out and reporting any issues you may encounter. - - initial_mapping_size: int, optional - The initial size of the mapping for the memory-mapped file when writes are - allowed (r+ w+, or c mode). Once a file is memory-mapped and extended beyond the - initial mapping size, the file must be remapped which may be expensive. This - parameter allows to decouple the mapping size from the actual file size to early - reserve memory for future writes and avoid remappings. The memory is only - reserved virtually and does not occupy physical memory unless actual writes - happen. Since the virtual address space is large enough, it is ok to be generous - with this parameter (with special consideration on Windows, see note below). - For best performance, set this to the maximum expected size of the compressed - data (see example in :obj:`SChunk.__init__ `). - The size is in bytes. - - Default: 1 GiB. - - .. note:: - On Windows, the size of the mapping is directly coupled to the file size. - When the schunk gets destroyed, the file size will be truncated to the - actual size of the schunk. - - cparams: dict - A dictionary with the compression parameters, which are the same - as those can be used in the :func:`~blosc2.compress2` function. - dparams: dict - A dictionary with the decompression parameters, which are the same - as those that can be used in the :func:`~blosc2.decompress2` - function. - meta: dict or None - A dictionary with different metalayers. One entry per metalayer: - - key: bytes or str - The name of the metalayer. - value: object - The metalayer object that will be serialized using msgpack. + storage: :class:`blosc2.Storage` + All the storage parameters that you want to use as + a :class:`blosc2.Storage` instance. + others: Any + If `storage` is not passed, all the parameters of a :class:`blosc2.Storage` + can be passed as keyword arguments.
Examples -------- @@ -301,10 +221,24 @@ def __init__(self, chunksize: int = None, data: object = None, **kwargs: dict): "mmap_mode", "initial_mapping_size", "_is_view", + "storage" ] for kwarg in kwargs: if kwarg not in allowed_kwargs: raise ValueError(f"{kwarg} is not supported as keyword argument") + if kwargs.get("storage") is not None: + if any(key not in ["_is_view", "_schunk", "storage"] for key in kwargs.keys()): + raise AttributeError("Cannot pass both `storage` and other kwargs already included in Storage") + storage = kwargs.get("storage") + del kwargs["storage"] + kwargs = {**kwargs, **asdict(storage)} + + if isinstance(kwargs.get("cparams"), blosc2.CParams): + kwargs["cparams"] = asdict(kwargs.get("cparams")) + + if isinstance(kwargs.get("dparams"), blosc2.DParams): + kwargs["dparams"] = asdict(kwargs.get("dparams")) + urlpath = kwargs.get("urlpath") if "contiguous" not in kwargs: # Make contiguous true for disk, else sparse (for in-memory performance) @@ -1395,8 +1329,8 @@ def __dealloc__(self): super().__dealloc__() -@_inherit_doc_parameter(SChunk.__init__, "mmap_mode:", {r"\* - 'w\+'[^*]+": ""}) -@_inherit_doc_parameter(SChunk.__init__, "initial_mapping_size:", {r"r\+ w\+, or c": "r+ or c"}) +@_inherit_doc_parameter(blosc2.Storage, "mmap_mode:", {r"\* - 'w\+'[^*]+": ""}) +@_inherit_doc_parameter(blosc2.Storage, "initial_mapping_size:", {r"r\+ w\+, or c": "r+ or c"}) def open(urlpath: str | pathlib.Path | blosc2.URLPath, mode: str = "a", offset: int = 0, **kwargs: dict) -> blosc2.SChunk | blosc2.NDArray | blosc2.C2Array: """Open a persistent :ref:`SChunk` or :ref:`NDArray` or a remote :ref:`C2Array` diff --git a/src/blosc2/storage.py b/src/blosc2/storage.py index 23cabb51..6598f90e 100644 --- a/src/blosc2/storage.py +++ b/src/blosc2/storage.py @@ -6,11 +6,11 @@ # LICENSE file in the root directory of this source tree) ####################################################################### -from dataclasses import dataclass, field, asdict +from 
dataclasses import dataclass, field, asdict, fields +import warnings import blosc2 - # Internal Blosc threading # Get CPU info cpu_info = blosc2.get_cpu_info() @@ -102,6 +102,118 @@ class DParams: """ nthreads: int = field(default_factory=default_nthreads) + +@dataclass +class Storage: + """Dataclass for hosting the different storage parameters. + + Parameters + ---------- + contiguous: bool + If the chunks are stored contiguously or not. + Default is True when :paramref:`urlpath` is not None; + False otherwise. + urlpath: str or pathlib.Path, optional + If the storage is persistent, the name of the file (when + `contiguous = True`) or the directory (if `contiguous = False`). + If the storage is in-memory, then this field is `None`. + cparams: :class:`CParams` or dict + The compression parameters as a :class:`CParams` instance or a dictionary. + dparams: :class:`DParams` or dict + The decompression parameters as a :class:`DParams` instance or a dictionary. + mode: str, optional + Persistence mode: ‘r’ means read only (must exist); + ‘a’ means read/write (create if it doesn’t exist); + ‘w’ means create (overwrite if it exists). Default is 'a'. + mmap_mode: str, optional + If set, the file will be memory-mapped instead of using the default + I/O functions and the `mode` argument will be ignored. The memory-mapping + modes are similar as used by the + `numpy.memmap `_ + function, but it is possible to extend the file: + + .. list-table:: + :widths: 10 90 + :header-rows: 1 + + * - mode + - description + * - 'r' + - Open an existing file for reading only. + * - 'r+' + - Open an existing file for reading and writing. Use this mode if you want + to append data to an existing schunk file. + * - 'w+' + - Create or overwrite an existing file for reading and writing. Use this + mode if you want to create a new schunk. + * - 'c' + - Open an existing file in copy-on-write mode: all changes affect the data + in memory but changes are not saved to disk. 
The file on disk is + read-only. On Windows, the size of the mapping cannot change. + + Only contiguous storage can be memory-mapped. Hence, `urlpath` must point to a + file (and not a directory). + + .. note:: + Memory-mapped files are opened once and the file contents remain in (virtual) + memory for the lifetime of the schunk. Using memory-mapped I/O can be faster + than using the default I/O functions depending on the use case. Whereas + reading performance is generally better, writing performance may also be + slower in some cases on certain systems. In any case, memory-mapped files + can be especially beneficial when operating with network file systems + (like NFS). + + This is currently a beta feature (especially write operations) and we + recommend trying it out and reporting any issues you may encounter. + + initial_mapping_size: int, optional + The initial size of the mapping for the memory-mapped file when writes are + allowed (r+ w+, or c mode). Once a file is memory-mapped and extended beyond the + initial mapping size, the file must be remapped which may be expensive. This + parameter allows to decouple the mapping size from the actual file size to early + reserve memory for future writes and avoid remappings. The memory is only + reserved virtually and does not occupy physical memory unless actual writes + happen. Since the virtual address space is large enough, it is ok to be generous + with this parameter (with special consideration on Windows, see note below). + For best performance, set this to the maximum expected size of the compressed + data (see example in :obj:`SChunk.__init__ `). + The size is in bytes. + + Default: 1 GiB. + + .. note:: + On Windows, the size of the mapping is directly coupled to the file size. + When the schunk gets destroyed, the file size will be truncated to the + actual size of the schunk. + + meta: dict or None + A dictionary with different metalayers. 
One entry per metalayer: + + key: bytes or str + The name of the metalayer. + value: object + The metalayer object that will be serialized using msgpack. + """ + contiguous: bool = None + urlpath: str = None + cparams: CParams | dict = field(default_factory=CParams) + dparams: DParams | dict = field(default_factory=DParams) + mode: str = 'a' + mmap_mode: str = None + initial_mapping_size: int = None + meta: dict = None + + def __post_init__(self): + if self.contiguous is None: + self.contiguous = False if self.urlpath is None else True + # Check for None values + for field in fields(self): + if (getattr(self, field.name) is None and + field.name not in ['urlpath', 'mmap_mode', 'initial_mapping_size', 'meta']): + setattr(self, field.name, getattr(Storage(), field.name)) + warnings.warn("`{name}` field value changed from `None` to `{value}`".format(name=field.name, value=getattr(self, field.name))) + + # Defaults for compression params cparams_dflts = asdict(CParams()) """ @@ -114,7 +226,7 @@ class DParams: Decompression params defaults. """ # Default for storage -storage_dflts = {"contiguous": False, "urlpath": None, "cparams": blosc2.CParams(), "dparams": blosc2.DParams} +storage_dflts = asdict(Storage()) """ Storage params defaults. This is meant only for :ref:`SChunk ` or :ref:`NDArray `. 
""" diff --git a/tests/test_schunk.py b/tests/test_schunk.py index 26e385e5..fc24f514 100644 --- a/tests/test_schunk.py +++ b/tests/test_schunk.py @@ -7,6 +7,7 @@ ####################################################################### import os +from dataclasses import asdict, replace import numpy as np import pytest @@ -37,31 +38,33 @@ @pytest.mark.parametrize( "cparams, dparams, nchunks", [ - ({"codec": blosc2.Codec.LZ4, "clevel": 6, "typesize": 4}, {}, 0), - ({"typesize": 4}, {"nthreads": 4}, 1), - ({"splitmode": blosc2.SplitMode.ALWAYS_SPLIT, "nthreads": 5, "typesize": 4}, {}, 5), + (blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=6, typesize=4), blosc2.DParams(), 0), + ({"typesize": 4}, blosc2.DParams(nthreads=4), 1), + (blosc2.CParams(splitmode=blosc2.SplitMode.ALWAYS_SPLIT, nthreads=5, typesize=4), {}, 5), ({"codec": blosc2.Codec.LZ4HC, "typesize": 4}, {}, 10), ], ) def test_schunk_numpy(contiguous, urlpath, mode, mmap_mode, cparams, dparams, nchunks): - storage = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} + storage = blosc2.Storage(contiguous=contiguous, urlpath=urlpath, mode=mode, mmap_mode=mmap_mode, + cparams=cparams, dparams=dparams) blosc2.remove_urlpath(urlpath) chunk_len = 200 * 1000 if mode != "r": - schunk = blosc2.SChunk(chunksize=chunk_len * 4, mode=mode, mmap_mode=mmap_mode, **storage) + schunk = blosc2.SChunk(chunksize=chunk_len * 4, storage=storage) else: with pytest.raises( ValueError, match="not specify a urlpath" if urlpath is None else "does not exist" ): - blosc2.SChunk(chunksize=chunk_len * 4, mode=mode, mmap_mode=mmap_mode, **storage) + blosc2.SChunk(chunksize=chunk_len * 4, storage=storage) # Create a schunk which we can read later + storage2 = replace(storage, + mode="w" if mmap_mode is None else None, + mmap_mode="w+" if mmap_mode is not None else None) schunk = blosc2.SChunk( chunksize=chunk_len * 4, - mode="w" if mmap_mode is None else None, - mmap_mode="w+" if mmap_mode is not None else 
None, - **storage, + storage=storage2, ) assert schunk.urlpath == urlpath @@ -74,9 +77,10 @@ def test_schunk_numpy(contiguous, urlpath, mode, mmap_mode, cparams, dparams, nc if mode == "r": if urlpath is not None: - schunk = blosc2.SChunk(chunksize=chunk_len * 4, mode=mode, mmap_mode=mmap_mode, **storage) + schunk = blosc2.SChunk(chunksize=chunk_len * 4, **asdict(storage)) else: return + assert schunk.nchunks == nchunks for i in range(nchunks): buffer = i * np.arange(chunk_len, dtype="int32") @@ -132,9 +136,9 @@ def test_schunk_ndarray(tmp_path, mode_write, mode_read, mmap_mode_write, mmap_m @pytest.mark.parametrize( "nbytes, cparams, dparams, nchunks", [ - (7, {"codec": blosc2.Codec.LZ4, "clevel": 6, "typesize": 5}, {}, 1), - (641091, {"typesize": 3}, {"nthreads": 2}, 1), - (136, {"typesize": 1}, {}, 5), + (7, blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=6, typesize=5), {}, 1), + (641091, {"typesize": 3}, blosc2.DParams(nthreads=2), 1), + (136, blosc2.CParams(typesize=1), blosc2.DParams(), 5), (1232, {"typesize": 8}, blosc2.dparams_dflts, 10), ], ) @@ -189,27 +193,31 @@ def test_schunk(contiguous, urlpath, mode, mmap_mode, nbytes, cparams, dparams, @pytest.mark.parametrize( "cparams, dparams, nchunks", [ - ({"codec": blosc2.Codec.LZ4, "clevel": 6, "typesize": 4}, {}, 1), + ({"codec": blosc2.Codec.LZ4, "clevel": 6, "typesize": 4}, blosc2.DParams(), 1), ({"typesize": 4}, {"nthreads": 4}, 1), - ({"splitmode": blosc2.SplitMode.ALWAYS_SPLIT, "nthreads": 5, "typesize": 4}, {}, 5), - ({"codec": blosc2.Codec.LZ4HC, "typesize": 4}, {}, 10), + (blosc2.CParams(splitmode=blosc2.SplitMode.ALWAYS_SPLIT, nthreads=5, typesize=4), {}, 5), + (blosc2.CParams(codec=blosc2.Codec.LZ4HC, typesize=4), blosc2.DParams(), 10), ], ) @pytest.mark.parametrize("copy", [True, False]) def test_schunk_cframe(contiguous, urlpath, mode, mmap_mode, cparams, dparams, nchunks, copy): - storage = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} + storage = 
blosc2.Storage(contiguous=contiguous, urlpath=urlpath, cparams=cparams, dparams=dparams, + mode=mode, mmap_mode=mmap_mode) blosc2.remove_urlpath(urlpath) data = np.arange(200 * 1000 * nchunks, dtype="int32") - schunk = blosc2.SChunk(chunksize=200 * 1000 * 4, data=data, mode=mode, mmap_mode=mmap_mode, **storage) + schunk = blosc2.SChunk(chunksize=200 * 1000 * 4, data=data, **asdict(storage)) cframe = schunk.to_cframe() schunk2 = blosc2.schunk_from_cframe(cframe, copy) + cparams_dict = cparams if isinstance(cparams, dict) else asdict(cparams) if not os.getenv("BTUNE_TRADEOFF"): - for key in cparams: + for key in cparams_dict: if key == "nthreads": continue - assert schunk2.cparams[key] == cparams[key] + if key == "blocksize" and cparams_dict[key] == 0: + continue + assert schunk2.cparams[key] == cparams_dict[key] data2 = np.empty(data.shape, dtype=data.dtype) schunk2.get_slice(out=data2) From 8689a66c21b825a55bce785844394ea82dc4a194 Mon Sep 17 00:00:00 2001 From: Marta Iborra Date: Fri, 20 Sep 2024 10:40:28 +0200 Subject: [PATCH 09/18] Make SChunk.cparams and dparams to only accept dataclasses --- src/blosc2/blosc2_ext.pyx | 101 ++++++++++++++++----------------- src/blosc2/lazyexpr.py | 2 +- src/blosc2/ndarray.py | 5 +- src/blosc2/schunk.py | 12 ++-- src/blosc2/storage.py | 10 +++- tests/ndarray/test_copy.py | 4 +- tests/ndarray/test_empty.py | 8 +-- tests/ndarray/test_full.py | 3 +- tests/ndarray/test_lazyexpr.py | 4 +- tests/ndarray/test_lazyudf.py | 12 ++-- tests/ndarray/test_lossy.py | 4 +- tests/test_open.py | 4 +- tests/test_postfilters.py | 2 +- tests/test_prefilters.py | 7 ++- tests/test_schunk.py | 82 ++++++++++---------------- 15 files changed, 124 insertions(+), 136 deletions(-) diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx index 5211a659..b44b9284 100644 --- a/src/blosc2/blosc2_ext.pyx +++ b/src/blosc2/blosc2_ext.pyx @@ -33,6 +33,7 @@ from enum import Enum import numpy as np from msgpack import packb, unpackb +from dataclasses import 
asdict import blosc2 @@ -1069,17 +1070,6 @@ cdef class SChunk: else: # User codec codec = self.schunk.storage.cparams.compcode - cparams_dict = { - "codec": codec, - "codec_meta": self.schunk.storage.cparams.compcode_meta, - "clevel": self.schunk.storage.cparams.clevel, - "use_dict": self.schunk.storage.cparams.use_dict, - "typesize": self.schunk.storage.cparams.typesize, - "nthreads": self.schunk.storage.cparams.nthreads, - "blocksize": self.schunk.storage.cparams.blocksize, - "splitmode": blosc2.SplitMode(self.schunk.storage.cparams.splitmode), - "tuner": blosc2.Tuner(self.schunk.storage.cparams.tuner_id), - } filters = [0] * BLOSC2_MAX_FILTERS filters_meta = [0] * BLOSC2_MAX_FILTERS @@ -1090,42 +1080,50 @@ cdef class SChunk: # User filter filters[i] = self.schunk.filters[i] filters_meta[i] = self.schunk.filters_meta[i] - cparams_dict["filters"] = filters - cparams_dict["filters_meta"] = filters_meta - return cparams_dict - def update_cparams(self, cparams_dict): + cparams = blosc2.CParams( + codec=codec, + codec_meta=self.schunk.storage.cparams.compcode_meta, + clevel=self.schunk.storage.cparams.clevel, + use_dict=bool(self.schunk.storage.cparams.use_dict), + typesize=self.schunk.storage.cparams.typesize, + nthreads=self.schunk.storage.cparams.nthreads, + blocksize=self.schunk.storage.cparams.blocksize, + splitmode=blosc2.SplitMode(self.schunk.storage.cparams.splitmode), + tuner=blosc2.Tuner(self.schunk.storage.cparams.tuner_id), + filters=filters, + filters_meta=filters_meta, + ) + + return cparams + + def update_cparams(self, new_cparams): cdef blosc2_cparams* cparams = self.schunk.storage.cparams - codec = cparams_dict.get('codec', None) - if codec is None: - cparams.compcode = cparams.compcode - else: - cparams.compcode = codec if not isinstance(codec, blosc2.Codec) else codec.value - cparams.compcode_meta = cparams_dict.get('codec_meta', cparams.compcode_meta) - cparams.clevel = cparams_dict.get('clevel', cparams.clevel) - cparams.use_dict = 
cparams_dict.get('use_dict', cparams.use_dict) - cparams.typesize = cparams_dict.get('typesize', cparams.typesize) - cparams.nthreads = cparams_dict.get('nthreads', cparams.nthreads) - cparams.blocksize = cparams_dict.get('blocksize', cparams.blocksize) - splitmode = cparams_dict.get('splitmode', None) - cparams.splitmode = cparams.splitmode if splitmode is None else splitmode.value - - filters = cparams_dict.get('filters', None) - if filters is not None: - for i, filter in enumerate(filters): - cparams.filters[i] = filter.value if isinstance(filter, Enum) else filter - for i in range(len(filters), BLOSC2_MAX_FILTERS): - cparams.filters[i] = 0 - - filters_meta = cparams_dict.get('filters_meta', None) + codec = new_cparams.codec + cparams.compcode = codec if not isinstance(codec, blosc2.Codec) else codec.value + cparams.compcode_meta = new_cparams.codec_meta + cparams.clevel = new_cparams.clevel + cparams.use_dict = new_cparams.use_dict + cparams.typesize = new_cparams.typesize + cparams.nthreads = new_cparams.nthreads + cparams.blocksize = new_cparams.blocksize + cparams.splitmode = new_cparams.splitmode.value + cparams.tuner_id = new_cparams.tuner.value + + filters = new_cparams.filters + for i, filter in enumerate(filters): + cparams.filters[i] = filter.value if isinstance(filter, Enum) else filter + for i in range(len(filters), BLOSC2_MAX_FILTERS): + cparams.filters[i] = 0 + + filters_meta = new_cparams.filters_meta cdef int8_t meta_value - if filters_meta is not None: - for i, meta in enumerate(filters_meta): - # We still may want to encode negative values - meta_value = meta if meta < 0 else meta - cparams.filters_meta[i] = meta_value - for i in range(len(filters_meta), BLOSC2_MAX_FILTERS): - cparams.filters_meta[i] = 0 + for i, meta in enumerate(filters_meta): + # We still may want to encode negative values + meta_value = meta if meta < 0 else meta + cparams.filters_meta[i] = meta_value + for i in range(len(filters_meta), BLOSC2_MAX_FILTERS): + 
cparams.filters_meta[i] = 0 _check_cparams(cparams) @@ -1143,12 +1141,11 @@ cdef class SChunk: self.schunk.filters_meta = self.schunk.storage.cparams.filters_meta def get_dparams(self): - dparams_dict = {"nthreads": self.schunk.storage.dparams.nthreads} - return dparams_dict + return blosc2.DParams(nthreads=self.schunk.storage.dparams.nthreads) - def update_dparams(self, dparams_dict): + def update_dparams(self, new_dparams): cdef blosc2_dparams* dparams = self.schunk.storage.dparams - dparams.nthreads = dparams_dict.get('nthreads', dparams.nthreads) + dparams.nthreads = new_dparams.nthreads _check_dparams(dparams, self.schunk.storage.cparams) @@ -1967,17 +1964,17 @@ def open(urlpath, mode, offset, **kwargs): res = blosc2.NDArray(_schunk=PyCapsule_New(array.sc, "blosc2_schunk*", NULL), _array=PyCapsule_New(array, "b2nd_array_t*", NULL)) if cparams is not None: - res.schunk.cparams = cparams + res.schunk.cparams = cparams if isinstance(cparams, blosc2.CParams) else blosc2.CParams(**cparams) if dparams is not None: - res.schunk.dparams = dparams + res.schunk.dparams = dparams if isinstance(dparams, blosc2.DParams) else blosc2.DParams(**dparams) res.schunk.mode = mode else: res = blosc2.SChunk(_schunk=PyCapsule_New(schunk, "blosc2_schunk*", NULL), mode=mode, **kwargs) if cparams is not None: - res.cparams = cparams + res.cparams = cparams if isinstance(cparams, blosc2.CParams) else blosc2.CParams(**cparams) if dparams is not None: - res.dparams = dparams + res.dparams = dparams if isinstance(dparams, blosc2.DParams) else blosc2.DParams(**dparams) return res diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 0ca42856..78f1d5b1 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -1884,7 +1884,7 @@ def eval(self, item=None, **kwargs): aux = np.empty(res_eval.shape, res_eval.dtype) res_eval[...] 
= aux res_eval.schunk.remove_prefilter(self.func.__name__) - res_eval.schunk.cparams["nthreads"] = self._cnthreads + res_eval.schunk.cparams.nthreads = self._cnthreads return res_eval else: diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index 0f9f094d..4964a3f4 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -18,6 +18,7 @@ import ndindex import numpy as np +from dataclasses import asdict import blosc2 from blosc2 import SpecialValue, blosc2_ext, compute_chunks_blocks @@ -1288,8 +1289,8 @@ def copy(self, dtype: np.dtype = None, **kwargs: dict) -> NDArray: """ if dtype is None: dtype = self.dtype - kwargs["cparams"] = kwargs.get("cparams", self.schunk.cparams).copy() - kwargs["dparams"] = kwargs.get("dparams", self.schunk.dparams).copy() + kwargs["cparams"] = kwargs.get("cparams", asdict(self.schunk.cparams)).copy() + kwargs["dparams"] = kwargs.get("dparams", asdict(self.schunk.dparams)).copy() if "meta" not in kwargs: # Copy metalayers as well meta_dict = {meta: self.schunk.meta[meta] for meta in self.schunk.meta} diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index 8b107399..7da6d7ef 100644 --- a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -279,26 +279,26 @@ def __init__(self, chunksize: int = None, data: object = None, **kwargs: dict): self._dparams = super().get_dparams() @property - def cparams(self) -> dict: + def cparams(self) -> blosc2.CParams: """ - Dictionary with the compression parameters. + :class:`blosc2.CParams` instance with the compression parameters. """ return self._cparams @cparams.setter - def cparams(self, value): + def cparams(self, value: blosc2.CParams) -> None: super().update_cparams(value) self._cparams = super().get_cparams() @property - def dparams(self) -> dict: + def dparams(self) -> blosc2.DParams: """ - Dictionary with the decompression parameters. + :class:`blosc2.DParams` instance with the decompression parameters. 
""" return self._dparams @dparams.setter - def dparams(self, value): + def dparams(self, value: blosc2.DParams) -> None: super().update_dparams(value) self._dparams = super().get_dparams() diff --git a/src/blosc2/storage.py b/src/blosc2/storage.py index 6598f90e..ccb001fe 100644 --- a/src/blosc2/storage.py +++ b/src/blosc2/storage.py @@ -86,8 +86,14 @@ class CParams: filters_meta: list[int] = field(default_factory=default_filters_meta) tuner: blosc2.Tuner = blosc2.Tuner.STUNE - # def __post_init__(self): - # if len(self.filters) > 6: + def __post_init__(self): + if len(self.filters) > 6: + raise ValueError("Number of filters exceeds 6") + if len(self.filters) < len(self.filters_meta): + self.filters_meta = self.filters_meta[:len(self.filters)] + warnings.warn("Changed `filters_meta` length to match `filters` length") + if len(self.filters) > len(self.filters_meta): + raise ValueError("Number of filters cannot exceed number of filters meta") @dataclass diff --git a/tests/ndarray/test_copy.py b/tests/ndarray/test_copy.py index a8d9c18b..a5c6681d 100644 --- a/tests/ndarray/test_copy.py +++ b/tests/ndarray/test_copy.py @@ -35,9 +35,9 @@ def test_copy(shape, chunks1, blocks1, chunks2, blocks2, dtype): assert a.schunk.dparams == b.schunk.dparams for key in cparams2: if key in ("filters", "filters_meta"): - assert b.schunk.cparams[key][: len(cparams2[key])] == cparams2[key] + assert getattr(b.schunk.cparams, key)[: len(cparams2[key])] == cparams2[key] continue - assert b.schunk.cparams[key] == cparams2[key] + assert getattr(b.schunk.cparams, key) == cparams2[key] assert b.chunks == tuple(chunks2) assert b.blocks == tuple(blocks2) assert a.dtype == b.dtype diff --git a/tests/ndarray/test_empty.py b/tests/ndarray/test_empty.py index d165de2d..bbca8568 100644 --- a/tests/ndarray/test_empty.py +++ b/tests/ndarray/test_empty.py @@ -83,10 +83,10 @@ def test_empty(shape, chunks, blocks, dtype, cparams, urlpath, contiguous): assert a.blocks == blocks assert a.dtype == dtype 
assert a.schunk.typesize == dtype.itemsize - assert a.schunk.cparams["codec"] == cparams["codec"] - assert a.schunk.cparams["clevel"] == cparams["clevel"] - assert a.schunk.cparams["filters"][: len(filters)] == filters - assert a.schunk.dparams["nthreads"] == 2 + assert a.schunk.cparams.codec == cparams["codec"] + assert a.schunk.cparams.clevel == cparams["clevel"] + assert a.schunk.cparams.filters[: len(filters)] == filters + assert a.schunk.dparams.nthreads == 2 blosc2.remove_urlpath(urlpath) diff --git a/tests/ndarray/test_full.py b/tests/ndarray/test_full.py index 3734af64..adf326fe 100644 --- a/tests/ndarray/test_full.py +++ b/tests/ndarray/test_full.py @@ -8,6 +8,7 @@ import numpy as np import pytest +from dataclasses import asdict import blosc2 @@ -74,7 +75,7 @@ def test_full(shape, chunks, blocks, fill_value, cparams, dparams, dtype, urlpat urlpath=urlpath, contiguous=contiguous, ) - assert a.schunk.dparams == dparams + assert asdict(a.schunk.dparams) == dparams if isinstance(fill_value, bytes): dtype = np.dtype(f"S{len(fill_value)}") assert a.dtype == np.dtype(dtype) if dtype is not None else np.dtype(np.uint8) diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py index a4b90b47..3659600e 100644 --- a/tests/ndarray/test_lazyexpr.py +++ b/tests/ndarray/test_lazyexpr.py @@ -455,8 +455,8 @@ def test_params(array_fixture): res = expr.eval(urlpath=urlpath, cparams=cparams, dparams=dparams, chunks=chunks, blocks=blocks) np.testing.assert_allclose(res[:], nres) assert res.schunk.urlpath == urlpath - assert res.schunk.cparams["nthreads"] == cparams["nthreads"] - assert res.schunk.dparams["nthreads"] == dparams["nthreads"] + assert res.schunk.cparams.nthreads == cparams["nthreads"] + assert res.schunk.dparams.nthreads == dparams["nthreads"] assert res.chunks == chunks assert res.blocks == blocks diff --git a/tests/ndarray/test_lazyudf.py b/tests/ndarray/test_lazyudf.py index ed642ccc..5391e5ae 100644 --- a/tests/ndarray/test_lazyudf.py +++ 
b/tests/ndarray/test_lazyudf.py @@ -188,7 +188,7 @@ def test_params(chunked_eval): res = expr.eval(urlpath=urlpath2, chunks=(10,)) np.testing.assert_allclose(res[...], npc) assert res.shape == npa.shape - assert res.schunk.cparams["nthreads"] == cparams["nthreads"] + assert res.schunk.cparams.nthreads == cparams["nthreads"] assert res.schunk.urlpath == urlpath2 assert res.chunks == (10,) @@ -243,7 +243,7 @@ def test_getitem(shape, chunks, blocks, slices, urlpath, contiguous, chunked_eva assert res.schunk.urlpath is None assert res.schunk.contiguous == contiguous # Check dparams after a getitem and an eval - assert res.schunk.dparams["nthreads"] == dparams["nthreads"] + assert res.schunk.dparams.nthreads == dparams["nthreads"] lazy_eval = expr[slices] np.testing.assert_allclose(lazy_eval, npc[slices]) @@ -282,8 +282,8 @@ def test_eval_slice(shape, chunks, blocks, slices, urlpath, contiguous, chunked_ np.testing.assert_allclose(res[...], npc[slices]) assert res.schunk.urlpath is None assert res.schunk.contiguous == contiguous - assert res.schunk.dparams["nthreads"] == dparams["nthreads"] - assert res.schunk.cparams["nthreads"] == blosc2.cparams_dflts["nthreads"] + assert res.schunk.dparams.nthreads == dparams["nthreads"] + assert res.schunk.cparams.nthreads == blosc2.cparams_dflts["nthreads"] assert res.shape == npc[slices].shape cparams = {"nthreads": 6} @@ -294,8 +294,8 @@ def test_eval_slice(shape, chunks, blocks, slices, urlpath, contiguous, chunked_ np.testing.assert_allclose(res[...], npc[slices]) assert res.schunk.urlpath == urlpath2 assert res.schunk.contiguous == contiguous - assert res.schunk.dparams["nthreads"] == dparams["nthreads"] - assert res.schunk.cparams["nthreads"] == cparams["nthreads"] + assert res.schunk.dparams.nthreads == dparams["nthreads"] + assert res.schunk.cparams.nthreads == cparams["nthreads"] assert res.shape == npc[slices].shape blosc2.remove_urlpath(urlpath) diff --git a/tests/ndarray/test_lossy.py b/tests/ndarray/test_lossy.py index 
ceced6e9..8e110017 100644 --- a/tests/ndarray/test_lossy.py +++ b/tests/ndarray/test_lossy.py @@ -66,8 +66,8 @@ def test_lossy(shape, cparams, dtype, urlpath, contiguous): a = blosc2.asarray(array, cparams=cparams, urlpath=urlpath, contiguous=contiguous, mode="w") if ( - a.schunk.cparams["codec"] in (blosc2.Codec.ZFP_RATE, blosc2.Codec.ZFP_PREC, blosc2.Codec.ZFP_ACC) - or a.schunk.cparams["filters"][0] == blosc2.Filter.NDMEAN + a.schunk.cparams.codec in (blosc2.Codec.ZFP_RATE, blosc2.Codec.ZFP_PREC, blosc2.Codec.ZFP_ACC) + or a.schunk.cparams.filters[0] == blosc2.Filter.NDMEAN ): _ = a[...] else: diff --git a/tests/test_open.py b/tests/test_open.py index 7e9d17ff..2ed39535 100644 --- a/tests/test_open.py +++ b/tests/test_open.py @@ -62,12 +62,12 @@ def test_open(contiguous, urlpath, cparams, dparams, nchunks, chunk_nitems, dtyp cparams2 = cparams cparams2["nthreads"] = 1 schunk_open = blosc2.open(urlpath, mode, mmap_mode=mmap_mode, cparams=cparams2) - assert schunk_open.cparams["nthreads"] == cparams2["nthreads"] + assert schunk_open.cparams.nthreads == cparams2["nthreads"] for key in cparams: if key == "nthreads": continue - assert schunk_open.cparams[key] == cparams[key] + assert getattr(schunk_open.cparams, key) == cparams[key] buffer = np.zeros(chunk_nitems, dtype=dtype) if mode != "r": diff --git a/tests/test_postfilters.py b/tests/test_postfilters.py index baa4934f..cadd646f 100644 --- a/tests/test_postfilters.py +++ b/tests/test_postfilters.py @@ -67,7 +67,7 @@ def postf2(input, output, offset): def postf3(input, output, offset): output[:] = input <= np.datetime64("1997-12-31") - schunk.dparams = {"nthreads": 1} + schunk.dparams = blosc2.DParams(nthreads=1) post_data = np.empty(chunk_len * nchunks, dtype=output_dtype) schunk.get_slice(0, chunk_len * nchunks, out=post_data) diff --git a/tests/test_prefilters.py b/tests/test_prefilters.py index 818517a4..d0865ce9 100644 --- a/tests/test_prefilters.py +++ b/tests/test_prefilters.py @@ -8,6 +8,7 @@ import numpy 
as np import pytest +from dataclasses import asdict, replace import blosc2 @@ -104,7 +105,7 @@ def fill_f4(inputs_tuple, output, offset): fill_f4((data, data2, np.pi), res, offset) - new_cparams = {"nthreads": 2} + new_cparams = replace(schunk.cparams, nthreads=2) schunk.cparams = new_cparams pre_data = np.empty(chunk_len * nchunks, dtype=schunk_dtype) @@ -180,7 +181,9 @@ def pref2(input, output, offset): def pref3(input, output, offset): output[:] = input <= np.datetime64("1997-12-31") - schunk.cparams = {"nthreads": 1} + new_cparams = asdict(schunk.cparams) + new_cparams["nthreads"] = 1 + schunk.cparams = blosc2.CParams(**new_cparams) schunk[: nchunks * chunk_len] = data post_data = np.empty(chunk_len * nchunks, dtype=schunk_dtype) diff --git a/tests/test_schunk.py b/tests/test_schunk.py index fc24f514..31915627 100644 --- a/tests/test_schunk.py +++ b/tests/test_schunk.py @@ -7,7 +7,7 @@ ####################################################################### import os -from dataclasses import asdict, replace +from dataclasses import asdict, replace, fields import numpy as np import pytest @@ -217,7 +217,7 @@ def test_schunk_cframe(contiguous, urlpath, mode, mmap_mode, cparams, dparams, n continue if key == "blocksize" and cparams_dict[key] == 0: continue - assert schunk2.cparams[key] == cparams_dict[key] + assert getattr(schunk2.cparams, key) == cparams_dict[key] data2 = np.empty(data.shape, dtype=data.dtype) schunk2.get_slice(out=data2) @@ -236,33 +236,33 @@ def test_schunk_cframe(contiguous, urlpath, mode, mmap_mode, cparams, dparams, n "cparams, dparams, new_cparams, new_dparams", [ ( - {"codec": blosc2.Codec.LZ4, "clevel": 6, "typesize": 4}, + blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=6, typesize=4), {}, - {"codec": blosc2.Codec.LZ4, "clevel": 6, "typesize": 4}, - {"nthreads": 4}, + blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=6, typesize=4), + blosc2.DParams(nthreads=4), ), ( {"typesize": 4}, - {"nthreads": 4}, - {"codec": blosc2.Codec.ZLIB, 
"splitmode": blosc2.SplitMode.ALWAYS_SPLIT}, - {"nthreads": 1}, + blosc2.DParams(nthreads=4), + blosc2.CParams(codec=blosc2.Codec.ZLIB, splitmode=blosc2.SplitMode.ALWAYS_SPLIT), + blosc2.DParams(nthreads=1), ), ( {"codec": blosc2.Codec.ZLIB, "splitmode": blosc2.SplitMode.ALWAYS_SPLIT}, {}, - { - "splitmode": blosc2.SplitMode.ALWAYS_SPLIT, - "nthreads": 5, - "typesize": 4, - "filters": [blosc2.Filter.SHUFFLE, blosc2.Filter.TRUNC_PREC], - }, - {"nthreads": 16}, + blosc2.CParams( + splitmode=blosc2.SplitMode.ALWAYS_SPLIT, + nthreads=5, + typesize=4, + filters=[blosc2.Filter.SHUFFLE, blosc2.Filter.TRUNC_PREC], + ), + blosc2.DParams(nthreads=16), ), ( - {"codec": blosc2.Codec.LZ4HC, "typesize": 4}, - {}, - {"filters": [blosc2.Filter.SHUFFLE, blosc2.Filter.TRUNC_PREC]}, - {"nthreads": 3}, + blosc2.CParams(codec=blosc2.Codec.LZ4HC, typesize=4), + blosc2.DParams(), + blosc2.CParams(filters=[blosc2.Filter.SHUFFLE, blosc2.Filter.TRUNC_PREC]), + blosc2.DParams(nthreads=3), ), ], ) @@ -273,39 +273,19 @@ def test_schunk_cdparams(cparams, dparams, new_cparams, new_dparams): schunk = blosc2.SChunk(chunksize=chunk_len * 4, **storage) # Check cparams have been set correctly - for key in cparams: - assert schunk.cparams[key] == cparams[key] - for key in dparams: - assert schunk.dparams[key] == dparams[key] + cparams_dict = cparams if isinstance(cparams, dict) else asdict(cparams) + dparams_dict = dparams if isinstance(dparams, dict) else asdict(dparams) + for key in cparams_dict: + assert getattr(schunk.cparams, key) == cparams_dict[key] + for key in dparams_dict: + assert getattr(schunk.dparams, key) == dparams_dict[key] schunk.cparams = new_cparams schunk.dparams = new_dparams - for key in schunk.cparams: - if key in new_cparams: - if key == "filters": - assert schunk.cparams[key][: len(new_cparams[key])] == new_cparams[key] - else: - assert schunk.cparams[key] == new_cparams[key] - elif key in cparams: - if key == "filters": - assert schunk.cparams[key][: len(cparams[key])] == 
cparams[key] - else: - assert schunk.cparams[key] == cparams[key] + for field in fields(schunk.cparams): + if field.name in ["filters", "filters_meta"]: + assert getattr(schunk.cparams, field.name)[: len(getattr(new_cparams, field.name))] == getattr(new_cparams, field.name) else: - if key == "filters": - assert schunk.cparams[key][: len(blosc2.cparams_dflts[key])] == blosc2.cparams_dflts[key] - elif key == "filters_meta": - # Exception for testing bytedelta in the last position - assert ( - schunk.cparams[key][: len(blosc2.cparams_dflts[key]) - 1] - == blosc2.cparams_dflts[key][:-1] - ) - else: - assert schunk.cparams[key] == blosc2.cparams_dflts[key] - - if "nthreads" in new_dparams: - assert schunk.dparams["nthreads"] == new_dparams["nthreads"] - elif "nthreads" in dparams: - assert schunk.dparams["nthreads"] == dparams["nthreads"] - else: - assert schunk.dparams["nthreads"] == blosc2.dparams_dflts["nthreads"] + assert getattr(schunk.cparams, field.name) == getattr(new_cparams, field.name) + + assert schunk.dparams.nthreads == new_dparams.nthreads From afbc1f25fb47ce6d6cf2c2d6bd6b1330cedbab44 Mon Sep 17 00:00:00 2001 From: Marta Iborra Date: Mon, 23 Sep 2024 10:23:23 +0200 Subject: [PATCH 10/18] Further testing and fixing. 
WIP --- src/blosc2/core.py | 2 +- src/blosc2/ndarray.py | 42 +++++--- src/blosc2/storage.py | 12 ++- tests/ndarray/test_c2array_udf.py | 6 +- tests/ndarray/test_copy.py | 4 +- tests/ndarray/test_empty.py | 11 ++- tests/ndarray/test_full.py | 6 +- tests/ndarray/test_lazyexpr.py | 8 +- tests/test_storage.py | 153 ++++++++++++++++++++++++++++++ tests/test_ucodecs.py | 5 +- tests/test_ufilters.py | 4 +- 11 files changed, 215 insertions(+), 38 deletions(-) create mode 100644 tests/test_storage.py diff --git a/src/blosc2/core.py b/src/blosc2/core.py index 093a8264..2f5ab25b 100644 --- a/src/blosc2/core.py +++ b/src/blosc2/core.py @@ -1447,7 +1447,7 @@ def decompress2(src: object, dst: object | bytearray = None, **kwargs: dict) -> Decompression parameters. The default values are in :class:`blosc2.DParams`. Keyword arguments supported: - cparams: :class:`blosc2.DParams` + dparams: :class:`blosc2.DParams` All the decompression parameters that you want to use as a :class:`blosc2.DParams` instance. 
others: Any diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index 4964a3f4..52bfdab5 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -1289,13 +1289,15 @@ def copy(self, dtype: np.dtype = None, **kwargs: dict) -> NDArray: """ if dtype is None: dtype = self.dtype - kwargs["cparams"] = kwargs.get("cparams", asdict(self.schunk.cparams)).copy() - kwargs["dparams"] = kwargs.get("dparams", asdict(self.schunk.dparams)).copy() + kwargs["cparams"] = kwargs.get("cparams").copy() if isinstance(kwargs.get("cparams"), dict) \ + else asdict(self.schunk.cparams) + kwargs["dparams"] = kwargs.get("dparams").copy() if isinstance(kwargs.get("dparams"), dict) \ + else asdict(self.schunk.dparams) if "meta" not in kwargs: # Copy metalayers as well meta_dict = {meta: self.schunk.meta[meta] for meta in self.schunk.meta} kwargs["meta"] = meta_dict - _check_ndarray_kwargs(**kwargs) + kwargs = _check_ndarray_kwargs(**kwargs) return super().copy(dtype, **kwargs) @@ -1370,7 +1372,7 @@ def slice(self, key: int | slice | Sequence[slice], **kwargs: dict) -> NDArray: >>> print(type(c)) """ - _check_ndarray_kwargs(**kwargs) + kwargs = _check_ndarray_kwargs(**kwargs) key, mask = process_key(key, self.shape) start, stop, step = get_ndarray_start_stop(self.ndim, key, self.shape) key = (start, stop) @@ -2329,7 +2331,7 @@ def empty(shape: int | tuple | list, dtype: np.dtype = np.uint8, **kwargs: dict) dtype('int32') """ shape = _check_shape(shape) - _check_ndarray_kwargs(**kwargs) + kwargs = _check_ndarray_kwargs(**kwargs) chunks = kwargs.pop("chunks", None) blocks = kwargs.pop("blocks", None) chunks, blocks = compute_chunks_blocks(shape, chunks, blocks, dtype, **kwargs) @@ -2362,7 +2364,7 @@ def uninit(shape: int | tuple | list, dtype: np.dtype = np.uint8, **kwargs: dict dtype('float64') """ shape = _check_shape(shape) - _check_ndarray_kwargs(**kwargs) + kwargs = _check_ndarray_kwargs(**kwargs) chunks = kwargs.pop("chunks", None) blocks = kwargs.pop("blocks", None) chunks, 
blocks = compute_chunks_blocks(shape, chunks, blocks, dtype, **kwargs) @@ -2395,7 +2397,7 @@ def nans(shape: int | tuple | list, dtype: np.dtype = np.float64, **kwargs: dict dtype('float64') """ shape = _check_shape(shape) - _check_ndarray_kwargs(**kwargs) + kwargs = _check_ndarray_kwargs(**kwargs) chunks = kwargs.pop("chunks", None) blocks = kwargs.pop("blocks", None) chunks, blocks = compute_chunks_blocks(shape, chunks, blocks, dtype, **kwargs) @@ -2434,7 +2436,7 @@ def zeros(shape: int | tuple | list, dtype: np.dtype = np.uint8, **kwargs: dict) dtype('float64') """ shape = _check_shape(shape) - _check_ndarray_kwargs(**kwargs) + kwargs = _check_ndarray_kwargs(**kwargs) chunks = kwargs.pop("chunks", None) blocks = kwargs.pop("blocks", None) chunks, blocks = compute_chunks_blocks(shape, chunks, blocks, dtype, **kwargs) @@ -2487,7 +2489,7 @@ def full(shape: int | tuple | list, fill_value: bytes | int | float | bool, dtyp if dtype is None: dtype = np.dtype(type(fill_value)) shape = _check_shape(shape) - _check_ndarray_kwargs(**kwargs) + kwargs = _check_ndarray_kwargs(**kwargs) chunks = kwargs.pop("chunks", None) blocks = kwargs.pop("blocks", None) chunks, blocks = compute_chunks_blocks(shape, chunks, blocks, dtype, **kwargs) @@ -2534,7 +2536,7 @@ def frombuffer( >>> a = blosc2.frombuffer(buffer, shape, chunks=chunks, dtype=dtype) """ shape = _check_shape(shape) - _check_ndarray_kwargs(**kwargs) + kwargs = _check_ndarray_kwargs(**kwargs) chunks = kwargs.pop("chunks", None) blocks = kwargs.pop("blocks", None) chunks, blocks = compute_chunks_blocks(shape, chunks, blocks, dtype, **kwargs) @@ -2597,7 +2599,7 @@ def asarray(array: np.ndarray | blosc2.C2Array, **kwargs: dict | list) -> NDArra >>> # Create a NDArray from a NumPy array >>> nda = blosc2.asarray(a) """ - _check_ndarray_kwargs(**kwargs) + kwargs = _check_ndarray_kwargs(**kwargs) chunks = kwargs.pop("chunks", None) blocks = kwargs.pop("blocks", None) # Use the chunks and blocks from the array if they are not 
passed @@ -2648,6 +2650,20 @@ def asarray(array: np.ndarray | blosc2.C2Array, **kwargs: dict | list) -> NDArra def _check_ndarray_kwargs(**kwargs): + if "storage" in kwargs: + for key in kwargs: + if key in list(blosc2.Storage.__annotations__): + raise AttributeError("Cannot pass both `storage` and other kwargs already included in Storage") + storage = kwargs.get("storage") + del kwargs["storage"] + kwargs = {**kwargs, **asdict(storage)} + else: + cparams = kwargs.get("cparams", {}) + cparams = cparams if isinstance(cparams, dict) else asdict(cparams) + dparams = kwargs.get("dparams", {}) + dparams = dparams if isinstance(dparams, dict) else asdict(dparams) + kwargs["cparams"] = cparams + kwargs["dparams"] = dparams supported_keys = [ "chunks", "blocks", @@ -2659,17 +2675,21 @@ def _check_ndarray_kwargs(**kwargs): "mode", "mmap_mode", "initial_mapping_size", + "storage", ] for key in kwargs: if key not in supported_keys: raise KeyError( f"Only {supported_keys} are supported as keyword arguments, and you passed '{key}'" ) + if "cparams" in kwargs and "chunks" in kwargs["cparams"]: raise ValueError("You cannot pass chunks in cparams, use `chunks` argument instead") if "cparams" in kwargs and "blocks" in kwargs["cparams"]: raise ValueError("You cannot pass chunks in cparams, use `blocks` argument instead") + return kwargs + def get_slice_nchunks(schunk: blosc2.SChunk, key: tuple[(int, int)] | int | slice | Sequence[slice] diff --git a/src/blosc2/storage.py b/src/blosc2/storage.py index ccb001fe..59292032 100644 --- a/src/blosc2/storage.py +++ b/src/blosc2/storage.py @@ -44,7 +44,7 @@ class CParams: Parameters ---------- - codec: :class:`Codec` + codec: :class:`Codec` or int The compressor code. Default is :py:obj:`Codec.ZSTD `. codec_meta: int The metadata for the compressor code, 0 by default. @@ -65,7 +65,7 @@ class CParams: splitmode: :class:`SplitMode` The split mode for the blocks. The default value is :py:obj:`SplitMode.ALWAYS_SPLIT `. 
- filters: :class:`Filter` list + filters: :class:`Filter` or int list The sequence of filters. Default: [:py:obj:`Filter.NOFILTER `, :py:obj:`Filter.NOFILTER `, :py:obj:`Filter.NOFILTER `, :py:obj:`Filter.NOFILTER `, :py:obj:`Filter.NOFILTER `, :py:obj:`Filter.SHUFFLE `]. @@ -74,7 +74,7 @@ class CParams: tuner: :class:`Tuner` The tuner to use. Default: :py:obj:`Tuner.STUNE `. """ - codec: blosc2.Codec = blosc2.Codec.ZSTD + codec: blosc2.Codec | int = blosc2.Codec.ZSTD codec_meta: int = 0 clevel: int = 1 use_dict: bool = False @@ -82,7 +82,7 @@ class CParams: nthreads: int = field(default_factory=default_nthreads) blocksize: int = 0 splitmode: blosc2.SplitMode = blosc2.SplitMode.ALWAYS_SPLIT - filters: list[blosc2.Filter] = field(default_factory=default_filters) + filters: list[blosc2.Filter | int] = field(default_factory=default_filters) filters_meta: list[int] = field(default_factory=default_filters_meta) tuner: blosc2.Tuner = blosc2.Tuner.STUNE @@ -95,6 +95,10 @@ def __post_init__(self): if len(self.filters) > len(self.filters_meta): raise ValueError("Number of filters cannot exceed number of filters meta") + for i in range(len(self.filters)): + if self.filters_meta[i] == 0 and self.filters[i] == blosc2.Filter.BYTEDELTA: + self.filters_meta[i] = self.typesize + @dataclass class DParams: diff --git a/tests/ndarray/test_c2array_udf.py b/tests/ndarray/test_c2array_udf.py index d9c7357f..9e562e97 100644 --- a/tests/ndarray/test_c2array_udf.py +++ b/tests/ndarray/test_c2array_udf.py @@ -95,9 +95,7 @@ def test_getitem(chunks, blocks, slices, urlpath, contiguous, chunked_eval, c2su chunked_eval=chunked_eval, chunks=chunks, blocks=blocks, - urlpath=urlpath, - contiguous=contiguous, - dparams=dparams, + storage=blosc2.Storage(urlpath=urlpath, contiguous=contiguous, dparams=dparams), ) lazy_eval = expr[slices] np.testing.assert_allclose(lazy_eval, npc[slices]) @@ -107,6 +105,6 @@ def test_getitem(chunks, blocks, slices, urlpath, contiguous, chunked_eval, c2su assert 
res.schunk.urlpath is None assert res.schunk.contiguous == contiguous # Check dparams after a getitem and an eval - assert res.schunk.dparams["nthreads"] == dparams["nthreads"] + assert res.schunk.dparams.nthreads == dparams["nthreads"] blosc2.remove_urlpath(urlpath) diff --git a/tests/ndarray/test_copy.py b/tests/ndarray/test_copy.py index a5c6681d..5bf773de 100644 --- a/tests/ndarray/test_copy.py +++ b/tests/ndarray/test_copy.py @@ -27,7 +27,7 @@ def test_copy(shape, chunks1, blocks1, chunks2, blocks2, dtype): typesize = dtype.itemsize size = int(np.prod(shape)) buffer = bytes(size * typesize) - cparams1 = {"clevel": 2} + cparams1 = blosc2.CParams(clevel=2) a = blosc2.frombuffer(buffer, shape, dtype=dtype, chunks=chunks1, blocks=blocks1, cparams=cparams1) cparams2 = {"clevel": 5, "filters": [blosc2.Filter.BITSHUFFLE], "filters_meta": [0]} b = a.copy(chunks=chunks2, blocks=blocks2, cparams=cparams2) @@ -63,7 +63,7 @@ def test_copy_numpy(shape, chunks1, blocks1, chunks2, blocks2, dtype): else: nparray = np.arange(size, dtype=dtype).reshape(shape) a = blosc2.asarray(nparray, chunks=chunks1, blocks=blocks1) - cparams = {"clevel": 5, "filters": [blosc2.Filter.BITSHUFFLE], "filters_meta": [0]} + cparams = blosc2.CParams(clevel=5, filters=[blosc2.Filter.BITSHUFFLE], filters_meta=[0]) b = a.copy(chunks=chunks2, blocks=blocks2, cparams=cparams) assert b.dtype == nparray.dtype if dtype.kind == "V": diff --git a/tests/ndarray/test_empty.py b/tests/ndarray/test_empty.py index bbca8568..aada0083 100644 --- a/tests/ndarray/test_empty.py +++ b/tests/ndarray/test_empty.py @@ -65,16 +65,17 @@ def test_empty(shape, chunks, blocks, dtype, cparams, urlpath, contiguous): blosc2.remove_urlpath(urlpath) filters = cparams["filters"] - cparams["filters_meta"] = [0] * len(filters) + storage = blosc2.Storage(cparams=blosc2.CParams(**cparams), + dparams={"nthreads": 2}, + urlpath=urlpath, + contiguous=contiguous, + ) a = blosc2.empty( shape, chunks=chunks, blocks=blocks, dtype=dtype, - 
cparams=cparams, - dparams={"nthreads": 2}, - urlpath=urlpath, - contiguous=contiguous, + storage=storage, ) dtype = np.dtype(dtype) diff --git a/tests/ndarray/test_full.py b/tests/ndarray/test_full.py index adf326fe..a33bb8bc 100644 --- a/tests/ndarray/test_full.py +++ b/tests/ndarray/test_full.py @@ -33,7 +33,7 @@ (10, 10), b"sun", None, - {"codec": blosc2.Codec.LZ4HC, "clevel": 8, "use_dict": False, "nthreads": 2}, + blosc2.CParams(codec=blosc2.Codec.LZ4HC, clevel=8, use_dict=False, nthreads=2), {"nthreads": 2}, "full.b2nd", True, @@ -55,7 +55,7 @@ (11, 11), 123456789, None, - {"codec": blosc2.Codec.LZ4HC, "clevel": 8, "use_dict": False, "nthreads": 2}, + blosc2.CParams(codec=blosc2.Codec.LZ4HC, clevel=8, use_dict=False, nthreads=2), {"nthreads": 2}, None, True, @@ -71,7 +71,7 @@ def test_full(shape, chunks, blocks, fill_value, cparams, dparams, dtype, urlpat blocks=blocks, dtype=dtype, cparams=cparams, - dparams=dparams, + dparams=blosc2.DParams(**dparams), urlpath=urlpath, contiguous=contiguous, ) diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py index 3659600e..0c0edbb4 100644 --- a/tests/ndarray/test_lazyexpr.py +++ b/tests/ndarray/test_lazyexpr.py @@ -448,14 +448,14 @@ def test_params(array_fixture): urlpath = "eval_expr.b2nd" blosc2.remove_urlpath(urlpath) - cparams = {"nthreads": 2} + cparams = blosc2.CParams(nthreads=2) dparams = {"nthreads": 4} chunks = tuple(i // 2 for i in nres.shape) blocks = tuple(i // 4 for i in nres.shape) res = expr.eval(urlpath=urlpath, cparams=cparams, dparams=dparams, chunks=chunks, blocks=blocks) np.testing.assert_allclose(res[:], nres) assert res.schunk.urlpath == urlpath - assert res.schunk.cparams.nthreads == cparams["nthreads"] + assert res.schunk.cparams.nthreads == cparams.nthreads assert res.schunk.dparams.nthreads == dparams["nthreads"] assert res.chunks == chunks assert res.blocks == blocks @@ -493,8 +493,8 @@ def test_save(): chunks = tuple(i // 2 for i in nres.shape) blocks = tuple(i // 4 
for i in nres.shape) urlpath_eval = "eval_expr.b2nd" - res = expr.eval( - urlpath=urlpath_eval, cparams=cparams, dparams=dparams, mode="w", chunks=chunks, blocks=blocks + res = expr.eval(storage=blosc2.Storage(urlpath=urlpath_eval, cparams=cparams, dparams=dparams, mode="w"), + chunks=chunks, blocks=blocks ) np.testing.assert_allclose(res[:], nres, rtol=tol, atol=tol) diff --git a/tests/test_storage.py b/tests/test_storage.py new file mode 100644 index 00000000..519af538 --- /dev/null +++ b/tests/test_storage.py @@ -0,0 +1,153 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# This source code is licensed under a BSD-style license (found in the +# LICENSE file in the root directory of this source tree) +####################################################################### + +from dataclasses import asdict, fields + +import pytest + +import blosc2 + + +@pytest.mark.parametrize( + "urlpath, contiguous, mode, mmap_mode, cparams, dparams", + [ + (None, None, "w", None, blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=6, typesize=4), blosc2.DParams()), + (None, False, "a", None, {"typesize": 4}, blosc2.DParams()), + (None, None, "r", None, blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=6, typesize=4), blosc2.DParams(nthreads=4)), + (None, True, "a", None, blosc2.CParams(splitmode=blosc2.SplitMode.ALWAYS_SPLIT, nthreads=5, typesize=4), {}), + ("b2frame", None, "r", None, {"codec": blosc2.Codec.LZ4HC, "typesize": 4}, blosc2.DParams()), + ("b2frame", False, "a", None, blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=6, typesize=4), blosc2.DParams(nthreads=4)), + ("b2frame", True, "w", None, blosc2.CParams(splitmode=blosc2.SplitMode.ALWAYS_SPLIT, nthreads=5, typesize=4), {}), + ("b2frame", True, "r", "r", blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=6, typesize=4), blosc2.DParams()), + ("b2frame", None, "w", "w+", {"typesize": 4}, {}), + ], +) +def 
test_storage_values(contiguous, urlpath, mode, mmap_mode, cparams, dparams): + storage = blosc2.Storage(contiguous=contiguous, urlpath=urlpath, mode=mode, mmap_mode=mmap_mode, + cparams=cparams, dparams=dparams) + if contiguous is None: + if urlpath is not None: + assert storage.contiguous + else: + assert not storage.contiguous + else: + assert storage.contiguous == contiguous + + assert storage.urlpath == urlpath + assert storage.mode == mode + assert storage.mmap_mode == mmap_mode + assert storage.cparams == cparams + assert storage.dparams == dparams + + +def test_storage_defaults(): + storage = blosc2.Storage() + assert storage.contiguous == False + assert storage.urlpath is None + assert storage.mode == "a" + assert storage.mmap_mode is None + assert storage.initial_mapping_size is None + assert storage.cparams == blosc2.CParams() + assert storage.dparams == blosc2.DParams() + assert storage.meta is None + + +@pytest.mark.parametrize( + "urlpath, contiguous, cparams, dparams", + [ + (None, False, blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=6, typesize=4), blosc2.DParams()), + (None, True, {"typesize": 4}, blosc2.DParams(nthreads=4)), + ("b2frame", False, blosc2.CParams(splitmode=blosc2.SplitMode.ALWAYS_SPLIT, nthreads=5, typesize=4), {}), + ("b2frame", True, {"codec": blosc2.Codec.LZ4HC, "typesize": 4}, {}), + ], +) +def test_raises_storage(contiguous, urlpath, cparams, dparams): + storage = blosc2.Storage(contiguous=contiguous, urlpath=urlpath, + cparams=cparams, dparams=dparams) + blosc2.remove_urlpath(urlpath) + + for field in fields(blosc2.Storage): + with pytest.raises(AttributeError): + _ = blosc2.SChunk(storage=storage, **{str(field.name): {}}) + with pytest.raises(TypeError): + _ = blosc2.SChunk(**{str(field.name): {}}, **asdict(storage)) + + +@pytest.mark.parametrize( + "cparams", + [ + blosc2.CParams(codec=blosc2.Codec.LZ4, filters=[blosc2.Filter.BITSHUFFLE], tuner=blosc2.Tuner.BTUNE), + {"typesize": 4, 'filters': [blosc2.Filter.TRUNC_PREC, 
blosc2.Filter.DELTA], 'filters_meta': [0, 0]}, + blosc2.CParams(nthreads=5, filters=[blosc2.Filter.BITSHUFFLE, blosc2.Filter.BYTEDELTA], filters_meta=[0] * 3), + {"codec": blosc2.Codec.LZ4HC, "typesize": 4, 'filters': [blosc2.Filter.BYTEDELTA], 'tuner': blosc2.Tuner.BTUNE}, + ], +) +def test_cparams_values(cparams): + schunk = blosc2.SChunk(cparams=cparams) + cparams_dataclass = cparams if isinstance(cparams, blosc2.CParams) else blosc2.CParams(**cparams) + + for field in fields(cparams_dataclass): + if field.name in ['filters', 'filters_meta']: + assert getattr(schunk.cparams, field.name)[:len(getattr(cparams_dataclass, field.name))] == getattr(cparams_dataclass, field.name) + else: + assert getattr(schunk.cparams, field.name) == getattr(cparams_dataclass, field.name) + + +def test_cparams_defaults(): + cparams = blosc2.CParams() + assert cparams.codec == blosc2.Codec.ZSTD + assert cparams.codec_meta == 0 + assert cparams.splitmode == blosc2.SplitMode.ALWAYS_SPLIT + assert cparams.clevel == 1 + assert cparams.typesize == 8 + assert cparams.nthreads == blosc2.nthreads + assert cparams.filters == [blosc2.Filter.NOFILTER] * 5 + [blosc2.Filter.SHUFFLE] + assert cparams.filters_meta == [0] * 6 + assert not cparams.use_dict + assert cparams.blocksize == 0 + assert cparams.tuner == blosc2.Tuner.STUNE + + +def test_raises_cparams(): + cparams = blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=6, typesize=4) + for field in fields(blosc2.CParams): + with pytest.raises(ValueError): + _ = blosc2.SChunk(cparams=cparams, **{str(field.name): {}}) + with pytest.raises(AttributeError): + _ = blosc2.compress2(b"12345678" * 1000, cparams=cparams, **{str(field.name): {}}) + + +@pytest.mark.parametrize( + "dparams", + [ + (blosc2.DParams()), + (blosc2.DParams(nthreads=2)), + ({}), + ({'nthreads': 2}), + ], +) +def test_dparams_values(dparams): + schunk = blosc2.SChunk(dparams=dparams) + dparams_dataclass = dparams if isinstance(dparams, blosc2.DParams) else blosc2.DParams(**dparams) + + 
for field in fields(dparams_dataclass): + assert getattr(schunk.dparams, field.name) == getattr(dparams_dataclass, field.name) + + +def test_dparams_defaults(): + dparams = blosc2.DParams() + assert dparams.nthreads == blosc2.nthreads + + +def test_raises_dparams(): + dparams = blosc2.DParams() + for field in fields(blosc2.DParams): + with pytest.raises(ValueError): + _ = blosc2.SChunk(dparams=dparams, **{str(field.name): {}}) + with pytest.raises(AttributeError): + _ = blosc2.decompress2(b"12345678" * 1000, dparams=dparams, **{str(field.name): {}}) diff --git a/tests/test_ucodecs.py b/tests/test_ucodecs.py index d4ff8f07..a20e9abe 100644 --- a/tests/test_ucodecs.py +++ b/tests/test_ucodecs.py @@ -40,6 +40,7 @@ def test_ucodecs(contiguous, urlpath, cparams, nchunks, codec_name, id, dtype): chunk_len = 20 * 1000 blocksize = chunk_len * dtype.itemsize / 10 cparams["blocksize"] = blocksize + cparams["typesize"] = dtype.itemsize def encoder1(input, output, meta, schunk): nd_input = input.view(dtype) @@ -71,7 +72,7 @@ def decoder1(input, output, meta, schunk): data=data, contiguous=contiguous, urlpath=urlpath, - cparams=cparams, + cparams=blosc2.CParams(**cparams), dparams=dparams, ) @@ -149,5 +150,5 @@ def test_dynamic_ucodecs_error(cparams, dparams): chunksize=chunk_len * dtype.itemsize, data=data, cparams=cparams, - dparams=dparams, + dparams=blosc2.DParams(**dparams), ) diff --git a/tests/test_ufilters.py b/tests/test_ufilters.py index 27218b26..90b925bb 100644 --- a/tests/test_ufilters.py +++ b/tests/test_ufilters.py @@ -82,7 +82,7 @@ def backward2(input, output, meta, schunk): contiguous=contiguous, urlpath=urlpath, cparams=cparams, - dparams=dparams, + dparams=blosc2.DParams(**dparams), ) out = np.empty(chunk_len * nchunks, dtype=dtype) @@ -129,7 +129,7 @@ def backward(input, output, meta, schunk): _ = blosc2.SChunk( chunksize=chunk_len * dtype.itemsize, data=data, - cparams=cparams, + cparams=blosc2.CParams(**cparams), dparams=dparams, ) From 
57a5ae6f6c3fd7862dae385a15148923cdb3c965 Mon Sep 17 00:00:00 2001 From: Marta Iborra Date: Mon, 23 Sep 2024 10:49:29 +0200 Subject: [PATCH 11/18] Cannot test tuner --- tests/test_storage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_storage.py b/tests/test_storage.py index 519af538..20908791 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -81,10 +81,10 @@ def test_raises_storage(contiguous, urlpath, cparams, dparams): @pytest.mark.parametrize( "cparams", [ - blosc2.CParams(codec=blosc2.Codec.LZ4, filters=[blosc2.Filter.BITSHUFFLE], tuner=blosc2.Tuner.BTUNE), + blosc2.CParams(codec=blosc2.Codec.LZ4, filters=[blosc2.Filter.BITSHUFFLE]), {"typesize": 4, 'filters': [blosc2.Filter.TRUNC_PREC, blosc2.Filter.DELTA], 'filters_meta': [0, 0]}, blosc2.CParams(nthreads=5, filters=[blosc2.Filter.BITSHUFFLE, blosc2.Filter.BYTEDELTA], filters_meta=[0] * 3), - {"codec": blosc2.Codec.LZ4HC, "typesize": 4, 'filters': [blosc2.Filter.BYTEDELTA], 'tuner': blosc2.Tuner.BTUNE}, + {"codec": blosc2.Codec.LZ4HC, "typesize": 4, 'filters': [blosc2.Filter.BYTEDELTA]}, ], ) def test_cparams_values(cparams): From cb7a849d61afcef8b7128f5c7afa8f563ccc7b3b Mon Sep 17 00:00:00 2001 From: Marta Iborra Date: Mon, 23 Sep 2024 12:41:15 +0200 Subject: [PATCH 12/18] Move cparams and dparams out of storage --- src/blosc2/ndarray.py | 29 +++++++++-------- src/blosc2/schunk.py | 16 +++++++--- src/blosc2/storage.py | 6 ---- tests/ndarray/test_c2array_udf.py | 3 +- tests/ndarray/test_empty.py | 8 ++--- tests/ndarray/test_full.py | 4 +-- tests/ndarray/test_lazyexpr.py | 4 +-- tests/ndarray/test_ndarray.py | 4 +-- tests/test_iterchunks.py | 8 ++--- tests/test_open.py | 4 +-- tests/test_pathlib.py | 4 +-- tests/test_proxy_schunk.py | 8 ++--- tests/test_schunk.py | 22 +++++++------ tests/test_schunk_constructor.py | 12 +++---- tests/test_schunk_delete.py | 8 ++--- tests/test_schunk_get_slice.py | 16 ++++------ tests/test_schunk_get_slice_nchunks.py | 4 +-- 
tests/test_schunk_insert.py | 9 ++---- tests/test_schunk_set_slice.py | 16 +++++----- tests/test_schunk_update.py | 8 ++--- tests/test_storage.py | 44 +++++++++++--------------- tests/test_vlmeta.py | 8 ++--- 22 files changed, 118 insertions(+), 127 deletions(-) diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index 52bfdab5..bccf9284 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -2655,15 +2655,11 @@ def _check_ndarray_kwargs(**kwargs): if key in list(blosc2.Storage.__annotations__): raise AttributeError("Cannot pass both `storage` and other kwargs already included in Storage") storage = kwargs.get("storage") - del kwargs["storage"] - kwargs = {**kwargs, **asdict(storage)} - else: - cparams = kwargs.get("cparams", {}) - cparams = cparams if isinstance(cparams, dict) else asdict(cparams) - dparams = kwargs.get("dparams", {}) - dparams = dparams if isinstance(dparams, dict) else asdict(dparams) - kwargs["cparams"] = cparams - kwargs["dparams"] = dparams + if isinstance(storage, blosc2.Storage): + kwargs = {**kwargs, **asdict(storage)} + else: + kwargs = {**kwargs, **storage} + supported_keys = [ "chunks", "blocks", @@ -2683,10 +2679,17 @@ def _check_ndarray_kwargs(**kwargs): f"Only {supported_keys} are supported as keyword arguments, and you passed '{key}'" ) - if "cparams" in kwargs and "chunks" in kwargs["cparams"]: - raise ValueError("You cannot pass chunks in cparams, use `chunks` argument instead") - if "cparams" in kwargs and "blocks" in kwargs["cparams"]: - raise ValueError("You cannot pass chunks in cparams, use `blocks` argument instead") + if "cparams" in kwargs: + if isinstance(kwargs["cparams"], blosc2.CParams): + kwargs["cparams"] = asdict(kwargs["cparams"]) + else: + if "chunks" in kwargs["cparams"]: + raise ValueError("You cannot pass chunks in cparams, use `chunks` argument instead") + if "blocks" in kwargs["cparams"]: + raise ValueError("You cannot pass chunks in cparams, use `blocks` argument instead") + if "dparams" in 
kwargs: + if isinstance(kwargs["dparams"], blosc2.DParams): + kwargs["dparams"] = asdict(kwargs["dparams"]) return kwargs diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index 7da6d7ef..1273ea5e 100644 --- a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -160,8 +160,14 @@ def __init__(self, chunksize: int = None, data: object = None, **kwargs: dict): Storage parameters. The default values are in :class:`blosc2.Storage`. Keyword arguments supported: storage: :class:`blosc2.Storage` + All the storage parameters that you want to use as + a :class:`blosc2.Storage` or dict instance. + cparams: :class:`blosc2.CParams` or dict + All the compression parameters that you want to use as + a :class:`blosc2.CParams` or dict instance. + dparams: :class:`blosc2.DParams` or dict All the decompression parameters that you want to use as - a :class:`blosc2.Storage` instance. + a :class:`blosc2.DParams` or dict instance. others: Any If `storage` is not passed, all the parameters of a :class:`blosc2.Storage` can be passed as keyword arguments. 
@@ -227,11 +233,13 @@ def __init__(self, chunksize: int = None, data: object = None, **kwargs: dict): if kwarg not in allowed_kwargs: raise ValueError(f"{kwarg} is not supported as keyword argument") if kwargs.get("storage") is not None: - if any(key not in ["_is_view", "_schunk", "storage"] for key in kwargs.keys()): + if any(key in list(blosc2.Storage.__annotations__) for key in kwargs.keys()): raise AttributeError("Cannot pass both `storage` and other kwargs already included in Storage") storage = kwargs.get("storage") - del kwargs["storage"] - kwargs = {**kwargs, **asdict(storage)} + if isinstance(storage, blosc2.Storage): + kwargs = {**kwargs, **asdict(storage)} + else: + kwargs = {**kwargs, **storage} if isinstance(kwargs.get("cparams"), blosc2.CParams): kwargs["cparams"] = asdict(kwargs.get("cparams")) diff --git a/src/blosc2/storage.py b/src/blosc2/storage.py index 59292032..6d1ba296 100644 --- a/src/blosc2/storage.py +++ b/src/blosc2/storage.py @@ -127,10 +127,6 @@ class Storage: If the storage is persistent, the name of the file (when `contiguous = True`) or the directory (if `contiguous = False`). If the storage is in-memory, then this field is `None`. - cparams: :class:`CParams` or dict - The compression parameters as a :class:`CParams` instance or a dictionary. - dparams: :class:`DParams` or dict - The decompression parameters as a :class:`DParams` instance or a dictionary. 
mode: str, optional Persistence mode: ‘r’ means read only (must exist); ‘a’ means read/write (create if it doesn’t exist); @@ -206,8 +202,6 @@ class Storage: """ contiguous: bool = None urlpath: str = None - cparams: CParams | dict = field(default_factory=CParams) - dparams: DParams | dict = field(default_factory=DParams) mode: str = 'a' mmap_mode: str = None initial_mapping_size: int = None diff --git a/tests/ndarray/test_c2array_udf.py b/tests/ndarray/test_c2array_udf.py index 9e562e97..e13f631f 100644 --- a/tests/ndarray/test_c2array_udf.py +++ b/tests/ndarray/test_c2array_udf.py @@ -95,7 +95,8 @@ def test_getitem(chunks, blocks, slices, urlpath, contiguous, chunked_eval, c2su chunked_eval=chunked_eval, chunks=chunks, blocks=blocks, - storage=blosc2.Storage(urlpath=urlpath, contiguous=contiguous, dparams=dparams), + storage=blosc2.Storage(urlpath=urlpath, contiguous=contiguous), + dparams=dparams, ) lazy_eval = expr[slices] np.testing.assert_allclose(lazy_eval, npc[slices]) diff --git a/tests/ndarray/test_empty.py b/tests/ndarray/test_empty.py index aada0083..bf405f70 100644 --- a/tests/ndarray/test_empty.py +++ b/tests/ndarray/test_empty.py @@ -65,17 +65,15 @@ def test_empty(shape, chunks, blocks, dtype, cparams, urlpath, contiguous): blosc2.remove_urlpath(urlpath) filters = cparams["filters"] - storage = blosc2.Storage(cparams=blosc2.CParams(**cparams), - dparams={"nthreads": 2}, - urlpath=urlpath, - contiguous=contiguous, - ) + storage = blosc2.Storage(urlpath=urlpath, contiguous=contiguous) a = blosc2.empty( shape, chunks=chunks, blocks=blocks, dtype=dtype, storage=storage, + cparams=blosc2.CParams(**cparams), + dparams={"nthreads": 2}, ) dtype = np.dtype(dtype) diff --git a/tests/ndarray/test_full.py b/tests/ndarray/test_full.py index a33bb8bc..4f7a5e81 100644 --- a/tests/ndarray/test_full.py +++ b/tests/ndarray/test_full.py @@ -64,6 +64,7 @@ ) def test_full(shape, chunks, blocks, fill_value, cparams, dparams, dtype, urlpath, contiguous): 
blosc2.remove_urlpath(urlpath) + storage = {"urlpath": urlpath, "contiguous": contiguous} a = blosc2.full( shape, fill_value, @@ -72,8 +73,7 @@ def test_full(shape, chunks, blocks, fill_value, cparams, dparams, dtype, urlpat dtype=dtype, cparams=cparams, dparams=blosc2.DParams(**dparams), - urlpath=urlpath, - contiguous=contiguous, + **storage, ) assert asdict(a.schunk.dparams) == dparams if isinstance(fill_value, bytes): diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py index 0c0edbb4..825cc463 100644 --- a/tests/ndarray/test_lazyexpr.py +++ b/tests/ndarray/test_lazyexpr.py @@ -493,8 +493,8 @@ def test_save(): chunks = tuple(i // 2 for i in nres.shape) blocks = tuple(i // 4 for i in nres.shape) urlpath_eval = "eval_expr.b2nd" - res = expr.eval(storage=blosc2.Storage(urlpath=urlpath_eval, cparams=cparams, dparams=dparams, mode="w"), - chunks=chunks, blocks=blocks + res = expr.eval(storage=blosc2.Storage(urlpath=urlpath_eval, mode="w"), + chunks=chunks, blocks=blocks, cparams=cparams, dparams=dparams, ) np.testing.assert_allclose(res[:], nres, rtol=tol, atol=tol) diff --git a/tests/ndarray/test_ndarray.py b/tests/ndarray/test_ndarray.py index 597668ca..f6564f1d 100644 --- a/tests/ndarray/test_ndarray.py +++ b/tests/ndarray/test_ndarray.py @@ -25,11 +25,11 @@ ) @pytest.mark.parametrize("copy", [True, False]) def test_ndarray_cframe(contiguous, urlpath, cparams, dparams, nchunks, copy): - storage = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} + storage = {"contiguous": contiguous, "urlpath": urlpath} blosc2.remove_urlpath(urlpath) data = np.arange(200 * 1000 * nchunks, dtype="int32").reshape(200, 1000, nchunks) - ndarray = blosc2.asarray(data, **storage) + ndarray = blosc2.asarray(data, storage=storage, cparams=cparams, dparams=dparams) cframe = ndarray.to_cframe() ndarray2 = blosc2.ndarray_from_cframe(cframe, copy) diff --git a/tests/test_iterchunks.py b/tests/test_iterchunks.py index 
2ecfdef3..8c1a8bfb 100644 --- a/tests/test_iterchunks.py +++ b/tests/test_iterchunks.py @@ -24,10 +24,10 @@ ], ) def test_iterchunks(contiguous, urlpath, cparams, dparams, nchunks): - storage = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} + kwargs = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} blosc2.remove_urlpath(urlpath) - schunk = blosc2.SChunk(chunksize=200 * 1000 * 4, **storage) + schunk = blosc2.SChunk(chunksize=200 * 1000 * 4, **kwargs) for i in range(nchunks): buffer = i * np.arange(200 * 1000, dtype="int32") @@ -54,12 +54,12 @@ def test_iterchunks(contiguous, urlpath, cparams, dparams, nchunks): ], ) def test_iterchunks_pf(contiguous, urlpath, cparams, dparams, nchunks): - storage = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} + kwargs = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} blosc2.remove_urlpath(urlpath) chunkshape = 200 * 1000 data = np.arange(0, nchunks * chunkshape, dtype=np.int32) - schunk = blosc2.SChunk(chunksize=chunkshape * 4, data=data, **storage) + schunk = blosc2.SChunk(chunksize=chunkshape * 4, data=data, **kwargs) @schunk.postfilter(np.int32, np.int32) def postf1(input, output, offset): diff --git a/tests/test_open.py b/tests/test_open.py index 2ed39535..839acbc2 100644 --- a/tests/test_open.py +++ b/tests/test_open.py @@ -43,11 +43,11 @@ def test_open(contiguous, urlpath, cparams, dparams, nchunks, chunk_nitems, dtyp if os.name == "nt" and mmap_mode == "c": pytest.skip("Cannot test mmap_mode 'c' on Windows") - storage = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} + kwargs = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} blosc2.remove_urlpath(urlpath) dtype = np.dtype(dtype) schunk = blosc2.SChunk( - chunksize=chunk_nitems * dtype.itemsize, mmap_mode="w+" if mmap_mode is not None else 
None, **storage + chunksize=chunk_nitems * dtype.itemsize, mmap_mode="w+" if mmap_mode is not None else None, **kwargs ) for i in range(nchunks): buffer = i * np.arange(chunk_nitems, dtype=dtype) diff --git a/tests/test_pathlib.py b/tests/test_pathlib.py index 9da3433b..f3c7f785 100644 --- a/tests/test_pathlib.py +++ b/tests/test_pathlib.py @@ -23,12 +23,12 @@ ) def test_schunk_pathlib(mode, mmap_mode, cparams, dparams, nchunks): urlpath = pathlib.Path("b2frame") - storage = {"urlpath": urlpath, "cparams": cparams, "dparams": dparams} + kwargs = {"urlpath": urlpath, "cparams": cparams, "dparams": dparams} blosc2.remove_urlpath(urlpath) if mode != "r": chunk_len = 200 * 1000 - schunk = blosc2.SChunk(chunksize=chunk_len * 4, mode=mode, mmap_mode=mmap_mode, **storage) + schunk = blosc2.SChunk(chunksize=chunk_len * 4, mode=mode, mmap_mode=mmap_mode, **kwargs) assert schunk.urlpath == str(urlpath) for i in range(nchunks): diff --git a/tests/test_proxy_schunk.py b/tests/test_proxy_schunk.py index caa233c0..607efaf3 100644 --- a/tests/test_proxy_schunk.py +++ b/tests/test_proxy_schunk.py @@ -22,11 +22,11 @@ ], ) def test_schunk_proxy(contiguous, urlpath, chunksize, nchunks, start, stop): - storage = {"contiguous": contiguous, "cparams": {"typesize": 4}} + kwargs = {"contiguous": contiguous, "cparams": {"typesize": 4}} blosc2.remove_urlpath(urlpath) num_elem = chunksize // 4 * nchunks data = np.arange(num_elem, dtype="int32") - schunk = blosc2.SChunk(chunksize=chunksize, data=data, **storage) + schunk = blosc2.SChunk(chunksize=chunksize, data=data, **kwargs) bytes_obj = data.tobytes() cache = blosc2.Proxy(schunk, urlpath=urlpath) @@ -58,13 +58,13 @@ def test_schunk_proxy(contiguous, urlpath, chunksize, nchunks, start, stop): ], ) def test_open(urlpath, chunksize, nchunks): - storage = {"urlpath": urlpath, "cparams": {"typesize": 4}} + kwargs = {"urlpath": urlpath, "cparams": {"typesize": 4}} proxy_urlpath = "proxy.b2frame" blosc2.remove_urlpath(urlpath) 
blosc2.remove_urlpath(proxy_urlpath) num_elem = chunksize // 4 * nchunks data = np.arange(num_elem, dtype="int32") - schunk = blosc2.SChunk(chunksize=chunksize, data=data, **storage) + schunk = blosc2.SChunk(chunksize=chunksize, data=data, **kwargs) bytes_obj = data.tobytes() proxy = blosc2.Proxy(schunk, urlpath=proxy_urlpath) del proxy diff --git a/tests/test_schunk.py b/tests/test_schunk.py index 31915627..db27362e 100644 --- a/tests/test_schunk.py +++ b/tests/test_schunk.py @@ -45,18 +45,18 @@ ], ) def test_schunk_numpy(contiguous, urlpath, mode, mmap_mode, cparams, dparams, nchunks): - storage = blosc2.Storage(contiguous=contiguous, urlpath=urlpath, mode=mode, mmap_mode=mmap_mode, - cparams=cparams, dparams=dparams) + storage = blosc2.Storage(contiguous=contiguous, urlpath=urlpath, mode=mode, mmap_mode=mmap_mode) blosc2.remove_urlpath(urlpath) chunk_len = 200 * 1000 if mode != "r": - schunk = blosc2.SChunk(chunksize=chunk_len * 4, storage=storage) + schunk = blosc2.SChunk(chunksize=chunk_len * 4, storage=storage, cparams=cparams, dparams=dparams) + else: with pytest.raises( ValueError, match="not specify a urlpath" if urlpath is None else "does not exist" ): - blosc2.SChunk(chunksize=chunk_len * 4, storage=storage) + blosc2.SChunk(chunksize=chunk_len * 4, storage=storage, cparams=cparams, dparams=dparams) # Create a schunk which we can read later storage2 = replace(storage, @@ -65,6 +65,8 @@ def test_schunk_numpy(contiguous, urlpath, mode, mmap_mode, cparams, dparams, nc schunk = blosc2.SChunk( chunksize=chunk_len * 4, storage=storage2, + cparams=cparams, + dparams=dparams ) assert schunk.urlpath == urlpath @@ -143,13 +145,13 @@ def test_schunk_ndarray(tmp_path, mode_write, mode_read, mmap_mode_write, mmap_m ], ) def test_schunk(contiguous, urlpath, mode, mmap_mode, nbytes, cparams, dparams, nchunks): - storage = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} + kwargs = {"contiguous": contiguous, "urlpath": urlpath, 
"cparams": cparams, "dparams": dparams} numpy_meta = {b"dtype": str(np.dtype(np.uint8))} test_meta = {b"lorem": 1234} meta = {"numpy": numpy_meta, "test": test_meta} blosc2.remove_urlpath(urlpath) - schunk = blosc2.SChunk(chunksize=2 * nbytes, meta=meta, mode=mode, mmap_mode=mmap_mode, **storage) + schunk = blosc2.SChunk(chunksize=2 * nbytes, meta=meta, mode=mode, mmap_mode=mmap_mode, **kwargs) assert "numpy" in schunk.meta assert "error" not in schunk.meta @@ -201,12 +203,12 @@ def test_schunk(contiguous, urlpath, mode, mmap_mode, nbytes, cparams, dparams, ) @pytest.mark.parametrize("copy", [True, False]) def test_schunk_cframe(contiguous, urlpath, mode, mmap_mode, cparams, dparams, nchunks, copy): - storage = blosc2.Storage(contiguous=contiguous, urlpath=urlpath, cparams=cparams, dparams=dparams, + storage = blosc2.Storage(contiguous=contiguous, urlpath=urlpath, mode=mode, mmap_mode=mmap_mode) blosc2.remove_urlpath(urlpath) data = np.arange(200 * 1000 * nchunks, dtype="int32") - schunk = blosc2.SChunk(chunksize=200 * 1000 * 4, data=data, **asdict(storage)) + schunk = blosc2.SChunk(chunksize=200 * 1000 * 4, data=data, **asdict(storage), cparams=cparams, dparams=dparams) cframe = schunk.to_cframe() schunk2 = blosc2.schunk_from_cframe(cframe, copy) @@ -267,10 +269,10 @@ def test_schunk_cframe(contiguous, urlpath, mode, mmap_mode, cparams, dparams, n ], ) def test_schunk_cdparams(cparams, dparams, new_cparams, new_dparams): - storage = {"cparams": cparams, "dparams": dparams} + kwargs = {"cparams": cparams, "dparams": dparams} chunk_len = 200 * 1000 - schunk = blosc2.SChunk(chunksize=chunk_len * 4, **storage) + schunk = blosc2.SChunk(chunksize=chunk_len * 4, **kwargs) # Check cparams have been set correctly cparams_dict = cparams if isinstance(cparams, dict) else asdict(cparams) diff --git a/tests/test_schunk_constructor.py b/tests/test_schunk_constructor.py index f04b51d2..dda1e956 100644 --- a/tests/test_schunk_constructor.py +++ b/tests/test_schunk_constructor.py 
@@ -24,13 +24,13 @@ ], ) def test_schunk_numpy(contiguous, urlpath, cparams, dparams, chunksize): - storage = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} + kwargs = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} blosc2.remove_urlpath(urlpath) num_elem = 20 * 1000 nchunks = num_elem * 4 // chunksize + 1 if num_elem * 4 % chunksize != 0 else num_elem * 4 // chunksize data = np.arange(num_elem, dtype="int32") bytes_obj = data.tobytes() - schunk = blosc2.SChunk(chunksize=chunksize, data=data, **storage) + schunk = blosc2.SChunk(chunksize=chunksize, data=data, **kwargs) # Test properties assert len(schunk) == num_elem assert chunksize == schunk.chunksize @@ -81,14 +81,14 @@ def test_schunk_numpy(contiguous, urlpath, cparams, dparams, chunksize): ], ) def test_schunk(contiguous, urlpath, cparams, dparams, chunksize): - storage = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} + storage = {"contiguous": contiguous, "urlpath": urlpath} blosc2.remove_urlpath(urlpath) nrep = 1000 nchunks = 5 * nrep // chunksize + 1 if nrep * 5 % chunksize != 0 else 5 * nrep // chunksize buffer = b"1234 " * nrep - schunk = blosc2.SChunk(chunksize=chunksize, data=buffer, **storage) + schunk = blosc2.SChunk(chunksize=chunksize, data=buffer, cparams=cparams, dparams=dparams, **storage) for i in range(nchunks): start = i * chunksize @@ -141,11 +141,11 @@ def test_schunk(contiguous, urlpath, cparams, dparams, chunksize): ], ) def test_schunk_fill_special(contiguous, urlpath, cparams, nitems, special_value, expected_value): - storage = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams} + kwargs = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams} blosc2.remove_urlpath(urlpath) chunk_len = 200 * 1000 - schunk = blosc2.SChunk(chunksize=chunk_len * 4, **storage) + schunk = blosc2.SChunk(chunksize=chunk_len * 4, **kwargs) if special_value in 
[blosc2.SpecialValue.ZERO, blosc2.SpecialValue.NAN, blosc2.SpecialValue.UNINIT]: schunk.fill_special(nitems, special_value) else: diff --git a/tests/test_schunk_delete.py b/tests/test_schunk_delete.py index 2a00a7bb..6156c291 100644 --- a/tests/test_schunk_delete.py +++ b/tests/test_schunk_delete.py @@ -26,7 +26,7 @@ ], ) def test_schunk_delete_numpy(contiguous, urlpath, nchunks, ndeletes): - storage = { + kwargs = { "contiguous": contiguous, "urlpath": urlpath, "cparams": {"nthreads": 2}, @@ -34,7 +34,7 @@ def test_schunk_delete_numpy(contiguous, urlpath, nchunks, ndeletes): } blosc2.remove_urlpath(urlpath) - schunk = blosc2.SChunk(chunksize=200 * 1000 * 4, **storage) + schunk = blosc2.SChunk(chunksize=200 * 1000 * 4, **kwargs) for i in range(nchunks): buffer = i * np.arange(200 * 1000, dtype="int32") nchunks_ = schunk.append_data(buffer) @@ -72,13 +72,11 @@ def test_schunk_delete(contiguous, urlpath, nchunks, ndeletes): storage = { "contiguous": contiguous, "urlpath": urlpath, - "cparams": {"nthreads": 2}, - "dparams": {"nthreads": 2}, } blosc2.remove_urlpath(urlpath) nbytes = 23401 - schunk = blosc2.SChunk(chunksize=nbytes * 2, **storage) + schunk = blosc2.SChunk(chunksize=nbytes * 2, cparams={"nthreads": 2}, dparams={"nthreads": 2}, **storage) for i in range(nchunks): bytes_obj = b"i " * nbytes nchunks_ = schunk.append_data(bytes_obj) diff --git a/tests/test_schunk_get_slice.py b/tests/test_schunk_get_slice.py index bf563ecf..d136460a 100644 --- a/tests/test_schunk_get_slice.py +++ b/tests/test_schunk_get_slice.py @@ -34,11 +34,11 @@ ], ) def test_schunk_get_slice(contiguous, urlpath, mode, cparams, dparams, nchunks, start, stop): - storage = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} + kwargs = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} blosc2.remove_urlpath(urlpath) data = np.arange(200 * 100 * nchunks, dtype="int32") - schunk = blosc2.SChunk(chunksize=200 * 100 * 4, 
data=data, mode=mode, **storage) + schunk = blosc2.SChunk(chunksize=200 * 100 * 4, data=data, mode=mode, **kwargs) start_, stop_ = start, stop if start is None: @@ -82,10 +82,8 @@ def test_schunk_get_slice(contiguous, urlpath, mode, cparams, dparams, nchunks, ], ) def test_schunk_getitem_int(cparams, nchunks, elem): - storage = {"cparams": cparams} - data = np.arange(200 * 100 * nchunks, dtype="int32") - schunk = blosc2.SChunk(chunksize=200 * 100 * 4, data=data, **storage) + schunk = blosc2.SChunk(chunksize=200 * 100 * 4, data=data, cparams=cparams) sl = data[elem] res = schunk[elem] @@ -93,12 +91,12 @@ def test_schunk_getitem_int(cparams, nchunks, elem): def test_schunk_get_slice_raises(): - storage = {"contiguous": True, "urlpath": "schunk.b2frame", "cparams": {"typesize": 4}, "dparams": {}} - blosc2.remove_urlpath(storage["urlpath"]) + kwargs = {"contiguous": True, "urlpath": "schunk.b2frame", "cparams": {"typesize": 4}, "dparams": {}} + blosc2.remove_urlpath(kwargs["urlpath"]) nchunks = 2 data = np.arange(200 * 100 * nchunks, dtype="int32") - schunk = blosc2.SChunk(chunksize=200 * 100 * 4, data=data, **storage) + schunk = blosc2.SChunk(chunksize=200 * 100 * 4, data=data, **kwargs) start = 200 * 100 stop = 200 * 100 * nchunks @@ -118,4 +116,4 @@ def test_schunk_get_slice_raises(): stop = start + 4 assert schunk[start:stop] == b"" - blosc2.remove_urlpath(storage["urlpath"]) + blosc2.remove_urlpath(kwargs["urlpath"]) diff --git a/tests/test_schunk_get_slice_nchunks.py b/tests/test_schunk_get_slice_nchunks.py index 15ce2c14..ecce9cf2 100644 --- a/tests/test_schunk_get_slice_nchunks.py +++ b/tests/test_schunk_get_slice_nchunks.py @@ -33,8 +33,8 @@ ], ) def test_schunk_get_slice(contiguous, urlpath, cparams, nchunks, start, stop): - storage = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams} - schunk = blosc2.SChunk(chunksize=200 * 100 * 4, mode="w", **storage) + kwargs = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams} + schunk = 
blosc2.SChunk(chunksize=200 * 100 * 4, mode="w", **kwargs) for i in range(nchunks): chunk = np.full(schunk.chunksize // schunk.typesize, i, dtype=np.int32) schunk.append_data(chunk) diff --git a/tests/test_schunk_insert.py b/tests/test_schunk_insert.py index a08f230f..18abc21a 100644 --- a/tests/test_schunk_insert.py +++ b/tests/test_schunk_insert.py @@ -30,15 +30,10 @@ @pytest.mark.parametrize("create_chunk", [True, False]) def test_schunk_insert_numpy(contiguous, urlpath, nchunks, ninserts, copy, create_chunk, gil): blosc2.set_releasegil(gil) - storage = { - "contiguous": contiguous, - "urlpath": urlpath, - "cparams": {"nthreads": 2}, - "dparams": {"nthreads": 2}, - } + storage = blosc2.Storage(contiguous=contiguous, urlpath=urlpath) blosc2.remove_urlpath(urlpath) - schunk = blosc2.SChunk(chunksize=200 * 1000 * 4, **storage) + schunk = blosc2.SChunk(chunksize=200 * 1000 * 4, storage=storage, cparams={"nthreads": 2}, dparams={"nthreads": 2}) for i in range(nchunks): buffer = i * np.arange(200 * 1000, dtype="int32") nchunks_ = schunk.append_data(buffer) diff --git a/tests/test_schunk_set_slice.py b/tests/test_schunk_set_slice.py index 3f75f48a..f5a26683 100644 --- a/tests/test_schunk_set_slice.py +++ b/tests/test_schunk_set_slice.py @@ -34,11 +34,11 @@ ], ) def test_schunk_set_slice(contiguous, urlpath, mode, cparams, dparams, nchunks, start, stop): - storage = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} + storage = {"contiguous": contiguous, "urlpath": urlpath, "mode": mode} blosc2.remove_urlpath(urlpath) data = np.arange(200 * 100 * nchunks, dtype="int32") - schunk = blosc2.SChunk(chunksize=200 * 100 * 4, data=data, mode=mode, **storage) + schunk = blosc2.SChunk(chunksize=200 * 100 * 4, data=data, storage=storage, cparams=cparams, dparams=dparams) _start, _stop = start, stop if _start is None: @@ -62,14 +62,14 @@ def test_schunk_set_slice(contiguous, urlpath, mode, cparams, dparams, nchunks, def 
test_schunk_set_slice_raises(): - storage = {"contiguous": True, "urlpath": "schunk.b2frame", "cparams": {"typesize": 4}, "dparams": {}} - blosc2.remove_urlpath(storage["urlpath"]) + kwargs = {"contiguous": True, "urlpath": "schunk.b2frame", "cparams": {"typesize": 4}, "dparams": {}} + blosc2.remove_urlpath(kwargs["urlpath"]) nchunks = 2 data = np.arange(200 * 100 * nchunks, dtype="int32") - blosc2.SChunk(chunksize=200 * 100 * 4, data=data, **storage) + blosc2.SChunk(chunksize=200 * 100 * 4, data=data, **kwargs) - schunk = blosc2.open(storage["urlpath"], mode="r") + schunk = blosc2.open(kwargs["urlpath"], mode="r") start = 200 * 100 stop = 200 * 100 * nchunks val = 3 * np.arange(start, stop, dtype="int32") @@ -77,7 +77,7 @@ def test_schunk_set_slice_raises(): with pytest.raises(ValueError): schunk[start:stop] = val - schunk = blosc2.open(storage["urlpath"], mode="a") + schunk = blosc2.open(kwargs["urlpath"], mode="a") with pytest.raises(IndexError): schunk[start:stop:2] = val @@ -95,4 +95,4 @@ def test_schunk_set_slice_raises(): with pytest.raises(ValueError): schunk[start:stop] = val - blosc2.remove_urlpath(storage["urlpath"]) + blosc2.remove_urlpath(kwargs["urlpath"]) diff --git a/tests/test_schunk_update.py b/tests/test_schunk_update.py index 533e483a..03a247b0 100644 --- a/tests/test_schunk_update.py +++ b/tests/test_schunk_update.py @@ -29,7 +29,7 @@ @pytest.mark.parametrize("create_chunk", [True, False]) def test_schunk_update_numpy(contiguous, urlpath, nchunks, nupdates, copy, create_chunk, gil): blosc2.set_releasegil(gil) - storage = { + kwargs = { "contiguous": contiguous, "urlpath": urlpath, "cparams": {"nthreads": 2}, @@ -37,7 +37,7 @@ def test_schunk_update_numpy(contiguous, urlpath, nchunks, nupdates, copy, creat } blosc2.remove_urlpath(urlpath) - schunk = blosc2.SChunk(chunksize=200 * 1000 * 4, **storage) + schunk = blosc2.SChunk(chunksize=200 * 1000 * 4, **kwargs) for i in range(nchunks): buffer = i * np.arange(200 * 1000, dtype="int32") nchunks_ = 
schunk.append_data(buffer) @@ -79,7 +79,7 @@ def test_schunk_update_numpy(contiguous, urlpath, nchunks, nupdates, copy, creat @pytest.mark.parametrize("create_chunk", [True, False]) def test_update(contiguous, urlpath, nchunks, nupdates, copy, create_chunk, gil): blosc2.set_releasegil(gil) - storage = { + kwargs = { "contiguous": contiguous, "urlpath": urlpath, "cparams": {"nthreads": 2}, @@ -89,7 +89,7 @@ def test_update(contiguous, urlpath, nchunks, nupdates, copy, create_chunk, gil) blosc2.remove_urlpath(urlpath) nbytes = 23401 - schunk = blosc2.SChunk(chunksize=nbytes * 2, **storage) + schunk = blosc2.SChunk(chunksize=nbytes * 2, **kwargs) for i in range(nchunks): bytes_obj = b"i " * nbytes nchunks_ = schunk.append_data(bytes_obj) diff --git a/tests/test_storage.py b/tests/test_storage.py index 20908791..2f63a92c 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -14,22 +14,21 @@ @pytest.mark.parametrize( - "urlpath, contiguous, mode, mmap_mode, cparams, dparams", + "urlpath, contiguous, mode, mmap_mode", [ - (None, None, "w", None, blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=6, typesize=4), blosc2.DParams()), - (None, False, "a", None, {"typesize": 4}, blosc2.DParams()), - (None, None, "r", None, blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=6, typesize=4), blosc2.DParams(nthreads=4)), - (None, True, "a", None, blosc2.CParams(splitmode=blosc2.SplitMode.ALWAYS_SPLIT, nthreads=5, typesize=4), {}), - ("b2frame", None, "r", None, {"codec": blosc2.Codec.LZ4HC, "typesize": 4}, blosc2.DParams()), - ("b2frame", False, "a", None, blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=6, typesize=4), blosc2.DParams(nthreads=4)), - ("b2frame", True, "w", None, blosc2.CParams(splitmode=blosc2.SplitMode.ALWAYS_SPLIT, nthreads=5, typesize=4), {}), - ("b2frame", True, "r", "r", blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=6, typesize=4), blosc2.DParams()), - ("b2frame", None, "w", "w+", {"typesize": 4}, {}), + (None, None, "w", None), + (None, False, "a", None), + 
(None, None, "r", None), + (None, True, "a", None), + ("b2frame", None, "r", None), + ("b2frame", False, "a", None), + ("b2frame", True, "w", None), + ("b2frame", True, "r", "r"), + ("b2frame", None, "w", "w+"), ], ) -def test_storage_values(contiguous, urlpath, mode, mmap_mode, cparams, dparams): - storage = blosc2.Storage(contiguous=contiguous, urlpath=urlpath, mode=mode, mmap_mode=mmap_mode, - cparams=cparams, dparams=dparams) +def test_storage_values(contiguous, urlpath, mode, mmap_mode): + storage = blosc2.Storage(contiguous=contiguous, urlpath=urlpath, mode=mode, mmap_mode=mmap_mode) if contiguous is None: if urlpath is not None: assert storage.contiguous @@ -41,8 +40,6 @@ def test_storage_values(contiguous, urlpath, mode, mmap_mode, cparams, dparams): assert storage.urlpath == urlpath assert storage.mode == mode assert storage.mmap_mode == mmap_mode - assert storage.cparams == cparams - assert storage.dparams == dparams def test_storage_defaults(): @@ -52,23 +49,20 @@ def test_storage_defaults(): assert storage.mode == "a" assert storage.mmap_mode is None assert storage.initial_mapping_size is None - assert storage.cparams == blosc2.CParams() - assert storage.dparams == blosc2.DParams() assert storage.meta is None @pytest.mark.parametrize( - "urlpath, contiguous, cparams, dparams", + "urlpath, contiguous", [ - (None, False, blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=6, typesize=4), blosc2.DParams()), - (None, True, {"typesize": 4}, blosc2.DParams(nthreads=4)), - ("b2frame", False, blosc2.CParams(splitmode=blosc2.SplitMode.ALWAYS_SPLIT, nthreads=5, typesize=4), {}), - ("b2frame", True, {"codec": blosc2.Codec.LZ4HC, "typesize": 4}, {}), + (None, False), + (None, True), + ("b2frame", False), + ("b2frame", True), ], ) -def test_raises_storage(contiguous, urlpath, cparams, dparams): - storage = blosc2.Storage(contiguous=contiguous, urlpath=urlpath, - cparams=cparams, dparams=dparams) +def test_raises_storage(contiguous, urlpath): + storage = 
blosc2.Storage(contiguous=contiguous, urlpath=urlpath) blosc2.remove_urlpath(urlpath) for field in fields(blosc2.Storage): diff --git a/tests/test_vlmeta.py b/tests/test_vlmeta.py index ddbe5267..9155cc1f 100644 --- a/tests/test_vlmeta.py +++ b/tests/test_vlmeta.py @@ -21,10 +21,10 @@ ], ) def test_schunk_numpy(contiguous, urlpath, cparams, dparams, nchunks): - storage = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} + kwargs = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} blosc2.remove_urlpath(urlpath) - schunk = blosc2.SChunk(chunksize=200 * 1000 * 4, **storage) + schunk = blosc2.SChunk(chunksize=200 * 1000 * 4, **kwargs) for i in range(nchunks): buffer = i * np.arange(200 * 1000, dtype="int32") nchunks_ = schunk.append_data(buffer) @@ -47,10 +47,10 @@ def test_schunk_numpy(contiguous, urlpath, cparams, dparams, nchunks): ], ) def test_schunk(contiguous, urlpath, nbytes, cparams, dparams, nchunks): - storage = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} + kwargs = {"contiguous": contiguous, "urlpath": urlpath, "cparams": cparams, "dparams": dparams} blosc2.remove_urlpath(urlpath) - schunk = blosc2.SChunk(chunksize=2 * nbytes, **storage) + schunk = blosc2.SChunk(chunksize=2 * nbytes, **kwargs) for i in range(nchunks): bytes_obj = b"i " * nbytes nchunks_ = schunk.append_data(bytes_obj) From 9ecb2e01fe0f47a510b5d5366e7bcd9f066d0c52 Mon Sep 17 00:00:00 2001 From: Marta Iborra Date: Mon, 23 Sep 2024 14:00:11 +0200 Subject: [PATCH 13/18] Further test dataclasses with ndarrays --- tests/test_storage.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/tests/test_storage.py b/tests/test_storage.py index 2f63a92c..e320da25 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -7,6 +7,7 @@ ####################################################################### from dataclasses import asdict, fields 
+import numpy as np import pytest @@ -71,6 +72,11 @@ def test_raises_storage(contiguous, urlpath): with pytest.raises(TypeError): _ = blosc2.SChunk(**{str(field.name): {}}, **asdict(storage)) + with pytest.raises(AttributeError): + _ = blosc2.empty((30, 30), storage=storage, **{str(field.name): {}}) + with pytest.raises(TypeError): + _ = blosc2.empty((30, 30), **{str(field.name): {}}, **asdict(storage)) + @pytest.mark.parametrize( "cparams", @@ -84,13 +90,23 @@ def test_raises_storage(contiguous, urlpath): def test_cparams_values(cparams): schunk = blosc2.SChunk(cparams=cparams) cparams_dataclass = cparams if isinstance(cparams, blosc2.CParams) else blosc2.CParams(**cparams) - for field in fields(cparams_dataclass): if field.name in ['filters', 'filters_meta']: assert getattr(schunk.cparams, field.name)[:len(getattr(cparams_dataclass, field.name))] == getattr(cparams_dataclass, field.name) else: assert getattr(schunk.cparams, field.name) == getattr(cparams_dataclass, field.name) + array = blosc2.empty((30, 30), np.int32, cparams=cparams) + for field in fields(cparams_dataclass): + print(field.name) + if field.name in ['filters', 'filters_meta']: + print(getattr(array.schunk.cparams, field.name)) + assert getattr(array.schunk.cparams, field.name)[:len(getattr(cparams_dataclass, field.name))] == getattr(cparams_dataclass, field.name) + elif field.name == 'typesize': + assert getattr(array.schunk.cparams, field.name) == array.dtype.itemsize + elif field.name != 'blocksize': + assert getattr(array.schunk.cparams, field.name) == getattr(cparams_dataclass, field.name) + def test_cparams_defaults(): cparams = blosc2.CParams() @@ -114,6 +130,8 @@ def test_raises_cparams(): _ = blosc2.SChunk(cparams=cparams, **{str(field.name): {}}) with pytest.raises(AttributeError): _ = blosc2.compress2(b"12345678" * 1000, cparams=cparams, **{str(field.name): {}}) + with pytest.raises(KeyError): + _ = blosc2.empty((10, 10), cparams=cparams, **{str(field.name): {}}) 
@pytest.mark.parametrize( @@ -128,9 +146,10 @@ def test_raises_cparams(): def test_dparams_values(dparams): schunk = blosc2.SChunk(dparams=dparams) dparams_dataclass = dparams if isinstance(dparams, blosc2.DParams) else blosc2.DParams(**dparams) - + array = blosc2.empty((30, 30), dparams=dparams) for field in fields(dparams_dataclass): assert getattr(schunk.dparams, field.name) == getattr(dparams_dataclass, field.name) + assert getattr(array.schunk.dparams, field.name) == getattr(dparams_dataclass, field.name) def test_dparams_defaults(): From 5a8c269a2e054ff983c6a2183d1f0cd425140c8c Mon Sep 17 00:00:00 2001 From: Marta Iborra Date: Tue, 24 Sep 2024 11:40:50 +0200 Subject: [PATCH 14/18] Always use blosc2.nthreads as default value --- src/blosc2/__init__.py | 21 +++++++++++---------- src/blosc2/blosc2_ext.pyx | 4 ++-- src/blosc2/core.py | 3 ++- src/blosc2/storage.py | 13 +------------ tests/test_storage.py | 24 ++++++++++++++++++++++-- 5 files changed, 38 insertions(+), 27 deletions(-) diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index 0672bb35..24b6f96e 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -188,15 +188,23 @@ class Tuner(Enum): unpack_tensor, ) +# Internal Blosc threading +# Get CPU info +cpu_info = get_cpu_info() +nthreads = ncores = cpu_info.get("count", 1) +"""Number of threads to be used in compression/decompression. +""" +# Protection against too many threads +nthreads = min(nthreads, 32) +# Experiments say that, when using a large number of threads, it is better to not use them all +nthreads -= nthreads // 8 + # This import must be before ndarray and schunk from .storage import ( CParams, cparams_dflts, - cpu_info, DParams, dparams_dflts, - ncores, - nthreads, Storage, storage_dflts, ) @@ -254,13 +262,6 @@ class Tuner(Enum): The blosc2 version + date. 
""" -# Internal Blosc threading -set_nthreads(nthreads) - -# Set the number of threads for NumExpr -numexpr.set_num_threads(nthreads) - -_disable_overloaded_equal = False # Delayed imports for avoiding overwriting of python builtins from .ndarray import ( diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx index b44b9284..6358186c 100644 --- a/src/blosc2/blosc2_ext.pyx +++ b/src/blosc2/blosc2_ext.pyx @@ -726,7 +726,7 @@ cdef create_cparams_from_kwargs(blosc2_cparams *cparams, kwargs): cparams.clevel = kwargs.get('clevel', blosc2.cparams_dflts['clevel']) cparams.use_dict = kwargs.get('use_dict', blosc2.cparams_dflts['use_dict']) cparams.typesize = typesize = kwargs.get('typesize', blosc2.cparams_dflts['typesize']) - cparams.nthreads = kwargs.get('nthreads', blosc2.cparams_dflts['nthreads']) + cparams.nthreads = kwargs.get('nthreads', blosc2.nthreads) cparams.blocksize = kwargs.get('blocksize', blosc2.cparams_dflts['blocksize']) splitmode = kwargs.get('splitmode', blosc2.cparams_dflts['splitmode']) cparams.splitmode = splitmode.value @@ -807,7 +807,7 @@ def compress2(src, **kwargs): return dest[:size] cdef create_dparams_from_kwargs(blosc2_dparams *dparams, kwargs, blosc2_cparams* cparams=NULL): - dparams.nthreads = kwargs.get('nthreads', blosc2.dparams_dflts['nthreads']) + dparams.nthreads = kwargs.get('nthreads', blosc2.nthreads) dparams.schunk = NULL dparams.postfilter = NULL dparams.postparams = NULL diff --git a/src/blosc2/core.py b/src/blosc2/core.py index 2f5ab25b..7eb2da54 100644 --- a/src/blosc2/core.py +++ b/src/blosc2/core.py @@ -894,8 +894,9 @@ def set_nthreads(nthreads: int) -> int: -------- :attr:`~blosc2.nthreads` """ + rc = blosc2_ext.set_nthreads(nthreads) blosc2.nthreads = nthreads - return blosc2_ext.set_nthreads(nthreads) + return rc def compressor_list(plugins: bool = False) -> list: diff --git a/src/blosc2/storage.py b/src/blosc2/storage.py index 6d1ba296..9abf126b 100644 --- a/src/blosc2/storage.py +++ b/src/blosc2/storage.py @@ 
-11,20 +11,9 @@ import blosc2 -# Internal Blosc threading -# Get CPU info -cpu_info = blosc2.get_cpu_info() -nthreads = ncores = cpu_info.get("count", 1) -"""Number of threads to be used in compression/decompression. -""" -# Protection against too many threads -nthreads = min(nthreads, 32) -# Experiments say that, when using a large number of threads, it is better to not use them all -nthreads -= nthreads // 8 - def default_nthreads(): - return nthreads + return blosc2.nthreads def default_filters(): return [blosc2.Filter.NOFILTER, diff --git a/tests/test_storage.py b/tests/test_storage.py index e320da25..93b64158 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -98,15 +98,21 @@ def test_cparams_values(cparams): array = blosc2.empty((30, 30), np.int32, cparams=cparams) for field in fields(cparams_dataclass): - print(field.name) if field.name in ['filters', 'filters_meta']: - print(getattr(array.schunk.cparams, field.name)) assert getattr(array.schunk.cparams, field.name)[:len(getattr(cparams_dataclass, field.name))] == getattr(cparams_dataclass, field.name) elif field.name == 'typesize': assert getattr(array.schunk.cparams, field.name) == array.dtype.itemsize elif field.name != 'blocksize': assert getattr(array.schunk.cparams, field.name) == getattr(cparams_dataclass, field.name) + blosc2.set_nthreads(10) + schunk = blosc2.SChunk(cparams=cparams) + cparams_dataclass = cparams if isinstance(cparams, blosc2.CParams) else blosc2.CParams(**cparams) + assert schunk.cparams.nthreads == cparams_dataclass.nthreads + + array = blosc2.empty((30, 30), np.int32, cparams=cparams) + assert array.schunk.cparams.nthreads == cparams_dataclass.nthreads + def test_cparams_defaults(): cparams = blosc2.CParams() @@ -122,6 +128,10 @@ def test_cparams_defaults(): assert cparams.blocksize == 0 assert cparams.tuner == blosc2.Tuner.STUNE + blosc2.set_nthreads(1) + cparams = blosc2.CParams() + assert cparams.nthreads == blosc2.nthreads + def test_raises_cparams(): cparams = 
blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=6, typesize=4) @@ -151,11 +161,21 @@ def test_dparams_values(dparams): assert getattr(schunk.dparams, field.name) == getattr(dparams_dataclass, field.name) assert getattr(array.schunk.dparams, field.name) == getattr(dparams_dataclass, field.name) + blosc2.set_nthreads(3) + schunk = blosc2.SChunk(dparams=dparams) + dparams_dataclass = dparams if isinstance(dparams, blosc2.DParams) else blosc2.DParams(**dparams) + array = blosc2.empty((30, 30), dparams=dparams) + assert schunk.dparams.nthreads == dparams_dataclass.nthreads + assert array.schunk.dparams.nthreads == dparams_dataclass.nthreads def test_dparams_defaults(): dparams = blosc2.DParams() assert dparams.nthreads == blosc2.nthreads + blosc2.set_nthreads(1) + dparams = blosc2.DParams() + assert dparams.nthreads == blosc2.nthreads + def test_raises_dparams(): dparams = blosc2.DParams() From 45371615ff414afeec4a9df2d1967d71b2925126 Mon Sep 17 00:00:00 2001 From: Marta Iborra Date: Tue, 24 Sep 2024 11:59:59 +0200 Subject: [PATCH 15/18] Further testing --- tests/ndarray/test_lazyexpr.py | 12 ++++++------ tests/ndarray/test_lossy.py | 2 +- tests/ndarray/test_metalayers.py | 5 ++--- tests/ndarray/test_mode.py | 11 ++++------- tests/ndarray/test_ndarray.py | 6 +++--- tests/ndarray/test_reductions.py | 2 +- 6 files changed, 17 insertions(+), 21 deletions(-) diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py index 825cc463..69fd0022 100644 --- a/tests/ndarray/test_lazyexpr.py +++ b/tests/ndarray/test_lazyexpr.py @@ -160,7 +160,7 @@ def test_simple_expression(array_fixture): a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture expr = a1 + a2 - a3 * a4 nres = ne.evaluate("na1 + na2 - na3 * na4") - res = expr.eval() + res = expr.eval(cparams=blosc2.CParams()) np.testing.assert_allclose(res[:], nres) @@ -171,7 +171,7 @@ def test_proxy_simple_expression(array_fixture): a3 = blosc2.Proxy(a3) expr = a1 + a2 - a3 * a4 nres = ne.evaluate("na1 + na2 - na3 * 
na4") - res = expr.eval() + res = expr.eval(storage=blosc2.Storage()) np.testing.assert_allclose(res[:], nres) @@ -221,7 +221,7 @@ def test_func_expression(array_fixture): expr = (a1 + a2) * a3 - a4 expr = blosc2.sin(expr) + blosc2.cos(expr) nres = ne.evaluate("sin((na1 + na2) * na3 - na4) + cos((na1 + na2) * na3 - na4)") - res = expr.eval() + res = expr.eval(storage={}) np.testing.assert_allclose(res[:], nres) @@ -250,7 +250,7 @@ def test_comparison_operators(dtype_fixture, compare_expressions, comparison_ope else: expr = eval(f"a1 {comparison_operator} a2", {"a1": a1, "a2": a2}) expr_string = f"na1 {comparison_operator} na2" - res_lazyexpr = expr.eval() + res_lazyexpr = expr.eval(dparams={}) # Evaluate using NumExpr res_numexpr = ne.evaluate(expr_string) # Compare the results @@ -290,7 +290,7 @@ def test_functions(function, dtype_fixture, shape_fixture): a1 = blosc2.asarray(na1, cparams=cparams) # Construct the lazy expression based on the function name expr = blosc2.LazyExpr(new_op=(a1, function, None)) - res_lazyexpr = expr.eval() + res_lazyexpr = expr.eval(cparams={}) # Evaluate using NumExpr expr_string = f"{function}(na1)" res_numexpr = ne.evaluate(expr_string) @@ -384,7 +384,7 @@ def test_abs(shape_fixture, dtype_fixture): na1 = np.linspace(-1, 1, nelems, dtype=dtype_fixture).reshape(shape_fixture) a1 = blosc2.asarray(na1) expr = blosc2.LazyExpr(new_op=(a1, "abs", None)) - res_lazyexpr = expr.eval() + res_lazyexpr = expr.eval(dparams={}) res_np = np.abs(na1) np.testing.assert_allclose(res_lazyexpr[:], res_np) diff --git a/tests/ndarray/test_lossy.py b/tests/ndarray/test_lossy.py index 8e110017..7b646aae 100644 --- a/tests/ndarray/test_lossy.py +++ b/tests/ndarray/test_lossy.py @@ -18,7 +18,7 @@ ( (32, 18), np.float32, - {"codec": blosc2.Codec.NDLZ, "codec_meta": 4}, + blosc2.CParams(codec=blosc2.Codec.NDLZ, codec_meta=4), None, False, ), diff --git a/tests/ndarray/test_metalayers.py b/tests/ndarray/test_metalayers.py index 4a7aae43..68c6ab48 100644 --- 
a/tests/ndarray/test_metalayers.py +++ b/tests/ndarray/test_metalayers.py @@ -41,9 +41,8 @@ def test_metalayers(shape, chunks, blocks, urlpath, contiguous, dtype): chunks=chunks, blocks=blocks, dtype=dtype, - urlpath=urlpath, - contiguous=contiguous, - meta={"numpy": numpy_meta, "test": test_meta}, + storage=blosc2.Storage(urlpath=urlpath, contiguous=contiguous, + meta={"numpy": numpy_meta, "test": test_meta}), ) assert os.path.exists(urlpath) diff --git a/tests/ndarray/test_mode.py b/tests/ndarray/test_mode.py index 78be1478..3249fac1 100644 --- a/tests/ndarray/test_mode.py +++ b/tests/ndarray/test_mode.py @@ -22,14 +22,14 @@ 3.14, np.float64, {"codec": blosc2.Codec.ZLIB, "clevel": 5, "use_dict": False, "nthreads": 2}, - {"nthreads": 1}, + blosc2.DParams(nthreads=1), False, ), ( (13, 13), 123456789, None, - {"codec": blosc2.Codec.LZ4HC, "clevel": 8, "use_dict": False, "nthreads": 2}, + blosc2.CParams(codec=blosc2.Codec.LZ4HC, clevel=8, use_dict=False, nthreads=2), {"nthreads": 2}, True, ), @@ -45,9 +45,7 @@ def test_mode(shape, fill_value, cparams, dparams, dtype, urlpath, contiguous, m dtype=dtype, cparams=cparams, dparams=dparams, - urlpath=urlpath, - contiguous=contiguous, - mode=mode, + storage={"urlpath": urlpath, "contiguous": contiguous, "mode": mode}, ) _ = blosc2.full( shape, @@ -55,8 +53,7 @@ def test_mode(shape, fill_value, cparams, dparams, dtype, urlpath, contiguous, m dtype=dtype, cparams=cparams, dparams=dparams, - urlpath=urlpath, - contiguous=contiguous, + storage={"urlpath": urlpath, "contiguous": contiguous}, ) a = blosc2.open(urlpath, mode=mode) diff --git a/tests/ndarray/test_ndarray.py b/tests/ndarray/test_ndarray.py index f6564f1d..17d02938 100644 --- a/tests/ndarray/test_ndarray.py +++ b/tests/ndarray/test_ndarray.py @@ -17,10 +17,10 @@ @pytest.mark.parametrize( "cparams, dparams, nchunks", [ - ({"codec": blosc2.Codec.LZ4, "clevel": 6, "typesize": 4}, {}, 1), + (blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=6, typesize=4), blosc2.DParams(), 
1), ({"typesize": 4}, {"nthreads": 4}, 1), - ({"splitmode": blosc2.SplitMode.ALWAYS_SPLIT, "typesize": 4}, {}, 5), - ({"codec": blosc2.Codec.LZ4HC, "typesize": 4}, {}, 10), + ({"splitmode": blosc2.SplitMode.ALWAYS_SPLIT, "typesize": 4}, blosc2.DParams(), 5), + (blosc2.CParams(codec=blosc2.Codec.LZ4HC, typesize=4), {}, 10), ], ) @pytest.mark.parametrize("copy", [True, False]) diff --git a/tests/ndarray/test_reductions.py b/tests/ndarray/test_reductions.py index 7b456fbd..496713c6 100644 --- a/tests/ndarray/test_reductions.py +++ b/tests/ndarray/test_reductions.py @@ -65,7 +65,7 @@ def test_reduce_bool(array_fixture, reduce_op): @pytest.mark.parametrize("axis", [0, 1, (0, 1), None]) @pytest.mark.parametrize("keepdims", [True, False]) @pytest.mark.parametrize("dtype_out", [np.int16, np.float64]) -@pytest.mark.parametrize("kwargs", [{}, {"cparams": dict(clevel=1, filters=[blosc2.Filter.BITSHUFFLE], filters_meta=[0])}]) +@pytest.mark.parametrize("kwargs", [{}, {"cparams": blosc2.CParams(clevel=1, filters=[blosc2.Filter.BITSHUFFLE], filters_meta=[0])}]) def test_reduce_params(array_fixture, axis, keepdims, dtype_out, reduce_op, kwargs): a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture if axis is not None and np.isscalar(axis) and len(a1.shape) >= axis: From 5fce6315bf38ffd3125a52afe5187005a463a8a9 Mon Sep 17 00:00:00 2001 From: Marta Iborra Date: Tue, 24 Sep 2024 12:35:24 +0200 Subject: [PATCH 16/18] Do some fixes --- src/blosc2/__init__.py | 1 - src/blosc2/blosc2_ext.pyx | 1 - src/blosc2/core.py | 18 ++++++++++++------ src/blosc2/schunk.py | 2 +- src/blosc2/storage.py | 10 ++++++---- tests/ndarray/test_lossy.py | 4 +++- tests/test_compress2.py | 12 ++++++------ 7 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index 24b6f96e..a3197573 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -234,7 +234,6 @@ class Tuner(Enum): from .schunk import SChunk, open - # Registry for postfilters 
postfilter_funcs = {} """ diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx index 6358186c..fb5c1d16 100644 --- a/src/blosc2/blosc2_ext.pyx +++ b/src/blosc2/blosc2_ext.pyx @@ -33,7 +33,6 @@ from enum import Enum import numpy as np from msgpack import packb, unpackb -from dataclasses import asdict import blosc2 diff --git a/src/blosc2/core.py b/src/blosc2/core.py index 7eb2da54..3544e237 100644 --- a/src/blosc2/core.py +++ b/src/blosc2/core.py @@ -1398,9 +1398,9 @@ def compress2(src: object, **kwargs: dict) -> str | bytes: Compression parameters. The default values are in :class:`blosc2.CParams`. Keyword arguments supported: - cparams: :class:`blosc2.CParams` + cparams: :class:`blosc2.CParams` or dict All the compression parameters that you want to use as - a :class:`blosc2.CParams` instance. + a :class:`blosc2.CParams` or dict instance. others: Any If `cparams` is not passed, all the parameters of a :class:`blosc2.CParams` can be passed as keyword arguments. @@ -1421,7 +1421,10 @@ def compress2(src: object, **kwargs: dict) -> str | bytes: if 'cparams' in kwargs: if len(kwargs) > 1: raise AttributeError("Cannot pass both cparams and other kwargs already included in CParams") - kwargs = asdict(kwargs.get('cparams')) + if isinstance(kwargs.get('cparams'), blosc2.CParams): + kwargs = asdict(kwargs.get('cparams')) + else: + kwargs = kwargs.get('cparams') return blosc2_ext.compress2(src, **kwargs) @@ -1448,9 +1451,9 @@ def decompress2(src: object, dst: object | bytearray = None, **kwargs: dict) -> Decompression parameters. The default values are in :class:`blosc2.DParams`. Keyword arguments supported: - dparams: :class:`blosc2.DParams` + dparams: :class:`blosc2.DParams` or dict All the decompression parameters that you want to use as - a :class:`blosc2.DParams` instance. + a :class:`blosc2.DParams` or dict instance. others: Any If `dparams` is not passed, all the parameters of a :class:`blosc2.DParams` can be passed as keyword arguments. 
@@ -1479,7 +1482,10 @@ def decompress2(src: object, dst: object | bytearray = None, **kwargs: dict) -> if 'dparams' in kwargs: if len(kwargs) > 1: raise AttributeError("Cannot pass both dparams and other kwargs already included in DParams") - kwargs = asdict(kwargs.get('dparams')) + if isinstance(kwargs.get('dparams'), blosc2.DParams): + kwargs = asdict(kwargs.get('dparams')) + else: + kwargs = kwargs.get('dparams') return blosc2_ext.decompress2(src, dst, **kwargs) diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index 1273ea5e..91ee36fe 100644 --- a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -159,7 +159,7 @@ def __init__(self, chunksize: int = None, data: object = None, **kwargs: dict): kwargs: dict, optional Storage parameters. The default values are in :class:`blosc2.Storage`. Keyword arguments supported: - storage: :class:`blosc2.Storage` + storage: :class:`blosc2.Storage` or dict All the storage parameters that you want to use as a :class:`blosc2.Storage` or dict instance. cparams: :class:`blosc2.CParams` or dict diff --git a/src/blosc2/storage.py b/src/blosc2/storage.py index 9abf126b..759b297e 100644 --- a/src/blosc2/storage.py +++ b/src/blosc2/storage.py @@ -46,8 +46,9 @@ class CParams: typesize: int from 1 to 255 The data type size. Default: 8. nthreads: int - The number of threads to use internally. By default, blosc2 computes - a good guess. + The number of threads to use internally. By default, the + value of :py:obj:`blosc2.nthreads` is used. If not set with + :func:`blosc2.set_nthreads`, blosc2 computes a good guess for it. blocksize: int The requested size of the compressed blocks. If 0 (the default) blosc2 chooses it automatically. @@ -96,8 +97,9 @@ class DParams: Parameters ---------- nthreads: int - The number of threads to use internally. By default, blosc2 computes - a good guess. + The number of threads to use internally. By default, the + value of :py:obj:`blosc2.nthreads` is used. 
If not set with + :func:`blosc2.set_nthreads`, blosc2 computes a good guess for it. """ nthreads: int = field(default_factory=default_nthreads) diff --git a/tests/ndarray/test_lossy.py b/tests/ndarray/test_lossy.py index 7b646aae..b0a21fd4 100644 --- a/tests/ndarray/test_lossy.py +++ b/tests/ndarray/test_lossy.py @@ -9,6 +9,7 @@ import numpy as np import pytest +from dataclasses import asdict import blosc2 @@ -60,7 +61,8 @@ ], ) def test_lossy(shape, cparams, dtype, urlpath, contiguous): - if cparams.get("codec") == blosc2.Codec.NDLZ: + cparams_dict = cparams if isinstance(cparams, dict) else asdict(cparams) + if cparams_dict.get("codec") == blosc2.Codec.NDLZ: dtype = np.uint8 array = np.linspace(0, np.prod(shape), np.prod(shape), dtype=dtype).reshape(shape) a = blosc2.asarray(array, cparams=cparams, urlpath=urlpath, contiguous=contiguous, mode="w") diff --git a/tests/test_compress2.py b/tests/test_compress2.py index 257b8d5b..af5e6f92 100644 --- a/tests/test_compress2.py +++ b/tests/test_compress2.py @@ -33,11 +33,11 @@ ( np.arange(10, dtype="float32"), # Do a reduction of precision of 10 bits in mantissa - { - "filters": [blosc2.Filter.TRUNC_PREC, blosc2.Filter.BITSHUFFLE], - "filters_meta": [-10, 0], - "typesize": 4, - }, + {"cparams": {"filters": [blosc2.Filter.TRUNC_PREC, blosc2.Filter.BITSHUFFLE], + "filters_meta": [-10, 0], + "typesize": 4, + }, + }, {"nthreads": 4}, ), ( @@ -46,7 +46,7 @@ {'dparams': blosc2.DParams()}, ), (np.arange(45, dtype=np.float64), {'cparams': blosc2.CParams(codec=blosc2.Codec.LZ4HC, typesize=4)}, {}), - (np.arange(50, dtype=np.int64), {"typesize": 4}, blosc2.dparams_dflts), + (np.arange(50, dtype=np.int64), {"typesize": 4}, {"dparams": blosc2.dparams_dflts}), ], ) def test_compress2_numpy(obj, cparams, dparams, gil): From 3a43d2e189b8af621377237d9fb233f8f8d7dd76 Mon Sep 17 00:00:00 2001 From: oumaima-ech-chdig Date: Wed, 18 Sep 2024 13:37:17 +0200 Subject: [PATCH 17/18] Proxy examples --- src/blosc2/proxy.py | 165 
+++++++++----------------------------------- 1 file changed, 33 insertions(+), 132 deletions(-) diff --git a/src/blosc2/proxy.py b/src/blosc2/proxy.py index 43c6cada..2850cb86 100644 --- a/src/blosc2/proxy.py +++ b/src/blosc2/proxy.py @@ -11,119 +11,28 @@ import numpy as np -class ProxyNDSource(ABC): - """ - Base interface for NDim sources in :ref:`Proxy`. - """ - - @property - @abstractmethod - def shape(self) -> tuple: - """ - The shape of the source. - """ - pass - - @property - @abstractmethod - def chunks(self) -> tuple: - """ - The chunk shape of the source. - """ - pass - - @property - @abstractmethod - def blocks(self) -> tuple: - """ - The block shape of the source. - """ - pass - - @property - @abstractmethod - def dtype(self) -> np.dtype: - """ - The dtype of the source. - """ - pass - - @abstractmethod - def get_chunk(self, nchunk: int) -> bytes: - """ - Return the compressed chunk in :paramref:`self`. - - Parameters - ---------- - nchunk: int - The unidimensional index of the chunk to retrieve. - - Returns - ------- - out: bytes object - The compressed chunk. - """ - pass - - def aget_chunk(self, nchunk: int) -> bytes: - """ - Return the compressed chunk in :paramref:`self` in an asynchronous way. - - Parameters - ---------- - nchunk: int - The index of the chunk to retrieve. - - Returns - ------- - out: bytes object - The compressed chunk. - - Notes - ----- - This method is optional, and only available if the source has an async `aget_chunk` method. - """ - raise NotImplementedError("aget_chunk is only available if the source has an aget_chunk method") - - class ProxySource(ABC): """ - Base interface for sources of :ref:`Proxy` that are not NDim objects. - """ + Base interface for all supported sources in :ref:`Proxy`. - @property - @abstractmethod - def nbytes(self) -> int: - """ - The total number of bytes in the source. - """ - pass - - @property - @abstractmethod - def chunksize(self) -> tuple: - """ - The chunksize of the source. 
- """ - pass + In case the source is multidimensional, the attributes `shape`, `chunks`, + `blocks` and `dtype` are also required when creating the :ref:`Proxy`. - @property - @abstractmethod - def typesize(self) -> int: - """ - The typesize of the source. - """ - pass + In case the source is unidimensional, the attributes `chunksize`, `typesize` + and `nbytes` are required as well when creating the :ref:`Proxy`. + These attributes do not need to be available when opening an already + existing :ref:`Proxy`. + """ @abstractmethod - def get_chunk(self, nchunk: int) -> bytes: + def get_chunk(self, nchunk): """ Return the compressed chunk in :paramref:`self`. Parameters ---------- nchunk: int - The index of the chunk to retrieve. + The unidimensional index of the chunk to retrieve. Returns ------- @@ -132,42 +41,23 @@ def get_chunk(self, nchunk: int) -> bytes: """ pass - def aget_chunk(self, nchunk: int) -> bytes: - """ - Return the compressed chunk in :paramref:`self` in an asynchronous way. - - Parameters - ---------- - nchunk: int - The index of the chunk to retrieve. - - Returns - ------- - out: bytes object - The compressed chunk. - - Notes - ----- - This method is optional, and only available if the source has an async `aget_chunk` method. - """ - raise NotImplementedError("aget_chunk is only available if the source has an aget_chunk method") - class Proxy(blosc2.Operand): """Proxy (with cache support) of an object following the :ref:`ProxySource` interface. - This can be used to cache chunks of a regular data container which follows the - :ref:`ProxySource` or :ref:`ProxyNDSource` interfaces. + This can be used to cache chunks of a regular data container + which follows the :ref:`ProxySource` interface in an urlpath. """ - def __init__(self, src: ProxySource or ProxyNDSource, urlpath: str = None, **kwargs: dict): + def __init__(self, src, urlpath=None, **kwargs): """ - Create a new :ref:`Proxy` to serve like a cache to save accessed chunks locally. 
+ Create a new :ref:`Proxy` to serve like a cache to save accessed + chunks locally. Parameters ---------- - src: :ref:`ProxySource` or :ref:`ProxyNDSource` - The original container. + src: :ref:`ProxySource` + The original container urlpath: str, optional The urlpath where to save the container that will work as a cache. @@ -178,6 +68,7 @@ def __init__(self, src: ProxySource or ProxyNDSource, urlpath: str = None, **kwa vlmeta: dict or None A dictionary with different variable length metalayers. One entry per metalayer: + key: bytes or str The name of the metalayer. value: object @@ -225,7 +116,7 @@ def __init__(self, src: ProxySource or ProxyNDSource, urlpath: str = None, **kwa for key in vlmeta: self._schunk_cache.vlmeta[key] = vlmeta[key] - def fetch(self, item: slice | list[slice] = None) -> blosc2.NDArray | blosc2.schunk.SChunk: + def fetch(self, item=None): """ Get the container used as cache with the requested data updated. @@ -270,7 +161,7 @@ def fetch(self, item: slice | list[slice] = None) -> blosc2.NDArray | blosc2.sch return self._cache - async def afetch(self, item: slice | list[slice] = None) -> blosc2.NDArray | blosc2.schunk.SChunk: + async def afetch(self, item=None): """ Get the container used as cache with the requested data updated in an asynchronous way. @@ -372,7 +263,7 @@ async def afetch(self, item: slice | list[slice] = None) -> blosc2.NDArray | blo return self._cache - def __getitem__(self, item: slice | list[slice]) -> np.ndarray: + def __getitem__(self, item): """ Get a slice as a numpy.ndarray using the :ref:`Proxy`. @@ -426,13 +317,23 @@ def __str__(self): return f"Proxy({self.src}, urlpath={self.urlpath})" @property - def vlmeta(self) -> blosc2.schunk.vlmeta: + def vlmeta(self): """ Get the vlmeta of the cache. 
See Also -------- :ref:`SChunk.vlmeta` + + Examples + -------- + >>> import numpy as np + >>> import blosc2 + >>> data = np.arange(100).reshape(10, 10) + >>> ndarray = blosc2.asarray(data) + >>> proxy = blosc2.Proxy(ndarray) + >>> f"VLMeta data: {proxy.vlmeta}" + VLMeta data: """ return self._schunk_cache.vlmeta @@ -483,7 +384,7 @@ def __init__(self, proxy: Proxy, field: str): self.shape = proxy.shape self.dtype = proxy.dtype - def __getitem__(self, item: slice | list[slice]) -> np.ndarray: + def __getitem__(self, item: slice): """ Get a slice as a numpy.ndarray using the `field` in `proxy`. From eff3865c6125d52c627456083b449d4dbf9ee0d0 Mon Sep 17 00:00:00 2001 From: oumaima-ech-chdig Date: Mon, 23 Sep 2024 13:57:25 +0200 Subject: [PATCH 18/18] Proxy examples: second corrections. --- src/blosc2/proxy.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/blosc2/proxy.py b/src/blosc2/proxy.py index 2850cb86..1d01c0e0 100644 --- a/src/blosc2/proxy.py +++ b/src/blosc2/proxy.py @@ -25,7 +25,7 @@ class ProxySource(ABC): """ @abstractmethod - def get_chunk(self, nchunk): + def get_chunk(self, nchunk: int) -> bytes: """ Return the compressed chunk in :paramref:`self`. @@ -116,7 +116,7 @@ def __init__(self, src, urlpath=None, **kwargs): for key in vlmeta: self._schunk_cache.vlmeta[key] = vlmeta[key] - def fetch(self, item=None): + def fetch(self, item: slice | list[slice] = None) -> blosc2.NDArray | blosc2.schunk.SChunk: """ Get the container used as cache with the requested data updated. @@ -161,7 +161,7 @@ def fetch(self, item=None): return self._cache - async def afetch(self, item=None): + async def afetch(self, item: slice | list[slice] = None) -> blosc2.NDArray | blosc2.schunk.SChunk: """ Get the container used as cache with the requested data updated in an asynchronous way. 
@@ -263,7 +263,7 @@ async def afetch(self, item=None): return self._cache - def __getitem__(self, item): + def __getitem__(self, item: slice | list[slice]) -> np.ndarray: """ Get a slice as a numpy.ndarray using the :ref:`Proxy`. @@ -317,7 +317,7 @@ def __str__(self): return f"Proxy({self.src}, urlpath={self.urlpath})" @property - def vlmeta(self): + def vlmeta(self) -> blosc2.schunk.vlmeta: """ Get the vlmeta of the cache. @@ -384,7 +384,7 @@ def __init__(self, proxy: Proxy, field: str): self.shape = proxy.shape self.dtype = proxy.dtype - def __getitem__(self, item: slice): + def __getitem__(self, item: slice | list[slice]) -> np.ndarray: """ Get a slice as a numpy.ndarray using the `field` in `proxy`.