Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Working with ProxySource: examples. #259

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions src/blosc2/c2array.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,24 @@ def __init__(self, path: str, /, urlbase: str = None, auth_token: str = None):
-------
out: C2Array

Examples
--------
>>> import blosc2
>>> import pathlib
>>> host = "https://demo.caterva2.net/"
>>> root = "b2tests"
>>> dir = "expr/"
>>> name = "ds-0-10-linspace-float64-(True, True)-a1-(60, 60)d.b2nd"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd use regular examples in example root.

>>> path = pathlib.Path(f"{root}/{dir + name}").as_posix()
>>> remote_array = blosc2.C2Array(path, urlbase=host)
>>> f"Shape of the remote array: {remote_array.shape}"
>>> f"Chunks of the remote array: {remote_array.chunks}"
>>> f"Blocks of the remote array: {remote_array.blocks}"
>>> f"Dtype of the remote array: {remote_array.dtype}"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd replace this by just:

>>> remote_array.shape
(60, 60)
...

Shape of the remote array: (60, 60)
Chunks of the remote array: (30, 60)
Blocks of the remote array: (10, 60)
Dtype of the remote array: float64
"""
if path.startswith("/"):
raise ValueError("The path should start with a root name, not a slash")
Expand Down Expand Up @@ -252,6 +270,36 @@ def get_chunk(self, nchunk: int) -> bytes:
-------
out: bytes
The requested compressed chunk.

Examples
--------
>>> import pathlib
>>> import numpy as np
>>> import blosc2
>>> host = "https://demo.caterva2.net/"
>>> root = "b2tests"
>>> dir = "expr/"
>>> root = "b2tests"
>>> dir = "expr/"
>>> name1 = "ds-0-10-linspace-float64-(True, True)-a1-(60, 60)d.b2nd"
>>> name2 = "ds-0-10-linspace-float64-(True, True)-a2-(60, 60)d.b2nd"
>>> path1 = pathlib.Path(f"{root}/{dir + name1}").as_posix()
>>> path2 = pathlib.Path(f"{root}/{dir + name2}").as_posix()
>>> a = blosc2.C2Array(path1, host)
>>> b = blosc2.C2Array(path2, host)
>>> c = a + b
>>> # Get the compressed chunk from array 'a' for index 0
>>> chunk_index = 0
>>> compressed_chunk = c.get_chunk(chunk_index)
>>> f"Size of chunk {chunk_index} from a: {len(compressed_chunk)} bytes"
Size of chunk 0 from a: 8604 bytes
>>> # Decompress the chunk and convert it to a NumPy array
>>> decompressed_chunk = blosc2.decompress(compressed_chunk)
>>> chunk_np_array = np.frombuffer(decompressed_chunk, dtype=a.dtype)
>>> f"Content of chunk {chunk_index} as NumPy array:{chunk_np_array}"
Content of chunk 0 as NumPy array:
[0.00000000e+00 5.55709919e-03 1.11141984e-02 ... 9.98610725e+00
9.99166435e+00 9.99722145e+00]
"""
url = _sub_url(self.urlbase, f"api/chunk/{self.path}")
params = {"nchunk": nchunk}
Expand Down
121 changes: 98 additions & 23 deletions src/blosc2/proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@
#######################################################################
from abc import ABC, abstractmethod

import numpy as np

import blosc2
import numpy as np


class ProxyNDSource(ABC):
Expand Down Expand Up @@ -248,17 +247,12 @@ def fetch(self, item: slice | list[slice] = None) -> blosc2.NDArray | blosc2.sch
>>> data = np.arange(20).reshape(10, 2)
>>> ndarray = blosc2.asarray(data)
>>> proxy = blosc2.Proxy(ndarray)
>>> full_data = proxy.fetch()
>>> f"Full data cache: {full_data[:]}"
Full data cache:
[[ 0 1][ 2 3][ 4 5]
[ 6 7][ 8 9][10 11]
[12 13][14 15][16 17]
[18 19]]
>>> slice_data = proxy[0:2, :]
>>> f"Slice data cache: {slice_data}"
>>> slice_data = proxy.fetch((slice(0, 3), slice(0, 2)))
>>> f"Slice data cache: {slice_data[:3, :2]}"
Slice data cache:
[[0 1][2 3]]
[[0 1]
[2 3]
[4 5]]
"""
if item is None:
# Full realization
Expand Down Expand Up @@ -296,6 +290,65 @@ async def afetch(self, item: slice | list[slice] = None) -> blosc2.NDArray | blo
-----
This method is only available if the :ref:`ProxySource` or :ref:`ProxyNDSource`
have an async `aget_chunk` method.

Examples
--------
>>> import numpy as np
>>> import blosc2
>>> import asyncio
>>> class MyProxySource:
>>> def __init__(self, data):
>>> # If the next source is multidimensional, it must have the attributes:
>>> self.data = data
>>> f"Data shape: {self.shape}, Chunks: {self.chunks}"
Data shape: (4, 5), Chunks: [2, 5]
>>> f"Blocks: {self.blocks}, Dtype: {self.dtype}"
Blocks: [1, 5], Dtype: int64
>>> @property
>>> def shape(self):
>>> return self.data.shape
>>> @property
>>> def chunks(self):
>>> return self.data.chunks
>>> @property
>>> def blocks(self):
>>> return self.data.blocks
>>> @property
>>> def dtype(self):
>>> return self.data.dtype
>>> # This method must be present
>>> def get_chunk(self, nchunk):
>>> return self.data.get_chunk(nchunk)
>>> # This method is optional
>>> async def aget_chunk(self, nchunk):
>>> await asyncio.sleep(0.1) # Simulate an asynchronous operation
>>> return self.data.get_chunk(nchunk)
>>> data = np.arange(20).reshape(4, 5)
>>> chunks = [2, 5]
>>> blocks = [1, 5]
>>> data = blosc2.asarray(data, chunks=chunks, blocks=blocks)
>>> source = MyProxySource(data)
>>> proxy = blosc2.Proxy(source)
>>> async def fetch_data():
>>> # Fetch a slice of the data from the proxy asynchronously
>>> slice_data = await proxy.afetch(slice(0, 2))
>>> # Note that only data fetched is shown, the rest is uninitialized
>>> f"Slice data cache: {slice_data[:]}"
Slice data cache:
[[0 1 2 3 4]
[5 6 7 8 9]
[0 0 0 0 0]
[0 0 0 0 0]]
>>> # Fetch the full data from the proxy asynchronously
>>> full_data = await proxy.afetch()
>>> # Now, all data is shown, meaning the full data has been fetched
>>> f"Full data cache: {full_data[:]}"
Full data cache:
[[ 0 1 2 3 4]
[ 5 6 7 8 9]
[10 11 12 13 14]
[15 16 17 18 19]]
>>> asyncio.run(fetch_data())
"""
if not callable(getattr(self.src, "aget_chunk", None)):
raise NotImplementedError("afetch is only available if the source has an aget_chunk method")
Expand Down Expand Up @@ -333,34 +386,36 @@ def __getitem__(self, item: slice | list[slice]) -> np.ndarray:
--------
>>> import numpy as np
>>> import blosc2
>>> data = np.arange(100).reshape(10, 10)
>>> data = np.arange(25).reshape(5, 5)
>>> ndarray = blosc2.asarray(data)
>>> proxy = blosc2.Proxy(ndarray)
>>> slice_1 = proxy[0:3, 0:3]
>>> f"Slice 1: {slice_1}"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same, print the entire proxy values to see that only slice [:3, :3] has been filled. If (10, 10) is too large for the dump, use e.g. (5, 5).

Slice 1:
[[ 0 1 2]
[ 5 6 7]
[10 11 12]
[20 21 22]]
>>> slice_2 = proxy[5:8, 2:5]
>>> slice_2 = proxy[2:5, 2:5]
>>> f"Slice 2: {slice_2}"
Slice 2:
[[52 53 54]
[62 63 64]
[72 73 74]]
[[12 13 14]
[17 18 19]
[22 23 24]]
"""
# Populate the cache
self.fetch(item)
return self._cache[item]

@property
def dtype(self) -> np.dtype:
"""The dtype of :paramref:`self` or None if the data is unidimensional"""
def dtype(self):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ditto

"""The dtype of :paramref:`self` or None if the data is unidimensional
"""
return self._cache.dtype if isinstance(self._cache, blosc2.NDArray) else None

@property
def shape(self) -> tuple[int]:
"""The shape of :paramref:`self`"""
def shape(self):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ditto

"""The shape of :paramref:`self`
"""
return self._cache.shape if isinstance(self._cache, blosc2.NDArray) else len(self._cache)

def __str__(self):
Expand All @@ -378,7 +433,7 @@ def vlmeta(self) -> blosc2.schunk.vlmeta:
return self._schunk_cache.vlmeta

@property
def fields(self) -> dict:
def fields(self)-> dict:
"""
Dictionary with the fields of :paramref:`self`.

Expand All @@ -390,6 +445,26 @@ def fields(self) -> dict:
See Also
--------
:ref:`NDField`

Examples
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm unsure about this one. It brings some insight about how to use the fields property, but on the other hand, this has already been documented in the original NDArray.fields. Better a See also section?

--------
>>> import numpy as np
>>> import blosc2
>>> data = np.zeros(16, dtype=[('field1', 'i4'), ('field2', 'f4')]).reshape(4, 4)
>>> ndarray = blosc2.asarray(data)
>>> proxy = blosc2.Proxy(ndarray)
>>> # Get a dictionary of fields from the proxy, where each field can be accessed individually
>>> fields_dict = proxy.fields
>>> for field_name, field_proxy in fields_dict.items():
>>> f"Field name: {field_name}, Field data: {field_proxy}"
Field name: field1, Field data: <blosc2.proxy.ProxyNDField object at 0x10c176c90>
Field name: field2, Field data: <blosc2.proxy.ProxyNDField object at 0x103264bf0>
>>> field1_data = fields_dict['field1'][:]
>>> field1_data
[[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]]
"""
_fields = getattr(self._cache, "fields", None)
if _fields is None:
Expand Down
Loading