From 55b47b17e06315eea13f37f658ae6128908c1115 Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Fri, 3 Nov 2023 12:36:28 -0700 Subject: [PATCH 01/12] Pull Zarrita into Zarr-Python @ 78274781ad64aef95772eb4b083f7ea9b7d03d06 No code changes to Zarrita were made. --- zarr/zarrita/__init__.py | 42 +++ zarr/zarrita/array.py | 498 ++++++++++++++++++++++++++++++++ zarr/zarrita/array_v2.py | 579 +++++++++++++++++++++++++++++++++++++ zarr/zarrita/codecs.py | 605 +++++++++++++++++++++++++++++++++++++++ zarr/zarrita/common.py | 163 +++++++++++ zarr/zarrita/group.py | 187 ++++++++++++ zarr/zarrita/group_v2.py | 228 +++++++++++++++ zarr/zarrita/indexing.py | 219 ++++++++++++++ zarr/zarrita/metadata.py | 344 ++++++++++++++++++++++ zarr/zarrita/sharding.py | 541 ++++++++++++++++++++++++++++++++++ zarr/zarrita/store.py | 307 ++++++++++++++++++++ zarr/zarrita/sync.py | 89 ++++++ 12 files changed, 3802 insertions(+) create mode 100644 zarr/zarrita/__init__.py create mode 100644 zarr/zarrita/array.py create mode 100644 zarr/zarrita/array_v2.py create mode 100644 zarr/zarrita/codecs.py create mode 100644 zarr/zarrita/common.py create mode 100644 zarr/zarrita/group.py create mode 100644 zarr/zarrita/group_v2.py create mode 100644 zarr/zarrita/indexing.py create mode 100644 zarr/zarrita/metadata.py create mode 100644 zarr/zarrita/sharding.py create mode 100644 zarr/zarrita/store.py create mode 100644 zarr/zarrita/sync.py diff --git a/zarr/zarrita/__init__.py b/zarr/zarrita/__init__.py new file mode 100644 index 0000000000..22a96e0a75 --- /dev/null +++ b/zarr/zarrita/__init__.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +from typing import Union + +import zarrita.codecs # noqa: F401 +from zarrita.array import Array # noqa: F401 +from zarrita.array_v2 import ArrayV2 # noqa: F401 +from zarrita.group import Group # noqa: F401 +from zarrita.group_v2 import GroupV2 # noqa: F401 +from zarrita.metadata import RuntimeConfiguration, runtime_configuration # noqa: F401 +from zarrita.store import ( # noqa: F401 + LocalStore, + RemoteStore, + Store, + StoreLike, + StorePath, + make_store_path, +) +from zarrita.sync import sync as _sync + + +async def open_auto_async( + store: StoreLike, + runtime_configuration_: RuntimeConfiguration = RuntimeConfiguration(), +) -> Union[Array, ArrayV2, Group, GroupV2]: + store_path = make_store_path(store) + try: + return await Group.open_or_array( + store_path, runtime_configuration=runtime_configuration_ + ) + except KeyError: + return await GroupV2.open_or_array(store_path, runtime_configuration_) + + +def open_auto( + store: StoreLike, + runtime_configuration_: RuntimeConfiguration = RuntimeConfiguration(), +) -> Union[Array, ArrayV2, Group, GroupV2]: + return _sync( + open_auto_async(store, runtime_configuration_), + runtime_configuration_.asyncio_loop, + ) diff --git a/zarr/zarrita/array.py b/zarr/zarrita/array.py new file mode 100644 index 0000000000..6ffd0b01e0 --- /dev/null +++ b/zarr/zarrita/array.py @@ -0,0 +1,498 @@ +from __future__ import annotations + +import json +from typing import Any, Dict, Iterable, Literal, Optional, Tuple, Union + +import numpy as np +from attr import evolve, frozen + +from zarrita.array_v2 import ArrayV2 +from zarrita.codecs import CodecMetadata, CodecPipeline, bytes_codec +from zarrita.common import ( + ZARR_JSON, + ChunkCoords, + Selection, + SliceSelection, + concurrent_map, +) +from zarrita.indexing import BasicIndexer, all_chunk_coords, is_total_slice +from zarrita.metadata import ( + ArrayMetadata, + DataType, + 
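+    # v3 array metadata plus the chunk-grid and chunk-key-encoding configuration classes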
DefaultChunkKeyEncodingConfigurationMetadata, + DefaultChunkKeyEncodingMetadata, + RegularChunkGridConfigurationMetadata, + RegularChunkGridMetadata, + RuntimeConfiguration, + V2ChunkKeyEncodingConfigurationMetadata, + V2ChunkKeyEncodingMetadata, + dtype_to_data_type, +) +from zarrita.sharding import ShardingCodec +from zarrita.store import StoreLike, StorePath, make_store_path +from zarrita.sync import sync + + +@frozen +class _AsyncArrayProxy: + array: Array + + def __getitem__(self, selection: Selection) -> _AsyncArraySelectionProxy: + return _AsyncArraySelectionProxy(self.array, selection) + + +@frozen +class _AsyncArraySelectionProxy: + array: Array + selection: Selection + + async def get(self) -> np.ndarray: + return await self.array._get_async(self.selection) + + async def set(self, value: np.ndarray): + return await self.array._set_async(self.selection, value) + + +def _json_convert(o): + if isinstance(o, DataType): + return o.name + raise TypeError + + +@frozen +class Array: + metadata: ArrayMetadata + store_path: StorePath + runtime_configuration: RuntimeConfiguration + codec_pipeline: CodecPipeline + + @classmethod + async def create_async( + cls, + store: StoreLike, + *, + shape: ChunkCoords, + dtype: Union[str, np.dtype], + chunk_shape: ChunkCoords, + fill_value: Optional[Any] = None, + chunk_key_encoding: Union[ + Tuple[Literal["default"], Literal[".", "/"]], + Tuple[Literal["v2"], Literal[".", "/"]], + ] = ("default", "/"), + codecs: Optional[Iterable[CodecMetadata]] = None, + dimension_names: Optional[Iterable[str]] = None, + attributes: Optional[Dict[str, Any]] = None, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + exists_ok: bool = False, + ) -> Array: + store_path = make_store_path(store) + if not exists_ok: + assert not await (store_path / ZARR_JSON).exists_async() + + data_type = ( + DataType[dtype] + if isinstance(dtype, str) + else DataType[dtype_to_data_type[dtype.str]] + ) + + codecs = list(codecs) if codecs is not None else [bytes_codec()] + + if fill_value is None: + if data_type == DataType.bool: + fill_value = False + else: + fill_value = 0 + + metadata = ArrayMetadata( + shape=shape, + data_type=data_type, + chunk_grid=RegularChunkGridMetadata( + configuration=RegularChunkGridConfigurationMetadata( + chunk_shape=chunk_shape + ) + ), + chunk_key_encoding=( + V2ChunkKeyEncodingMetadata( + configuration=V2ChunkKeyEncodingConfigurationMetadata( + separator=chunk_key_encoding[1] + ) + ) + if chunk_key_encoding[0] == "v2" + else DefaultChunkKeyEncodingMetadata( + configuration=DefaultChunkKeyEncodingConfigurationMetadata( + separator=chunk_key_encoding[1] + ) + ) + ), + fill_value=fill_value, + codecs=codecs, + dimension_names=tuple(dimension_names) if dimension_names else None, + attributes=attributes or {}, + ) + runtime_configuration = runtime_configuration or RuntimeConfiguration() + + array = cls( + metadata=metadata, + store_path=store_path, + runtime_configuration=runtime_configuration, + codec_pipeline=CodecPipeline.from_metadata( + metadata.codecs, metadata.get_core_metadata(runtime_configuration) + ), + ) + + await array._save_metadata() + return array + + @classmethod + def create( + cls, + store: StoreLike, + *, + shape: ChunkCoords, + dtype: Union[str, np.dtype], + chunk_shape: ChunkCoords, + fill_value: Optional[Any] = None, + chunk_key_encoding: Union[ + Tuple[Literal["default"], Literal[".", "/"]], + Tuple[Literal["v2"], Literal[".", "/"]], + ] = ("default", "/"), + codecs: Optional[Iterable[CodecMetadata]] = None, + 
dimension_names: Optional[Iterable[str]] = None, + attributes: Optional[Dict[str, Any]] = None, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + exists_ok: bool = False, + ) -> Array: + return sync( + cls.create_async( + store=store, + shape=shape, + dtype=dtype, + chunk_shape=chunk_shape, + fill_value=fill_value, + chunk_key_encoding=chunk_key_encoding, + codecs=codecs, + dimension_names=dimension_names, + attributes=attributes, + runtime_configuration=runtime_configuration, + exists_ok=exists_ok, + ), + runtime_configuration.asyncio_loop, + ) + + @classmethod + async def open_async( + cls, + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Array: + store_path = make_store_path(store) + zarr_json_bytes = await (store_path / ZARR_JSON).get_async() + assert zarr_json_bytes is not None + return cls.from_json( + store_path, + json.loads(zarr_json_bytes), + runtime_configuration=runtime_configuration or RuntimeConfiguration(), + ) + + @classmethod + def open( + cls, + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Array: + return sync( + cls.open_async(store, runtime_configuration=runtime_configuration), + runtime_configuration.asyncio_loop, + ) + + @classmethod + def from_json( + cls, + store_path: StorePath, + zarr_json: Any, + runtime_configuration: RuntimeConfiguration, + ) -> Array: + metadata = ArrayMetadata.from_json(zarr_json) + out = cls( + metadata=metadata, + store_path=store_path, + runtime_configuration=runtime_configuration, + codec_pipeline=CodecPipeline.from_metadata( + metadata.codecs, metadata.get_core_metadata(runtime_configuration) + ), + ) + out._validate_metadata() + return out + + @classmethod + async def open_auto_async( + cls, + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Union[Array, ArrayV2]: + store_path = make_store_path(store) + v3_metadata_bytes = await (store_path / ZARR_JSON).get_async() + if v3_metadata_bytes is not None: + return cls.from_json( + store_path, + json.loads(v3_metadata_bytes), + runtime_configuration=runtime_configuration or RuntimeConfiguration(), + ) + return await ArrayV2.open_async(store_path) + + @classmethod + def open_auto( + cls, + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Union[Array, ArrayV2]: + return sync( + cls.open_auto_async(store, runtime_configuration), + runtime_configuration.asyncio_loop, + ) + + async def _save_metadata(self) -> None: + self._validate_metadata() + + await (self.store_path / ZARR_JSON).set_async(self.metadata.to_bytes()) + + def _validate_metadata(self) -> None: + assert len(self.metadata.shape) == len( + self.metadata.chunk_grid.configuration.chunk_shape + ), "`chunk_shape` and `shape` need to have the same number of dimensions." + assert self.metadata.dimension_names is None or len(self.metadata.shape) == len( + self.metadata.dimension_names + ), "`dimension_names` and `shape` need to have the same number of dimensions." + assert self.metadata.fill_value is not None, "`fill_value` is required." 
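+        # Validation runs before zarr.json is persisted, so a call such as
+        #     Array.create(store, shape=(100, 100), dtype="float32", chunk_shape=(10, 10))
+        # fails here rather than leaving partial metadata behind.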
+ + @property + def ndim(self) -> int: + return len(self.metadata.shape) + + @property + def shape(self) -> ChunkCoords: + return self.metadata.shape + + @property + def dtype(self) -> np.dtype: + return self.metadata.dtype + + @property + def async_(self) -> _AsyncArrayProxy: + return _AsyncArrayProxy(self) + + def __getitem__(self, selection: Selection): + return sync(self._get_async(selection), self.runtime_configuration.asyncio_loop) + + async def _get_async(self, selection: Selection): + indexer = BasicIndexer( + selection, + shape=self.metadata.shape, + chunk_shape=self.metadata.chunk_grid.configuration.chunk_shape, + ) + + # setup output array + out = np.zeros( + indexer.shape, + dtype=self.metadata.dtype, + order=self.runtime_configuration.order, + ) + + # reading chunks and decoding them + await concurrent_map( + [ + (chunk_coords, chunk_selection, out_selection, out) + for chunk_coords, chunk_selection, out_selection in indexer + ], + self._read_chunk, + self.runtime_configuration.concurrency, + ) + + if out.shape: + return out + else: + return out[()] + + async def _read_chunk( + self, + chunk_coords: ChunkCoords, + chunk_selection: SliceSelection, + out_selection: SliceSelection, + out: np.ndarray, + ): + chunk_key_encoding = self.metadata.chunk_key_encoding + chunk_key = chunk_key_encoding.encode_chunk_key(chunk_coords) + store_path = self.store_path / chunk_key + + if len(self.codec_pipeline.codecs) == 1 and isinstance( + self.codec_pipeline.codecs[0], ShardingCodec + ): + chunk_array = await self.codec_pipeline.codecs[0].decode_partial( + store_path, chunk_selection + ) + if chunk_array is not None: + out[out_selection] = chunk_array + else: + out[out_selection] = self.metadata.fill_value + else: + chunk_bytes = await store_path.get_async() + if chunk_bytes is not None: + chunk_array = await self.codec_pipeline.decode(chunk_bytes) + tmp = chunk_array[chunk_selection] + out[out_selection] = tmp + else: + out[out_selection] = self.metadata.fill_value + + def __setitem__(self, selection: Selection, value: np.ndarray) -> None: + sync(self._set_async(selection, value), self.runtime_configuration.asyncio_loop) + + async def _set_async(self, selection: Selection, value: np.ndarray) -> None: + chunk_shape = self.metadata.chunk_grid.configuration.chunk_shape + indexer = BasicIndexer( + selection, + shape=self.metadata.shape, + chunk_shape=chunk_shape, + ) + + sel_shape = indexer.shape + + # check value shape + if np.isscalar(value): + # setting a scalar value + pass + else: + if not hasattr(value, "shape"): + value = np.asarray(value, self.metadata.dtype) + assert value.shape == sel_shape + if value.dtype.name != self.metadata.dtype.name: + value = value.astype(self.metadata.dtype, order="A") + + # merging with existing data and encoding chunks + await concurrent_map( + [ + ( + value, + chunk_shape, + chunk_coords, + chunk_selection, + out_selection, + ) + for chunk_coords, chunk_selection, out_selection in indexer + ], + self._write_chunk, + self.runtime_configuration.concurrency, + ) + + async def _write_chunk( + self, + value: np.ndarray, + chunk_shape: ChunkCoords, + chunk_coords: ChunkCoords, + chunk_selection: SliceSelection, + out_selection: SliceSelection, + ): + chunk_key_encoding = self.metadata.chunk_key_encoding + chunk_key = chunk_key_encoding.encode_chunk_key(chunk_coords) + store_path = self.store_path / chunk_key + + if is_total_slice(chunk_selection, chunk_shape): + # write entire chunks + if np.isscalar(value): + chunk_array = np.empty( + chunk_shape, + 
dtype=self.metadata.dtype,
+                )
+                chunk_array.fill(value)
+            else:
+                chunk_array = value[out_selection]
+            await self._write_chunk_to_store(store_path, chunk_array)
+
+        elif len(self.codec_pipeline.codecs) == 1 and isinstance(
+            self.codec_pipeline.codecs[0], ShardingCodec
+        ):
+            sharding_codec = self.codec_pipeline.codecs[0]
+            # print("encode_partial", chunk_coords, chunk_selection, repr(self))
+            await sharding_codec.encode_partial(
+                store_path,
+                value[out_selection],
+                chunk_selection,
+            )
+        else:
+            # writing partial chunks
+            # read chunk first
+            chunk_bytes = await store_path.get_async()
+
+            # merge new value
+            if chunk_bytes is None:
+                chunk_array = np.empty(
+                    chunk_shape,
+                    dtype=self.metadata.dtype,
+                )
+                chunk_array.fill(self.metadata.fill_value)
+            else:
+                chunk_array = (
+                    await self.codec_pipeline.decode(chunk_bytes)
+                ).copy()  # make a writable copy
+            chunk_array[chunk_selection] = value[out_selection]
+
+            await self._write_chunk_to_store(store_path, chunk_array)
+
+    async def _write_chunk_to_store(
+        self, store_path: StorePath, chunk_array: np.ndarray
+    ):
+        if np.all(chunk_array == self.metadata.fill_value):
+            # chunks that only contain fill_value will be removed
+            await store_path.delete_async()
+        else:
+            chunk_bytes = await self.codec_pipeline.encode(chunk_array)
+            if chunk_bytes is None:
+                await store_path.delete_async()
+            else:
+                await store_path.set_async(chunk_bytes)
+
+    async def resize_async(self, new_shape: ChunkCoords) -> Array:
+        assert len(new_shape) == len(self.metadata.shape)
+        new_metadata = evolve(self.metadata, shape=new_shape)
+
+        # Remove all chunks outside of the new shape
+        chunk_shape = self.metadata.chunk_grid.configuration.chunk_shape
+        chunk_key_encoding = self.metadata.chunk_key_encoding
+        old_chunk_coords = set(all_chunk_coords(self.metadata.shape, chunk_shape))
+        new_chunk_coords = set(all_chunk_coords(new_shape, chunk_shape))
+
+        async def _delete_key(key: str) -> None:
+            await (self.store_path / key).delete_async()
+
+        await concurrent_map(
+            [
+                (chunk_key_encoding.encode_chunk_key(chunk_coords),)
+                for chunk_coords in old_chunk_coords.difference(new_chunk_coords)
+            ],
+            _delete_key,
+            self.runtime_configuration.concurrency,
+        )
+
+        # Write new metadata
+        await (self.store_path / ZARR_JSON).set_async(new_metadata.to_bytes())
+        return evolve(self, metadata=new_metadata)
+
+    def resize(self, new_shape: ChunkCoords) -> Array:
+        return sync(
+            self.resize_async(new_shape), self.runtime_configuration.asyncio_loop
+        )
+
+    async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> Array:
+        new_metadata = evolve(self.metadata, attributes=new_attributes)
+
+        # Write new metadata
+        await (self.store_path / ZARR_JSON).set_async(new_metadata.to_bytes())
+        return evolve(self, metadata=new_metadata)
+
+    def update_attributes(self, new_attributes: Dict[str, Any]) -> Array:
+        return sync(
+            self.update_attributes_async(new_attributes),
+            self.runtime_configuration.asyncio_loop,
+        )
+
+    def __repr__(self):
+        return f"<Array {self.store_path}>"
diff --git a/zarr/zarrita/array_v2.py b/zarr/zarrita/array_v2.py
new file mode 100644
index 0000000000..b9ce51b29b
--- /dev/null
+++ b/zarr/zarrita/array_v2.py
@@ -0,0 +1,579 @@
+from __future__ import annotations
+
+import asyncio
+import json
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union
+
+import numcodecs
+import numpy as np
+from attr import evolve, frozen
+from numcodecs.compat import ensure_bytes, ensure_ndarray
+
+from zarrita.common import (
+    ZARRAY_JSON,
+    ZATTRS_JSON,
+    BytesLike,
ChunkCoords, + Selection, + SliceSelection, + concurrent_map, + to_thread, +) +from zarrita.indexing import BasicIndexer, all_chunk_coords, is_total_slice +from zarrita.metadata import ArrayV2Metadata, RuntimeConfiguration +from zarrita.store import StoreLike, StorePath, make_store_path +from zarrita.sync import sync + +if TYPE_CHECKING: + from zarrita.array import Array + + +@frozen +class _AsyncArrayProxy: + array: ArrayV2 + + def __getitem__(self, selection: Selection) -> _AsyncArraySelectionProxy: + return _AsyncArraySelectionProxy(self.array, selection) + + +@frozen +class _AsyncArraySelectionProxy: + array: ArrayV2 + selection: Selection + + async def get(self) -> np.ndarray: + return await self.array.get_async(self.selection) + + async def set(self, value: np.ndarray): + return await self.array.set_async(self.selection, value) + + +@frozen +class ArrayV2: + metadata: ArrayV2Metadata + attributes: Optional[Dict[str, Any]] + store_path: StorePath + runtime_configuration: RuntimeConfiguration + + @classmethod + async def create_async( + cls, + store: StoreLike, + *, + shape: ChunkCoords, + dtype: np.dtype, + chunks: ChunkCoords, + dimension_separator: Literal[".", "/"] = ".", + fill_value: Optional[Union[None, int, float]] = None, + order: Literal["C", "F"] = "C", + filters: Optional[List[Dict[str, Any]]] = None, + compressor: Optional[Dict[str, Any]] = None, + attributes: Optional[Dict[str, Any]] = None, + exists_ok: bool = False, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> ArrayV2: + store_path = make_store_path(store) + if not exists_ok: + assert not await (store_path / ZARRAY_JSON).exists_async() + + metadata = ArrayV2Metadata( + shape=shape, + dtype=np.dtype(dtype), + chunks=chunks, + order=order, + dimension_separator=dimension_separator, + fill_value=0 if fill_value is None else fill_value, + compressor=numcodecs.get_codec(compressor).get_config() + if compressor is not None + else None, + filters=[numcodecs.get_codec(filter).get_config() for filter in filters] + if filters is not None + else None, + ) + array = cls( + metadata=metadata, + store_path=store_path, + attributes=attributes, + runtime_configuration=runtime_configuration, + ) + await array._save_metadata() + return array + + @classmethod + def create( + cls, + store: StoreLike, + *, + shape: ChunkCoords, + dtype: np.dtype, + chunks: ChunkCoords, + dimension_separator: Literal[".", "/"] = ".", + fill_value: Optional[Union[None, int, float]] = None, + order: Literal["C", "F"] = "C", + filters: Optional[List[Dict[str, Any]]] = None, + compressor: Optional[Dict[str, Any]] = None, + attributes: Optional[Dict[str, Any]] = None, + exists_ok: bool = False, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> ArrayV2: + return sync( + cls.create_async( + store, + shape=shape, + dtype=dtype, + chunks=chunks, + order=order, + dimension_separator=dimension_separator, + fill_value=0 if fill_value is None else fill_value, + compressor=compressor, + filters=filters, + attributes=attributes, + exists_ok=exists_ok, + runtime_configuration=runtime_configuration, + ), + runtime_configuration.asyncio_loop, + ) + + @classmethod + async def open_async( + cls, + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> ArrayV2: + store_path = make_store_path(store) + zarray_bytes, zattrs_bytes = await asyncio.gather( + (store_path / ZARRAY_JSON).get_async(), + (store_path / ZATTRS_JSON).get_async(), + ) + assert zarray_bytes is not None + 
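+        # .zattrs is optional in v2: a missing attributes document is passed
+        # through as None rather than an empty dict.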
return cls.from_json( + store_path, + zarray_json=json.loads(zarray_bytes), + zattrs_json=json.loads(zattrs_bytes) if zattrs_bytes is not None else None, + runtime_configuration=runtime_configuration, + ) + + @classmethod + def open( + cls, + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> ArrayV2: + return sync( + cls.open_async(store, runtime_configuration), + runtime_configuration.asyncio_loop, + ) + + @classmethod + def from_json( + cls, + store_path: StorePath, + zarray_json: Any, + zattrs_json: Optional[Any], + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> ArrayV2: + metadata = ArrayV2Metadata.from_json(zarray_json) + out = cls( + store_path=store_path, + metadata=metadata, + attributes=zattrs_json, + runtime_configuration=runtime_configuration, + ) + out._validate_metadata() + return out + + async def _save_metadata(self) -> None: + self._validate_metadata() + + await (self.store_path / ZARRAY_JSON).set_async(self.metadata.to_bytes()) + if self.attributes is not None and len(self.attributes) > 0: + await (self.store_path / ZATTRS_JSON).set_async( + json.dumps(self.attributes).encode(), + ) + else: + await (self.store_path / ZATTRS_JSON).delete_async() + + def _validate_metadata(self) -> None: + assert len(self.metadata.shape) == len( + self.metadata.chunks + ), "`chunks` and `shape` need to have the same number of dimensions." + + @property + def ndim(self) -> int: + return len(self.metadata.shape) + + @property + def shape(self) -> ChunkCoords: + return self.metadata.shape + + @property + def dtype(self) -> np.dtype: + return self.metadata.dtype + + @property + def async_(self) -> _AsyncArrayProxy: + return _AsyncArrayProxy(self) + + def __getitem__(self, selection: Selection): + return sync(self.get_async(selection), self.runtime_configuration.asyncio_loop) + + async def get_async(self, selection: Selection): + indexer = BasicIndexer( + selection, + shape=self.metadata.shape, + chunk_shape=self.metadata.chunks, + ) + + # setup output array + out = np.zeros( + indexer.shape, + dtype=self.metadata.dtype, + order=self.metadata.order, + ) + + # reading chunks and decoding them + await concurrent_map( + [ + (chunk_coords, chunk_selection, out_selection, out) + for chunk_coords, chunk_selection, out_selection in indexer + ], + self._read_chunk, + ) + + if out.shape: + return out + else: + return out[()] + + async def _read_chunk( + self, + chunk_coords: ChunkCoords, + chunk_selection: SliceSelection, + out_selection: SliceSelection, + out: np.ndarray, + ): + store_path = self.store_path / self._encode_chunk_key(chunk_coords) + + chunk_array = await self._decode_chunk(await store_path.get_async()) + if chunk_array is not None: + tmp = chunk_array[chunk_selection] + out[out_selection] = tmp + else: + out[out_selection] = self.metadata.fill_value + + async def _decode_chunk( + self, chunk_bytes: Optional[BytesLike] + ) -> Optional[np.ndarray]: + if chunk_bytes is None: + return None + + if self.metadata.compressor is not None: + compressor = numcodecs.get_codec(self.metadata.compressor) + chunk_array = ensure_ndarray( + await to_thread(compressor.decode, chunk_bytes) + ) + else: + chunk_array = ensure_ndarray(chunk_bytes) + + # ensure correct dtype + if str(chunk_array.dtype) != self.metadata.dtype: + chunk_array = chunk_array.view(self.metadata.dtype) + + # apply filters in reverse order + if self.metadata.filters is not None: + for filter_metadata in self.metadata.filters[::-1]: + filter = 
numcodecs.get_codec(filter_metadata) + chunk_array = await to_thread(filter.decode, chunk_array) + + # ensure correct chunk shape + if chunk_array.shape != self.metadata.chunks: + chunk_array = chunk_array.reshape( + self.metadata.chunks, + order=self.metadata.order, + ) + + return chunk_array + + def __setitem__(self, selection: Selection, value: np.ndarray) -> None: + sync(self.set_async(selection, value), self.runtime_configuration.asyncio_loop) + + async def set_async(self, selection: Selection, value: np.ndarray) -> None: + chunk_shape = self.metadata.chunks + indexer = BasicIndexer( + selection, + shape=self.metadata.shape, + chunk_shape=chunk_shape, + ) + + sel_shape = indexer.shape + + # check value shape + if np.isscalar(value): + # setting a scalar value + pass + else: + if not hasattr(value, "shape"): + value = np.asarray(value, self.metadata.dtype) + assert value.shape == sel_shape + if value.dtype != self.metadata.dtype: + value = value.astype(self.metadata.dtype, order="A") + + # merging with existing data and encoding chunks + await concurrent_map( + [ + ( + value, + chunk_shape, + chunk_coords, + chunk_selection, + out_selection, + ) + for chunk_coords, chunk_selection, out_selection in indexer + ], + self._write_chunk, + ) + + async def _write_chunk( + self, + value: np.ndarray, + chunk_shape: ChunkCoords, + chunk_coords: ChunkCoords, + chunk_selection: SliceSelection, + out_selection: SliceSelection, + ): + store_path = self.store_path / self._encode_chunk_key(chunk_coords) + + if is_total_slice(chunk_selection, chunk_shape): + # write entire chunks + if np.isscalar(value): + chunk_array = np.empty( + chunk_shape, + dtype=self.metadata.dtype, + order=self.metadata.order, + ) + chunk_array.fill(value) + else: + chunk_array = value[out_selection] + await self._write_chunk_to_store(store_path, chunk_array) + + else: + # writing partial chunks + # read chunk first + tmp = await self._decode_chunk(await store_path.get_async()) + + # merge new value + if tmp is None: + chunk_array = np.empty( + chunk_shape, + dtype=self.metadata.dtype, + order=self.metadata.order, + ) + chunk_array.fill(self.metadata.fill_value) + else: + chunk_array = tmp.copy( + order=self.metadata.order, + ) # make a writable copy + chunk_array[chunk_selection] = value[out_selection] + + await self._write_chunk_to_store(store_path, chunk_array) + + async def _write_chunk_to_store( + self, store_path: StorePath, chunk_array: np.ndarray + ): + chunk_bytes: Optional[BytesLike] + if np.all(chunk_array == self.metadata.fill_value): + # chunks that only contain fill_value will be removed + await store_path.delete_async() + else: + chunk_bytes = await self._encode_chunk(chunk_array) + if chunk_bytes is None: + await store_path.delete_async() + else: + await store_path.set_async(chunk_bytes) + + async def _encode_chunk(self, chunk_array: np.ndarray) -> Optional[BytesLike]: + chunk_array = chunk_array.ravel(order=self.metadata.order) + + if self.metadata.filters is not None: + for filter_metadata in self.metadata.filters: + filter = numcodecs.get_codec(filter_metadata) + chunk_array = await to_thread(filter.encode, chunk_array) + + if self.metadata.compressor is not None: + compressor = numcodecs.get_codec(self.metadata.compressor) + if ( + not chunk_array.flags.c_contiguous + and not chunk_array.flags.f_contiguous + ): + chunk_array = chunk_array.copy(order="A") + encoded_chunk_bytes = ensure_bytes( + await to_thread(compressor.encode, chunk_array) + ) + else: + encoded_chunk_bytes = ensure_bytes(chunk_array) + + 
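+        # the chunk is now fully encoded: filters first, then the optional compressor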
return encoded_chunk_bytes + + def _encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: + chunk_identifier = self.metadata.dimension_separator.join( + map(str, chunk_coords) + ) + return "0" if chunk_identifier == "" else chunk_identifier + + async def resize_async(self, new_shape: ChunkCoords) -> ArrayV2: + assert len(new_shape) == len(self.metadata.shape) + new_metadata = evolve(self.metadata, shape=new_shape) + + # Remove all chunks outside of the new shape + chunk_shape = self.metadata.chunks + old_chunk_coords = set(all_chunk_coords(self.metadata.shape, chunk_shape)) + new_chunk_coords = set(all_chunk_coords(new_shape, chunk_shape)) + + async def _delete_key(key: str) -> None: + await (self.store_path / key).delete_async() + + await concurrent_map( + [ + (self._encode_chunk_key(chunk_coords),) + for chunk_coords in old_chunk_coords.difference(new_chunk_coords) + ], + _delete_key, + ) + + # Write new metadata + await (self.store_path / ZARRAY_JSON).set_async(new_metadata.to_bytes()) + return evolve(self, metadata=new_metadata) + + def resize(self, new_shape: ChunkCoords) -> ArrayV2: + return sync( + self.resize_async(new_shape), self.runtime_configuration.asyncio_loop + ) + + async def convert_to_v3_async(self) -> Array: + from sys import byteorder as sys_byteorder + + from zarrita.array import Array + from zarrita.common import ZARR_JSON + from zarrita.metadata import ( + ArrayMetadata, + BloscCodecConfigurationMetadata, + BloscCodecMetadata, + BytesCodecConfigurationMetadata, + BytesCodecMetadata, + CodecMetadata, + DataType, + GzipCodecConfigurationMetadata, + GzipCodecMetadata, + RegularChunkGridConfigurationMetadata, + RegularChunkGridMetadata, + TransposeCodecConfigurationMetadata, + TransposeCodecMetadata, + V2ChunkKeyEncodingConfigurationMetadata, + V2ChunkKeyEncodingMetadata, + blosc_shuffle_int_to_str, + dtype_to_data_type, + ) + + data_type = DataType[dtype_to_data_type[self.metadata.dtype.str]] + endian: Literal["little", "big"] + if self.metadata.dtype.byteorder == "=": + endian = sys_byteorder + elif self.metadata.dtype.byteorder == ">": + endian = "big" + else: + endian = "little" + + assert ( + self.metadata.filters is None or len(self.metadata.filters) == 0 + ), "Filters are not supported by v3." + + codecs: List[CodecMetadata] = [] + + if self.metadata.order == "F": + codecs.append( + TransposeCodecMetadata( + configuration=TransposeCodecConfigurationMetadata(order="F") + ) + ) + codecs.append( + BytesCodecMetadata( + configuration=BytesCodecConfigurationMetadata(endian=endian) + ) + ) + + if self.metadata.compressor is not None: + v2_codec = numcodecs.get_codec(self.metadata.compressor).get_config() + assert v2_codec["id"] in ( + "blosc", + "gzip", + ), "Only blosc and gzip are supported by v3." 
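+            # Translate the numcodecs config into the equivalent v3 codec metadata;
+            # v2 stores the blosc shuffle mode as an int, which
+            # blosc_shuffle_int_to_str maps to the v3 string form.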
+            if v2_codec["id"] == "blosc":
+                shuffle = blosc_shuffle_int_to_str[v2_codec.get("shuffle", 0)]
+                codecs.append(
+                    BloscCodecMetadata(
+                        configuration=BloscCodecConfigurationMetadata(
+                            typesize=data_type.byte_count,
+                            cname=v2_codec["cname"],
+                            clevel=v2_codec["clevel"],
+                            shuffle=shuffle,
+                            blocksize=v2_codec.get("blocksize", 0),
+                        )
+                    )
+                )
+            elif v2_codec["id"] == "gzip":
+                codecs.append(
+                    GzipCodecMetadata(
+                        configuration=GzipCodecConfigurationMetadata(
+                            level=v2_codec.get("level", 5)
+                        )
+                    )
+                )
+
+        new_metadata = ArrayMetadata(
+            shape=self.metadata.shape,
+            chunk_grid=RegularChunkGridMetadata(
+                configuration=RegularChunkGridConfigurationMetadata(
+                    chunk_shape=self.metadata.chunks
+                )
+            ),
+            data_type=data_type,
+            fill_value=0
+            if self.metadata.fill_value is None
+            else self.metadata.fill_value,
+            chunk_key_encoding=V2ChunkKeyEncodingMetadata(
+                configuration=V2ChunkKeyEncodingConfigurationMetadata(
+                    separator=self.metadata.dimension_separator
+                )
+            ),
+            codecs=codecs,
+            attributes=self.attributes or {},
+        )
+
+        new_metadata_bytes = new_metadata.to_bytes()
+        await (self.store_path / ZARR_JSON).set_async(new_metadata_bytes)
+
+        return Array.from_json(
+            store_path=self.store_path,
+            zarr_json=json.loads(new_metadata_bytes),
+            runtime_configuration=self.runtime_configuration,
+        )
+
+    async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> ArrayV2:
+        await (self.store_path / ZATTRS_JSON).set_async(
+            json.dumps(new_attributes).encode()
+        )
+        return evolve(self, attributes=new_attributes)
+
+    def update_attributes(self, new_attributes: Dict[str, Any]) -> ArrayV2:
+        return sync(
+            self.update_attributes_async(new_attributes),
+            self.runtime_configuration.asyncio_loop,
+        )
+
+    def convert_to_v3(self) -> Array:
+        return sync(
+            self.convert_to_v3_async(), loop=self.runtime_configuration.asyncio_loop
+        )
+
+    def __repr__(self):
+        return f"<ArrayV2 {self.store_path}>"
diff --git a/zarr/zarrita/codecs.py b/zarr/zarrita/codecs.py
new file mode 100644
index 0000000000..56f99e3e06
--- /dev/null
+++ b/zarr/zarrita/codecs.py
@@ -0,0 +1,605 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from functools import reduce
+from typing import TYPE_CHECKING, Iterable, List, Literal, Optional, Tuple, Union
+from warnings import warn
+
+import numcodecs
+import numpy as np
+from attr import asdict, evolve, frozen
+from crc32c import crc32c
+from numcodecs.blosc import Blosc
+from numcodecs.gzip import GZip
+from zstandard import ZstdCompressor, ZstdDecompressor
+
+from zarrita.common import BytesLike, to_thread
+from zarrita.metadata import (
+    BloscCodecConfigurationMetadata,
+    BloscCodecMetadata,
+    BytesCodecConfigurationMetadata,
+    BytesCodecMetadata,
+    CodecMetadata,
+    Crc32cCodecMetadata,
+    GzipCodecConfigurationMetadata,
+    GzipCodecMetadata,
+    ShardingCodecConfigurationMetadata,
+    ShardingCodecMetadata,
+    TransposeCodecConfigurationMetadata,
+    TransposeCodecMetadata,
+    ZstdCodecConfigurationMetadata,
+    ZstdCodecMetadata,
+)
+
+if TYPE_CHECKING:
+    from zarrita.metadata import CoreArrayMetadata
+
+# See https://zarr.readthedocs.io/en/stable/tutorial.html#configuring-blosc
+numcodecs.blosc.use_threads = False
+
+
+class Codec(ABC):
+    supports_partial_decode: bool
+    supports_partial_encode: bool
+    is_fixed_size: bool
+    array_metadata: CoreArrayMetadata
+
+    @abstractmethod
+    def compute_encoded_size(self, input_byte_length: int) -> int:
+        pass
+
+    def resolve_metadata(self) -> CoreArrayMetadata:
+        return self.array_metadata
+
+
+class ArrayArrayCodec(Codec):
+    @abstractmethod
+ async def decode( + self, + chunk_array: np.ndarray, + ) -> np.ndarray: + pass + + @abstractmethod + async def encode( + self, + chunk_array: np.ndarray, + ) -> Optional[np.ndarray]: + pass + + +class ArrayBytesCodec(Codec): + @abstractmethod + async def decode( + self, + chunk_array: BytesLike, + ) -> np.ndarray: + pass + + @abstractmethod + async def encode( + self, + chunk_array: np.ndarray, + ) -> Optional[BytesLike]: + pass + + +class BytesBytesCodec(Codec): + @abstractmethod + async def decode( + self, + chunk_array: BytesLike, + ) -> BytesLike: + pass + + @abstractmethod + async def encode( + self, + chunk_array: BytesLike, + ) -> Optional[BytesLike]: + pass + + +@frozen +class CodecPipeline: + codecs: List[Codec] + + @classmethod + def from_metadata( + cls, + codecs_metadata: Iterable[CodecMetadata], + array_metadata: CoreArrayMetadata, + ) -> CodecPipeline: + out: List[Codec] = [] + for codec_metadata in codecs_metadata or []: + if codec_metadata.name == "endian": + codec_metadata = evolve(codec_metadata, name="bytes") # type: ignore + + codec: Codec + if codec_metadata.name == "blosc": + codec = BloscCodec.from_metadata(codec_metadata, array_metadata) + elif codec_metadata.name == "gzip": + codec = GzipCodec.from_metadata(codec_metadata, array_metadata) + elif codec_metadata.name == "zstd": + codec = ZstdCodec.from_metadata(codec_metadata, array_metadata) + elif codec_metadata.name == "transpose": + codec = TransposeCodec.from_metadata(codec_metadata, array_metadata) + elif codec_metadata.name == "bytes": + codec = BytesCodec.from_metadata(codec_metadata, array_metadata) + elif codec_metadata.name == "crc32c": + codec = Crc32cCodec.from_metadata(codec_metadata, array_metadata) + elif codec_metadata.name == "sharding_indexed": + from zarrita.sharding import ShardingCodec + + codec = ShardingCodec.from_metadata(codec_metadata, array_metadata) + else: + raise RuntimeError(f"Unsupported codec: {codec_metadata}") + + out.append(codec) + array_metadata = codec.resolve_metadata() + CodecPipeline._validate_codecs(out, array_metadata) + return cls(out) + + @staticmethod + def _validate_codecs( + codecs: List[Codec], array_metadata: CoreArrayMetadata + ) -> None: + from zarrita.sharding import ShardingCodec + + assert any( + isinstance(codec, ArrayBytesCodec) for codec in codecs + ), "Exactly one array-to-bytes codec is required." + + prev_codec: Optional[Codec] = None + for codec in codecs: + if prev_codec is not None: + assert not isinstance(codec, ArrayBytesCodec) or not isinstance( + prev_codec, ArrayBytesCodec + ), ( + f"ArrayBytesCodec '{type(codec)}' cannot follow after " + + f"ArrayBytesCodec '{type(prev_codec)}' because exactly " + + "1 ArrayBytesCodec is allowed." + ) + assert not isinstance(codec, ArrayBytesCodec) or not isinstance( + prev_codec, BytesBytesCodec + ), ( + f"ArrayBytesCodec '{type(codec)}' cannot follow after " + + f"BytesBytesCodec '{type(prev_codec)}'." + ) + assert not isinstance(codec, ArrayArrayCodec) or not isinstance( + prev_codec, ArrayBytesCodec + ), ( + f"ArrayArrayCodec '{type(codec)}' cannot follow after " + + f"ArrayBytesCodec '{type(prev_codec)}'." + ) + assert not isinstance(codec, ArrayArrayCodec) or not isinstance( + prev_codec, BytesBytesCodec + ), ( + f"ArrayArrayCodec '{type(codec)}' cannot follow after " + + f"BytesBytesCodec '{type(prev_codec)}'." 
+ ) + + if isinstance(codec, ShardingCodec): + assert len(codec.configuration.chunk_shape) == len( + array_metadata.shape + ), ( + "The shard's `chunk_shape` and array's `shape` need to have the " + + "same number of dimensions." + ) + assert all( + s % c == 0 + for s, c in zip( + array_metadata.chunk_shape, + codec.configuration.chunk_shape, + ) + ), ( + "The array's `chunk_shape` needs to be divisible by the " + + "shard's inner `chunk_shape`." + ) + prev_codec = codec + + if ( + any(isinstance(codec, ShardingCodec) for codec in codecs) + and len(codecs) > 1 + ): + warn( + "Combining a `sharding_indexed` codec disables partial reads and " + + "writes, which may lead to inefficient performance." + ) + + def _array_array_codecs(self) -> List[ArrayArrayCodec]: + return [codec for codec in self.codecs if isinstance(codec, ArrayArrayCodec)] + + def _array_bytes_codec(self) -> ArrayBytesCodec: + return next( + codec for codec in self.codecs if isinstance(codec, ArrayBytesCodec) + ) + + def _bytes_bytes_codecs(self) -> List[BytesBytesCodec]: + return [codec for codec in self.codecs if isinstance(codec, BytesBytesCodec)] + + async def decode(self, chunk_bytes: BytesLike) -> np.ndarray: + for bb_codec in self._bytes_bytes_codecs()[::-1]: + chunk_bytes = await bb_codec.decode(chunk_bytes) + + chunk_array = await self._array_bytes_codec().decode(chunk_bytes) + + for aa_codec in self._array_array_codecs()[::-1]: + chunk_array = await aa_codec.decode(chunk_array) + + return chunk_array + + async def encode(self, chunk_array: np.ndarray) -> Optional[BytesLike]: + for aa_codec in self._array_array_codecs(): + chunk_array_maybe = await aa_codec.encode(chunk_array) + if chunk_array_maybe is None: + return None + chunk_array = chunk_array_maybe + + chunk_bytes_maybe = await self._array_bytes_codec().encode(chunk_array) + if chunk_bytes_maybe is None: + return None + chunk_bytes = chunk_bytes_maybe + + for bb_codec in self._bytes_bytes_codecs(): + chunk_bytes_maybe = await bb_codec.encode(chunk_bytes) + if chunk_bytes_maybe is None: + return None + chunk_bytes = chunk_bytes_maybe + + return chunk_bytes + + def compute_encoded_size(self, byte_length: int) -> int: + return reduce( + lambda acc, codec: codec.compute_encoded_size(acc), self.codecs, byte_length + ) + + +@frozen +class BloscCodec(BytesBytesCodec): + array_metadata: CoreArrayMetadata + configuration: BloscCodecConfigurationMetadata + blosc_codec: Blosc + is_fixed_size = False + + @classmethod + def from_metadata( + cls, codec_metadata: BloscCodecMetadata, array_metadata: CoreArrayMetadata + ) -> BloscCodec: + configuration = codec_metadata.configuration + if configuration.typesize == 0: + configuration = evolve( + configuration, typesize=array_metadata.data_type.byte_count + ) + config_dict = asdict(codec_metadata.configuration) + config_dict.pop("typesize", None) + map_shuffle_str_to_int = {"noshuffle": 0, "shuffle": 1, "bitshuffle": 2} + config_dict["shuffle"] = map_shuffle_str_to_int[config_dict["shuffle"]] + return cls( + array_metadata=array_metadata, + configuration=configuration, + blosc_codec=Blosc.from_config(config_dict), + ) + + async def decode( + self, + chunk_bytes: bytes, + ) -> BytesLike: + return await to_thread(self.blosc_codec.decode, chunk_bytes) + + async def encode( + self, + chunk_bytes: bytes, + ) -> Optional[BytesLike]: + chunk_array = np.frombuffer(chunk_bytes, dtype=self.array_metadata.dtype) + return await to_thread(self.blosc_codec.encode, chunk_array) + + def compute_encoded_size(self, _input_byte_length: int) -> 
int: + raise NotImplementedError + + +@frozen +class BytesCodec(ArrayBytesCodec): + array_metadata: CoreArrayMetadata + configuration: BytesCodecConfigurationMetadata + is_fixed_size = True + + @classmethod + def from_metadata( + cls, codec_metadata: BytesCodecMetadata, array_metadata: CoreArrayMetadata + ) -> BytesCodec: + assert ( + array_metadata.dtype.itemsize == 1 + or codec_metadata.configuration.endian is not None + ), "The `endian` configuration needs to be specified for multi-byte data types." + return cls( + array_metadata=array_metadata, + configuration=codec_metadata.configuration, + ) + + def _get_byteorder(self, array: np.ndarray) -> Literal["big", "little"]: + if array.dtype.byteorder == "<": + return "little" + elif array.dtype.byteorder == ">": + return "big" + else: + import sys + + return sys.byteorder + + async def decode( + self, + chunk_bytes: BytesLike, + ) -> np.ndarray: + if self.array_metadata.dtype.itemsize > 0: + if self.configuration.endian == "little": + prefix = "<" + else: + prefix = ">" + dtype = np.dtype( + f"{prefix}{self.array_metadata.data_type.to_numpy_shortname()}" + ) + else: + dtype = np.dtype(f"|{self.array_metadata.data_type.to_numpy_shortname()}") + chunk_array = np.frombuffer(chunk_bytes, dtype) + + # ensure correct chunk shape + if chunk_array.shape != self.array_metadata.chunk_shape: + chunk_array = chunk_array.reshape( + self.array_metadata.chunk_shape, + ) + return chunk_array + + async def encode( + self, + chunk_array: np.ndarray, + ) -> Optional[BytesLike]: + if chunk_array.dtype.itemsize > 1: + byteorder = self._get_byteorder(chunk_array) + if self.configuration.endian != byteorder: + new_dtype = chunk_array.dtype.newbyteorder(self.configuration.endian) + chunk_array = chunk_array.astype(new_dtype) + return chunk_array.tobytes() + + def compute_encoded_size(self, input_byte_length: int) -> int: + return input_byte_length + + +@frozen +class TransposeCodec(ArrayArrayCodec): + array_metadata: CoreArrayMetadata + order: Tuple[int, ...] + is_fixed_size = True + + @classmethod + def from_metadata( + cls, codec_metadata: TransposeCodecMetadata, array_metadata: CoreArrayMetadata + ) -> TransposeCodec: + configuration = codec_metadata.configuration + if configuration.order == "F": + order = tuple( + array_metadata.ndim - x - 1 for x in range(array_metadata.ndim) + ) + + elif configuration.order == "C": + order = tuple(range(array_metadata.ndim)) + + else: + assert len(configuration.order) == array_metadata.ndim, ( + "The `order` tuple needs have as many entries as " + + f"there are dimensions in the array. Got: {configuration.order}" + ) + assert len(configuration.order) == len(set(configuration.order)), ( + "There must not be duplicates in the `order` tuple. " + + f"Got: {configuration.order}" + ) + assert all(0 <= x < array_metadata.ndim for x in configuration.order), ( + "All entries in the `order` tuple must be between 0 and " + + f"the number of dimensions in the array. 
Got: {configuration.order}" + ) + order = tuple(configuration.order) + + return cls( + array_metadata=array_metadata, + order=order, + ) + + def resolve_metadata(self) -> CoreArrayMetadata: + from zarrita.metadata import CoreArrayMetadata + + return CoreArrayMetadata( + shape=tuple( + self.array_metadata.shape[self.order[i]] + for i in range(self.array_metadata.ndim) + ), + chunk_shape=tuple( + self.array_metadata.chunk_shape[self.order[i]] + for i in range(self.array_metadata.ndim) + ), + data_type=self.array_metadata.data_type, + fill_value=self.array_metadata.fill_value, + runtime_configuration=self.array_metadata.runtime_configuration, + ) + + async def decode( + self, + chunk_array: np.ndarray, + ) -> np.ndarray: + inverse_order = [0 for _ in range(self.array_metadata.ndim)] + for x, i in enumerate(self.order): + inverse_order[x] = i + chunk_array = chunk_array.transpose(inverse_order) + return chunk_array + + async def encode( + self, + chunk_array: np.ndarray, + ) -> Optional[np.ndarray]: + chunk_array = chunk_array.transpose(self.order) + return chunk_array + + def compute_encoded_size(self, input_byte_length: int) -> int: + return input_byte_length + + +@frozen +class GzipCodec(BytesBytesCodec): + array_metadata: CoreArrayMetadata + configuration: GzipCodecConfigurationMetadata + is_fixed_size = True + + @classmethod + def from_metadata( + cls, codec_metadata: GzipCodecMetadata, array_metadata: CoreArrayMetadata + ) -> GzipCodec: + return cls( + array_metadata=array_metadata, + configuration=codec_metadata.configuration, + ) + + async def decode( + self, + chunk_bytes: bytes, + ) -> BytesLike: + return await to_thread(GZip(self.configuration.level).decode, chunk_bytes) + + async def encode( + self, + chunk_bytes: bytes, + ) -> Optional[BytesLike]: + return await to_thread(GZip(self.configuration.level).encode, chunk_bytes) + + def compute_encoded_size(self, _input_byte_length: int) -> int: + raise NotImplementedError + + +@frozen +class ZstdCodec(BytesBytesCodec): + array_metadata: CoreArrayMetadata + configuration: ZstdCodecConfigurationMetadata + is_fixed_size = True + + @classmethod + def from_metadata( + cls, codec_metadata: ZstdCodecMetadata, array_metadata: CoreArrayMetadata + ) -> ZstdCodec: + return cls( + array_metadata=array_metadata, + configuration=codec_metadata.configuration, + ) + + def _compress(self, data: bytes) -> bytes: + ctx = ZstdCompressor( + level=self.configuration.level, write_checksum=self.configuration.checksum + ) + return ctx.compress(data) + + def _decompress(self, data: bytes) -> bytes: + ctx = ZstdDecompressor() + return ctx.decompress(data) + + async def decode( + self, + chunk_bytes: bytes, + ) -> BytesLike: + return await to_thread(self._decompress, chunk_bytes) + + async def encode( + self, + chunk_bytes: bytes, + ) -> Optional[BytesLike]: + return await to_thread(self._compress, chunk_bytes) + + def compute_encoded_size(self, _input_byte_length: int) -> int: + raise NotImplementedError + + +@frozen +class Crc32cCodec(BytesBytesCodec): + array_metadata: CoreArrayMetadata + is_fixed_size = True + + @classmethod + def from_metadata( + cls, codec_metadata: Crc32cCodecMetadata, array_metadata: CoreArrayMetadata + ) -> Crc32cCodec: + return cls(array_metadata=array_metadata) + + async def decode( + self, + chunk_bytes: bytes, + ) -> BytesLike: + crc32_bytes = chunk_bytes[-4:] + inner_bytes = chunk_bytes[:-4] + + assert np.uint32(crc32c(inner_bytes)).tobytes() == bytes(crc32_bytes) + return inner_bytes + + async def encode( + self, + chunk_bytes: 
bytes, + ) -> Optional[BytesLike]: + return chunk_bytes + np.uint32(crc32c(chunk_bytes)).tobytes() + + def compute_encoded_size(self, input_byte_length: int) -> int: + return input_byte_length + 4 + + +def blosc_codec( + typesize: int, + cname: Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] = "zstd", + clevel: int = 5, + shuffle: Literal["noshuffle", "shuffle", "bitshuffle"] = "noshuffle", + blocksize: int = 0, +) -> BloscCodecMetadata: + return BloscCodecMetadata( + configuration=BloscCodecConfigurationMetadata( + cname=cname, + clevel=clevel, + shuffle=shuffle, + blocksize=blocksize, + typesize=typesize, + ) + ) + + +def bytes_codec( + endian: Optional[Literal["big", "little"]] = "little" +) -> BytesCodecMetadata: + return BytesCodecMetadata(configuration=BytesCodecConfigurationMetadata(endian)) + + +def transpose_codec( + order: Union[Tuple[int, ...], Literal["C", "F"]] +) -> TransposeCodecMetadata: + return TransposeCodecMetadata( + configuration=TransposeCodecConfigurationMetadata(order) + ) + + +def gzip_codec(level: int = 5) -> GzipCodecMetadata: + return GzipCodecMetadata(configuration=GzipCodecConfigurationMetadata(level)) + + +def zstd_codec(level: int = 0, checksum: bool = False) -> ZstdCodecMetadata: + return ZstdCodecMetadata( + configuration=ZstdCodecConfigurationMetadata(level, checksum) + ) + + +def crc32c_codec() -> Crc32cCodecMetadata: + return Crc32cCodecMetadata() + + +def sharding_codec( + chunk_shape: Tuple[int, ...], + codecs: Optional[List[CodecMetadata]] = None, + index_codecs: Optional[List[CodecMetadata]] = None, +) -> ShardingCodecMetadata: + codecs = codecs or [bytes_codec()] + index_codecs = index_codecs or [bytes_codec(), crc32c_codec()] + return ShardingCodecMetadata( + configuration=ShardingCodecConfigurationMetadata( + chunk_shape, codecs, index_codecs + ) + ) diff --git a/zarr/zarrita/common.py b/zarr/zarrita/common.py new file mode 100644 index 0000000000..99b925bdaa --- /dev/null +++ b/zarr/zarrita/common.py @@ -0,0 +1,163 @@ +from __future__ import annotations + +import asyncio +import contextvars +import functools +from typing import ( + Any, + Awaitable, + Callable, + Dict, + List, + Literal, + Optional, + Tuple, + TypeVar, + Union, +) + +import numpy as np +from cattr import Converter + +ZARR_JSON = "zarr.json" +ZARRAY_JSON = ".zarray" +ZGROUP_JSON = ".zgroup" +ZATTRS_JSON = ".zattrs" + +BytesLike = Union[bytes, bytearray, memoryview] +ChunkCoords = Tuple[int, ...] +SliceSelection = Tuple[slice, ...] 
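+# a Selection addresses a region of an array: either a single slice or one
+# slice per dimension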
+Selection = Union[slice, SliceSelection] + + +def make_cattr(): + from zarrita.metadata import ( + BloscCodecMetadata, + BytesCodecMetadata, + ChunkKeyEncodingMetadata, + CodecMetadata, + Crc32cCodecMetadata, + DefaultChunkKeyEncodingMetadata, + GzipCodecMetadata, + ShardingCodecMetadata, + TransposeCodecMetadata, + V2ChunkKeyEncodingMetadata, + ZstdCodecMetadata, + ) + + converter = Converter() + + def _structure_chunk_key_encoding_metadata( + d: Dict[str, Any], _t + ) -> ChunkKeyEncodingMetadata: + if d["name"] == "default": + return converter.structure(d, DefaultChunkKeyEncodingMetadata) + if d["name"] == "v2": + return converter.structure(d, V2ChunkKeyEncodingMetadata) + raise KeyError + + converter.register_structure_hook( + ChunkKeyEncodingMetadata, _structure_chunk_key_encoding_metadata + ) + + def _structure_codec_metadata(d: Dict[str, Any], _t=None) -> CodecMetadata: + if d["name"] == "endian": + d["name"] = "bytes" + + if d["name"] == "blosc": + return converter.structure(d, BloscCodecMetadata) + if d["name"] == "bytes": + return converter.structure(d, BytesCodecMetadata) + if d["name"] == "transpose": + return converter.structure(d, TransposeCodecMetadata) + if d["name"] == "gzip": + return converter.structure(d, GzipCodecMetadata) + if d["name"] == "zstd": + return converter.structure(d, ZstdCodecMetadata) + if d["name"] == "sharding_indexed": + return converter.structure(d, ShardingCodecMetadata) + if d["name"] == "crc32c": + return converter.structure(d, Crc32cCodecMetadata) + raise KeyError + + converter.register_structure_hook(CodecMetadata, _structure_codec_metadata) + + converter.register_structure_hook_factory( + lambda t: str(t) == "ForwardRef('CodecMetadata')", + lambda t: _structure_codec_metadata, + ) + + def _structure_order(d: Any, _t=None) -> Union[Literal["C", "F"], Tuple[int, ...]]: + if d == "C": + return "C" + if d == "F": + return "F" + if isinstance(d, list): + return tuple(d) + raise KeyError + + converter.register_structure_hook_factory( + lambda t: str(t) + == "typing.Union[typing.Literal['C', 'F'], typing.Tuple[int, ...]]", + lambda t: _structure_order, + ) + + # Needed for v2 fill_value + def _structure_fill_value(d: Any, _t=None) -> Union[None, int, float]: + if d is None: + return None + try: + return int(d) + except ValueError: + pass + try: + return float(d) + except ValueError: + pass + raise ValueError + + converter.register_structure_hook_factory( + lambda t: str(t) == "typing.Union[NoneType, int, float]", + lambda t: _structure_fill_value, + ) + + # Needed for v2 dtype + converter.register_structure_hook( + np.dtype, + lambda d, _: np.dtype(d), + ) + + return converter + + +def product(tup: ChunkCoords) -> int: + return functools.reduce(lambda x, y: x * y, tup, 1) + + +T = TypeVar("T", bound=Tuple) +V = TypeVar("V") + + +async def concurrent_map( + items: List[T], func: Callable[..., Awaitable[V]], limit: Optional[int] = None +) -> List[V]: + if limit is None: + return await asyncio.gather(*[func(*item) for item in items]) + + else: + sem = asyncio.Semaphore(limit) + + async def run(item): + async with sem: + return await func(*item) + + return await asyncio.gather( + *[asyncio.ensure_future(run(item)) for item in items] + ) + + +async def to_thread(func, /, *args, **kwargs): + loop = asyncio.get_running_loop() + ctx = contextvars.copy_context() + func_call = functools.partial(ctx.run, func, *args, **kwargs) + return await loop.run_in_executor(None, func_call) diff --git a/zarr/zarrita/group.py b/zarr/zarrita/group.py new file mode 100644 index 
0000000000..6842589f58 --- /dev/null +++ b/zarr/zarrita/group.py @@ -0,0 +1,187 @@ +from __future__ import annotations + +import json +from typing import Any, Dict, Literal, Optional, Union + +from attr import asdict, evolve, field, frozen + +from zarrita.array import Array +from zarrita.common import ZARR_JSON, make_cattr +from zarrita.metadata import RuntimeConfiguration +from zarrita.store import StoreLike, StorePath, make_store_path +from zarrita.sync import sync + + +@frozen +class GroupMetadata: + attributes: Dict[str, Any] = field(factory=dict) + zarr_format: Literal[3] = 3 + node_type: Literal["group"] = "group" + + def to_bytes(self) -> bytes: + return json.dumps(asdict(self)).encode() + + @classmethod + def from_json(cls, zarr_json: Any) -> GroupMetadata: + return make_cattr().structure(zarr_json, GroupMetadata) + + +@frozen +class Group: + metadata: GroupMetadata + store_path: StorePath + runtime_configuration: RuntimeConfiguration + + @classmethod + async def create_async( + cls, + store: StoreLike, + *, + attributes: Optional[Dict[str, Any]] = None, + exists_ok: bool = False, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Group: + store_path = make_store_path(store) + if not exists_ok: + assert not await (store_path / ZARR_JSON).exists_async() + group = cls( + metadata=GroupMetadata(attributes=attributes or {}), + store_path=store_path, + runtime_configuration=runtime_configuration, + ) + await group._save_metadata() + return group + + @classmethod + def create( + cls, + store: StoreLike, + *, + attributes: Optional[Dict[str, Any]] = None, + exists_ok: bool = False, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Group: + return sync( + cls.create_async( + store, + attributes=attributes, + exists_ok=exists_ok, + runtime_configuration=runtime_configuration, + ), + runtime_configuration.asyncio_loop, + ) + + @classmethod + async def open_async( + cls, + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Group: + store_path = make_store_path(store) + zarr_json_bytes = await (store_path / ZARR_JSON).get_async() + assert zarr_json_bytes is not None + return cls.from_json( + store_path, json.loads(zarr_json_bytes), runtime_configuration + ) + + @classmethod + def open( + cls, + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Group: + return sync( + cls.open_async(store, runtime_configuration), + runtime_configuration.asyncio_loop, + ) + + @classmethod + def from_json( + cls, + store_path: StorePath, + zarr_json: Any, + runtime_configuration: RuntimeConfiguration, + ) -> Group: + group = cls( + metadata=GroupMetadata.from_json(zarr_json), + store_path=store_path, + runtime_configuration=runtime_configuration, + ) + return group + + @classmethod + async def open_or_array( + cls, + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Union[Array, Group]: + store_path = make_store_path(store) + zarr_json_bytes = await (store_path / ZARR_JSON).get_async() + if zarr_json_bytes is None: + raise KeyError + zarr_json = json.loads(zarr_json_bytes) + if zarr_json["node_type"] == "group": + return cls.from_json(store_path, zarr_json, runtime_configuration) + if zarr_json["node_type"] == "array": + return Array.from_json( + store_path, zarr_json, runtime_configuration=runtime_configuration + ) + raise KeyError + + async def _save_metadata(self) -> None: + await (self.store_path / 
ZARR_JSON).set_async(self.metadata.to_bytes()) + + async def get_async(self, path: str) -> Union[Array, Group]: + return await self.__class__.open_or_array( + self.store_path / path, self.runtime_configuration + ) + + def __getitem__(self, path: str) -> Union[Array, Group]: + return sync(self.get_async(path), self.runtime_configuration.asyncio_loop) + + async def create_group_async(self, path: str, **kwargs) -> Group: + runtime_configuration = kwargs.pop( + "runtime_configuration", self.runtime_configuration + ) + return await self.__class__.create_async( + self.store_path / path, + runtime_configuration=runtime_configuration, + **kwargs, + ) + + def create_group(self, path: str, **kwargs) -> Group: + return sync( + self.create_group_async(path), self.runtime_configuration.asyncio_loop + ) + + async def create_array_async(self, path: str, **kwargs) -> Array: + runtime_configuration = kwargs.pop( + "runtime_configuration", self.runtime_configuration + ) + return await Array.create_async( + self.store_path / path, + runtime_configuration=runtime_configuration, + **kwargs, + ) + + def create_array(self, path: str, **kwargs) -> Array: + return sync( + self.create_array_async(path, **kwargs), + self.runtime_configuration.asyncio_loop, + ) + + async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> Group: + new_metadata = evolve(self.metadata, attributes=new_attributes) + + # Write new metadata + await (self.store_path / ZARR_JSON).set_async(new_metadata.to_bytes()) + return evolve(self, metadata=new_metadata) + + def update_attributes(self, new_attributes: Dict[str, Any]) -> Group: + return sync( + self.update_attributes_async(new_attributes), + self.runtime_configuration.asyncio_loop, + ) + + def __repr__(self): + return f"<Group {self.store_path}>" diff --git a/zarr/zarrita/group_v2.py b/zarr/zarrita/group_v2.py new file mode 100644 index 0000000000..c4d0a4be12 --- /dev/null +++ b/zarr/zarrita/group_v2.py @@ -0,0 +1,228 @@ +from __future__ import annotations + +import asyncio +import json +from typing import TYPE_CHECKING, Any, Dict, Literal, Optional, Union + +from attr import asdict, evolve, frozen + +from zarrita.array_v2 import ArrayV2 +from zarrita.common import ZARRAY_JSON, ZATTRS_JSON, ZGROUP_JSON, make_cattr +from zarrita.metadata import RuntimeConfiguration +from zarrita.store import StoreLike, StorePath, make_store_path +from zarrita.sync import sync + +if TYPE_CHECKING: + from zarrita.group import Group + + +@frozen +class GroupV2Metadata: + zarr_format: Literal[2] = 2 + + def to_bytes(self) -> bytes: + return json.dumps(asdict(self)).encode() + + @classmethod + def from_json(cls, zarr_json: Any) -> GroupV2Metadata: + return make_cattr().structure(zarr_json, cls) + + +@frozen +class GroupV2: + metadata: GroupV2Metadata + store_path: StorePath + runtime_configuration: RuntimeConfiguration + attributes: Optional[Dict[str, Any]] = None + + @classmethod + async def create_async( + cls, + store: StoreLike, + *, + attributes: Optional[Dict[str, Any]] = None, + exists_ok: bool = False, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> GroupV2: + store_path = make_store_path(store) + if not exists_ok: + assert not await (store_path / ZGROUP_JSON).exists_async() + group = cls( + metadata=GroupV2Metadata(), + attributes=attributes, + store_path=store_path, + runtime_configuration=runtime_configuration, + ) + await group._save_metadata() + return group + + @classmethod + def create( + cls, + store: StoreLike, + *, + attributes: Optional[Dict[str, Any]] = None,
exists_ok: bool = False, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> GroupV2: + return sync( + cls.create_async( + store, + attributes=attributes, + exists_ok=exists_ok, + runtime_configuration=runtime_configuration, + ), + runtime_configuration.asyncio_loop if runtime_configuration else None, + ) + + @classmethod + async def open_async( + cls, + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> GroupV2: + store_path = make_store_path(store) + zgroup_bytes = await (store_path / ZGROUP_JSON).get_async() + assert zgroup_bytes is not None + zattrs_bytes = await (store_path / ZATTRS_JSON).get_async() + metadata = json.loads(zgroup_bytes) + attributes = json.loads(zattrs_bytes) if zattrs_bytes is not None else None + + return cls.from_json( + store_path, + metadata, + runtime_configuration, + attributes, + ) + + @classmethod + def open( + cls, + store_path: StorePath, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> GroupV2: + return sync( + cls.open_async(store_path, runtime_configuration), + runtime_configuration.asyncio_loop, + ) + + @classmethod + def from_json( + cls, + store_path: StorePath, + zarr_json: Any, + runtime_configuration: RuntimeConfiguration, + attributes: Optional[Dict[str, Any]] = None, + ) -> GroupV2: + group = cls( + metadata=GroupV2Metadata.from_json(zarr_json), + store_path=store_path, + runtime_configuration=runtime_configuration, + attributes=attributes, + ) + return group + + @staticmethod + async def open_or_array( + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Union[ArrayV2, GroupV2]: + store_path = make_store_path(store) + zgroup_bytes, zattrs_bytes = await asyncio.gather( + (store_path / ZGROUP_JSON).get_async(), + (store_path / ZATTRS_JSON).get_async(), + ) + attributes = json.loads(zattrs_bytes) if zattrs_bytes is not None else None + if zgroup_bytes is not None: + return GroupV2.from_json( + store_path, json.loads(zgroup_bytes), runtime_configuration, attributes + ) + zarray_bytes = await (store_path / ZARRAY_JSON).get_async() + if zarray_bytes is not None: + return ArrayV2.from_json( + store_path, json.loads(zarray_bytes), attributes, runtime_configuration + ) + raise KeyError + + async def _save_metadata(self) -> None: + await (self.store_path / ZGROUP_JSON).set_async(self.metadata.to_bytes()) + if self.attributes is not None and len(self.attributes) > 0: + await (self.store_path / ZATTRS_JSON).set_async( + json.dumps(self.attributes).encode(), + ) + else: + await (self.store_path / ZATTRS_JSON).delete_async() + + async def get_async(self, path: str) -> Union[ArrayV2, GroupV2]: + return await self.__class__.open_or_array( + self.store_path / path, self.runtime_configuration + ) + + def __getitem__(self, path: str) -> Union[ArrayV2, GroupV2]: + return sync(self.get_async(path), self.runtime_configuration.asyncio_loop) + + async def create_group_async(self, path: str, **kwargs) -> GroupV2: + runtime_configuration = kwargs.pop( + "runtime_configuration", self.runtime_configuration + ) + return await self.__class__.create_async( + self.store_path / path, + runtime_configuration=runtime_configuration, + **kwargs, + ) + + def create_group(self, path: str, **kwargs) -> GroupV2: + return sync( + self.create_group_async(path), self.runtime_configuration.asyncio_loop + ) + + async def create_array_async(self, path: str, **kwargs) -> ArrayV2: + runtime_configuration = kwargs.pop( + "runtime_configuration", 
self.runtime_configuration + ) + return await ArrayV2.create_async( + self.store_path / path, + runtime_configuration=runtime_configuration, + **kwargs, + ) + + def create_array(self, path: str, **kwargs) -> ArrayV2: + return sync( + self.create_array_async(path, **kwargs), + self.runtime_configuration.asyncio_loop, + ) + + async def convert_to_v3_async(self) -> Group: + from zarrita.common import ZARR_JSON + from zarrita.group import Group, GroupMetadata + + new_metadata = GroupMetadata(attributes=self.attributes or {}) + new_metadata_bytes = new_metadata.to_bytes() + + await (self.store_path / ZARR_JSON).set_async(new_metadata_bytes) + + return Group.from_json( + store_path=self.store_path, + zarr_json=json.loads(new_metadata_bytes), + runtime_configuration=self.runtime_configuration, + ) + + async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> GroupV2: + await (self.store_path / ZATTRS_JSON).set_async( + json.dumps(new_attributes).encode() + ) + return evolve(self, attributes=new_attributes) + + def update_attributes(self, new_attributes: Dict[str, Any]) -> GroupV2: + return sync( + self.update_attributes_async(new_attributes), + self.runtime_configuration.asyncio_loop, + ) + + def convert_to_v3(self) -> Group: + return sync( + self.convert_to_v3_async(), loop=self.runtime_configuration.asyncio_loop + ) + + def __repr__(self): + return f"<Group_v2 {self.store_path}>" diff --git a/zarr/zarrita/indexing.py b/zarr/zarrita/indexing.py new file mode 100644 index 0000000000..ce814f5c64 --- /dev/null +++ b/zarr/zarrita/indexing.py @@ -0,0 +1,219 @@ +from __future__ import annotations + +import itertools +import math +from typing import Iterator, List, NamedTuple, Optional, Tuple + +from zarrita.common import ChunkCoords, Selection, SliceSelection, product + + +def _ensure_tuple(v: Selection) -> SliceSelection: + if not isinstance(v, tuple): + v = (v,) + return v + + +def _err_too_many_indices(selection: SliceSelection, shape: ChunkCoords): + raise IndexError( + "too many indices for array; expected {}, got {}".format( + len(shape), len(selection) + ) + ) + + +def _err_negative_step(): + raise IndexError("only slices with step >= 1 are supported") + + +def _check_selection_length(selection: SliceSelection, shape: ChunkCoords): + if len(selection) > len(shape): + _err_too_many_indices(selection, shape) + + +def _ensure_selection( + selection: Selection, + shape: ChunkCoords, +) -> SliceSelection: + selection = _ensure_tuple(selection) + + # fill out selection if not completely specified + if len(selection) < len(shape): + selection += (slice(None),) * (len(shape) - len(selection)) + + # check selection not too long + _check_selection_length(selection, shape) + + return selection + + +class _ChunkDimProjection(NamedTuple): + dim_chunk_ix: int + dim_chunk_sel: slice + dim_out_sel: Optional[slice] + + +def _ceildiv(a, b): + return math.ceil(a / b) + + +class _SliceDimIndexer: + dim_sel: slice + dim_len: int + dim_chunk_len: int + nitems: int + + start: int + stop: int + step: int + + def __init__(self, dim_sel: slice, dim_len: int, dim_chunk_len: int): + self.start, self.stop, self.step = dim_sel.indices(dim_len) + if self.step < 1: + _err_negative_step() + + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nitems = max(0, _ceildiv((self.stop - self.start), self.step)) + self.nchunks = _ceildiv(self.dim_len, self.dim_chunk_len) + + def __iter__(self) -> Iterator[_ChunkDimProjection]: + # figure out the range of chunks we need to visit + dim_chunk_ix_from = self.start //
self.dim_chunk_len + dim_chunk_ix_to = _ceildiv(self.stop, self.dim_chunk_len) + + # iterate over chunks in range + for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to): + # compute offsets for chunk within overall array + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_limit = min(self.dim_len, (dim_chunk_ix + 1) * self.dim_chunk_len) + + # determine chunk length, accounting for trailing chunk + dim_chunk_len = dim_limit - dim_offset + + if self.start < dim_offset: + # selection starts before current chunk + dim_chunk_sel_start = 0 + remainder = (dim_offset - self.start) % self.step + if remainder: + dim_chunk_sel_start += self.step - remainder + # compute number of previous items, provides offset into output array + dim_out_offset = _ceildiv((dim_offset - self.start), self.step) + + else: + # selection starts within current chunk + dim_chunk_sel_start = self.start - dim_offset + dim_out_offset = 0 + + if self.stop > dim_limit: + # selection ends after current chunk + dim_chunk_sel_stop = dim_chunk_len + + else: + # selection ends within current chunk + dim_chunk_sel_stop = self.stop - dim_offset + + dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step) + dim_chunk_nitems = _ceildiv( + (dim_chunk_sel_stop - dim_chunk_sel_start), self.step + ) + dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) + + yield _ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +class _ChunkProjection(NamedTuple): + chunk_coords: ChunkCoords + chunk_selection: SliceSelection + out_selection: SliceSelection + + +class BasicIndexer: + dim_indexers: List[_SliceDimIndexer] + shape: ChunkCoords + + def __init__( + self, + selection: Selection, + shape: Tuple[int, ...], + chunk_shape: Tuple[int, ...], + ): + # setup per-dimension indexers + self.dim_indexers = [ + _SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + for dim_sel, dim_len, dim_chunk_len in zip( + _ensure_selection(selection, shape), shape, chunk_shape + ) + ] + self.shape = tuple(s.nitems for s in self.dim_indexers) + + def __iter__(self) -> Iterator[_ChunkProjection]: + for dim_projections in itertools.product(*self.dim_indexers): + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) + out_selection = tuple( + p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None + ) + + yield _ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +def morton_order_iter(chunk_shape: ChunkCoords) -> Iterator[ChunkCoords]: + def decode_morton(z: int, chunk_shape: ChunkCoords) -> ChunkCoords: + # Inspired by compressed morton code as implemented in Neuroglancer + # https://github.com/google/neuroglancer/blob/master/src/neuroglancer/datasource/precomputed/volume.md#compressed-morton-code + bits = tuple(math.ceil(math.log2(c)) for c in chunk_shape) + max_coords_bits = max(*bits) + input_bit = 0 + input_value = z + out = [0 for _ in range(len(chunk_shape))] + + for coord_bit in range(max_coords_bits): + for dim in range(len(chunk_shape)): + if coord_bit < bits[dim]: + bit = (input_value >> input_bit) & 1 + out[dim] |= bit << coord_bit + input_bit += 1 + return tuple(out) + + for i in range(product(chunk_shape)): + yield decode_morton(i, chunk_shape) + + +def c_order_iter(chunks_per_shard: ChunkCoords) -> Iterator[ChunkCoords]: + return itertools.product(*(range(x) for x in chunks_per_shard)) + + +def is_total_slice(item: Selection, shape: ChunkCoords): + """Determine whether `item` specifies a complete 
slice of array with the + given `shape`. Used to optimize __setitem__ operations on the Chunk + class.""" + + # N.B., assume shape is normalized + if item == slice(None): + return True + if isinstance(item, slice): + item = (item,) + if isinstance(item, tuple): + return all( + ( + isinstance(dim_sel, slice) + and ( + (dim_sel == slice(None)) + or ( + (dim_sel.stop - dim_sel.start == dim_len) + and (dim_sel.step in [1, None]) + ) + ) + ) + for dim_sel, dim_len in zip(item, shape) + ) + else: + raise TypeError("expected slice or tuple of slices, found %r" % item) + + +def all_chunk_coords( + shape: ChunkCoords, chunk_shape: ChunkCoords +) -> Iterator[ChunkCoords]: + return itertools.product( + *(range(0, _ceildiv(s, c)) for s, c in zip(shape, chunk_shape)) + ) diff --git a/zarr/zarrita/metadata.py b/zarr/zarrita/metadata.py new file mode 100644 index 0000000000..45922e1edd --- /dev/null +++ b/zarr/zarrita/metadata.py @@ -0,0 +1,344 @@ +from __future__ import annotations + +import json +from asyncio import AbstractEventLoop +from enum import Enum +from typing import Any, Dict, List, Literal, Optional, Tuple, Union + +import numpy as np +from attr import asdict, field, frozen + +from zarrita.common import ChunkCoords, make_cattr + + +@frozen +class RuntimeConfiguration: + order: Literal["C", "F"] = "C" + concurrency: Optional[int] = None + asyncio_loop: Optional[AbstractEventLoop] = None + + +def runtime_configuration( + order: Literal["C", "F"], concurrency: Optional[int] = None +) -> RuntimeConfiguration: + return RuntimeConfiguration(order=order, concurrency=concurrency) + + +class DataType(Enum): + bool = "bool" + int8 = "int8" + int16 = "int16" + int32 = "int32" + int64 = "int64" + uint8 = "uint8" + uint16 = "uint16" + uint32 = "uint32" + uint64 = "uint64" + float32 = "float32" + float64 = "float64" + + @property + def byte_count(self) -> int: + data_type_byte_counts = { + DataType.bool: 1, + DataType.int8: 1, + DataType.int16: 2, + DataType.int32: 4, + DataType.int64: 8, + DataType.uint8: 1, + DataType.uint16: 2, + DataType.uint32: 4, + DataType.uint64: 8, + DataType.float32: 4, + DataType.float64: 8, + } + return data_type_byte_counts[self] + + def to_numpy_shortname(self) -> str: + data_type_to_numpy = { + DataType.bool: "bool", + DataType.int8: "i1", + DataType.int16: "i2", + DataType.int32: "i4", + DataType.int64: "i8", + DataType.uint8: "u1", + DataType.uint16: "u2", + DataType.uint32: "u4", + DataType.uint64: "u8", + DataType.float32: "f4", + DataType.float64: "f8", + } + return data_type_to_numpy[self] + + +dtype_to_data_type = { + "|b1": "bool", + "bool": "bool", + "|i1": "int8", + "<i2": "int16", + "<i4": "int32", + "<i8": "int64", + "|u1": "uint8", + "<u2": "uint16", + "<u4": "uint32", + "<u8": "uint64", + "<f4": "float32", + "<f8": "float64", +} + + +@frozen +class RegularChunkGridConfigurationMetadata: + chunk_shape: ChunkCoords + + +@frozen +class RegularChunkGridMetadata: + configuration: RegularChunkGridConfigurationMetadata + name: Literal["regular"] = "regular" + + +@frozen +class DefaultChunkKeyEncodingConfigurationMetadata: + separator: Literal[".", "/"] = "/" + + +@frozen +class DefaultChunkKeyEncodingMetadata: + configuration: DefaultChunkKeyEncodingConfigurationMetadata = ( + DefaultChunkKeyEncodingConfigurationMetadata() + ) + name: Literal["default"] = "default" + + def decode_chunk_key(self, chunk_key: str) -> ChunkCoords: + if chunk_key == "c": + return () + return tuple(map(int, chunk_key[1:].split(self.configuration.separator))) + + def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: + return self.configuration.separator.join(map(str, ("c",) + chunk_coords)) + + +@frozen +class V2ChunkKeyEncodingConfigurationMetadata: + separator: Literal[".", "/"] = "."
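As an illustration (not part of the patch): a minimal sketch of how the two chunk-key encodings in this file map chunk coordinates to store keys, assuming `DefaultChunkKeyEncodingMetadata` as defined above and `V2ChunkKeyEncodingMetadata` as defined just below; the coordinate tuples are hypothetical.

    # The default (v3) encoding prefixes keys with "c"; v2 joins bare coordinates.
    default = DefaultChunkKeyEncodingMetadata()  # separator "/"
    assert default.encode_chunk_key((1, 2, 3)) == "c/1/2/3"
    assert default.encode_chunk_key(()) == "c"  # zero-dimensional array

    v2 = V2ChunkKeyEncodingMetadata()  # separator "."
    assert v2.encode_chunk_key((1, 2, 3)) == "1.2.3"
    assert v2.encode_chunk_key(()) == "0"  # an empty key falls back to "0"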
+ + +@frozen +class V2ChunkKeyEncodingMetadata: + configuration: V2ChunkKeyEncodingConfigurationMetadata = ( + V2ChunkKeyEncodingConfigurationMetadata() + ) + name: Literal["v2"] = "v2" + + def decode_chunk_key(self, chunk_key: str) -> ChunkCoords: + return tuple(map(int, chunk_key.split(self.configuration.separator))) + + def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: + chunk_identifier = self.configuration.separator.join(map(str, chunk_coords)) + return "0" if chunk_identifier == "" else chunk_identifier + + +ChunkKeyEncodingMetadata = Union[ + DefaultChunkKeyEncodingMetadata, V2ChunkKeyEncodingMetadata +] + + +BloscShuffle = Literal["noshuffle", "shuffle", "bitshuffle"] + + +@frozen +class BloscCodecConfigurationMetadata: + typesize: int + cname: Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] = "zstd" + clevel: int = 5 + shuffle: BloscShuffle = "noshuffle" + blocksize: int = 0 + + +blosc_shuffle_int_to_str: Dict[int, BloscShuffle] = { + 0: "noshuffle", + 1: "shuffle", + 2: "bitshuffle", +} + + +@frozen +class BloscCodecMetadata: + configuration: BloscCodecConfigurationMetadata + name: Literal["blosc"] = "blosc" + + +@frozen +class BytesCodecConfigurationMetadata: + endian: Optional[Literal["big", "little"]] = "little" + + +@frozen +class BytesCodecMetadata: + configuration: BytesCodecConfigurationMetadata + name: Literal["bytes"] = "bytes" + + +@frozen +class TransposeCodecConfigurationMetadata: + order: Union[Literal["C", "F"], Tuple[int, ...]] = "C" + + +@frozen +class TransposeCodecMetadata: + configuration: TransposeCodecConfigurationMetadata + name: Literal["transpose"] = "transpose" + + +@frozen +class GzipCodecConfigurationMetadata: + level: int = 5 + + +@frozen +class GzipCodecMetadata: + configuration: GzipCodecConfigurationMetadata + name: Literal["gzip"] = "gzip" + + +@frozen +class ZstdCodecConfigurationMetadata: + level: int = 0 + checksum: bool = False + + +@frozen +class ZstdCodecMetadata: + configuration: ZstdCodecConfigurationMetadata + name: Literal["zstd"] = "zstd" + + +@frozen +class Crc32cCodecMetadata: + name: Literal["crc32c"] = "crc32c" + + +@frozen +class ShardingCodecConfigurationMetadata: + chunk_shape: ChunkCoords + codecs: List["CodecMetadata"] + index_codecs: List["CodecMetadata"] + + +@frozen +class ShardingCodecMetadata: + configuration: ShardingCodecConfigurationMetadata + name: Literal["sharding_indexed"] = "sharding_indexed" + + +CodecMetadata = Union[ + BloscCodecMetadata, + BytesCodecMetadata, + TransposeCodecMetadata, + GzipCodecMetadata, + ZstdCodecMetadata, + ShardingCodecMetadata, + Crc32cCodecMetadata, +] + + +@frozen +class CoreArrayMetadata: + shape: ChunkCoords + chunk_shape: ChunkCoords + data_type: DataType + fill_value: Any + runtime_configuration: RuntimeConfiguration + + @property + def dtype(self) -> np.dtype: + return np.dtype(self.data_type.value) + + @property + def ndim(self) -> int: + return len(self.shape) + + +@frozen +class ArrayMetadata: + shape: ChunkCoords + data_type: DataType + chunk_grid: RegularChunkGridMetadata + chunk_key_encoding: ChunkKeyEncodingMetadata + fill_value: Any + codecs: List[CodecMetadata] + attributes: Dict[str, Any] = field(factory=dict) + dimension_names: Optional[Tuple[str, ...]] = None + zarr_format: Literal[3] = 3 + node_type: Literal["array"] = "array" + + @property + def dtype(self) -> np.dtype: + return np.dtype(self.data_type.value) + + @property + def ndim(self) -> int: + return len(self.shape) + + def get_core_metadata( + self, runtime_configuration: 
RuntimeConfiguration + ) -> CoreArrayMetadata: + return CoreArrayMetadata( + shape=self.shape, + chunk_shape=self.chunk_grid.configuration.chunk_shape, + data_type=self.data_type, + fill_value=self.fill_value, + runtime_configuration=runtime_configuration, + ) + + def to_bytes(self) -> bytes: + def _json_convert(o): + if isinstance(o, DataType): + return o.name + raise TypeError + + return json.dumps( + asdict( + self, + filter=lambda attr, value: attr.name != "dimension_names" + or value is not None, + ), + default=_json_convert, + ).encode() + + @classmethod + def from_json(cls, zarr_json: Any) -> ArrayMetadata: + return make_cattr().structure(zarr_json, cls) + + +@frozen +class ArrayV2Metadata: + shape: ChunkCoords + chunks: ChunkCoords + dtype: np.dtype + fill_value: Union[None, int, float] = 0 + order: Literal["C", "F"] = "C" + filters: Optional[List[Dict[str, Any]]] = None + dimension_separator: Literal[".", "/"] = "." + compressor: Optional[Dict[str, Any]] = None + zarr_format: Literal[2] = 2 + + @property + def ndim(self) -> int: + return len(self.shape) + + def to_bytes(self) -> bytes: + def _json_convert(o): + if isinstance(o, np.dtype): + if o.fields is None: + return o.str + else: + return o.descr + raise TypeError + + return json.dumps(asdict(self), default=_json_convert).encode() + + @classmethod + def from_json(cls, zarr_json: Any) -> ArrayV2Metadata: + return make_cattr().structure(zarr_json, cls) diff --git a/zarr/zarrita/sharding.py b/zarr/zarrita/sharding.py new file mode 100644 index 0000000000..283f06ea35 --- /dev/null +++ b/zarr/zarrita/sharding.py @@ -0,0 +1,541 @@ +from __future__ import annotations + +from typing import Iterator, List, Mapping, NamedTuple, Optional, Set, Tuple + +import numpy as np +from attrs import frozen + +from zarrita.codecs import ArrayBytesCodec, CodecPipeline +from zarrita.common import ( + BytesLike, + ChunkCoords, + SliceSelection, + concurrent_map, + product, +) +from zarrita.indexing import ( + BasicIndexer, + c_order_iter, + is_total_slice, + morton_order_iter, +) +from zarrita.metadata import ( + CoreArrayMetadata, + DataType, + ShardingCodecConfigurationMetadata, + ShardingCodecMetadata, +) +from zarrita.store import StorePath + +MAX_UINT_64 = 2**64 - 1 + + +class _ShardIndex(NamedTuple): + # dtype uint64, shape (chunks_per_shard_0, chunks_per_shard_1, ..., 2) + offsets_and_lengths: np.ndarray + + def _localize_chunk(self, chunk_coords: ChunkCoords) -> ChunkCoords: + return tuple( + chunk_i % shard_i + for chunk_i, shard_i in zip(chunk_coords, self.offsets_and_lengths.shape) + ) + + def is_all_empty(self) -> bool: + return bool(np.array_equiv(self.offsets_and_lengths, MAX_UINT_64)) + + def get_chunk_slice(self, chunk_coords: ChunkCoords) -> Optional[Tuple[int, int]]: + localized_chunk = self._localize_chunk(chunk_coords) + chunk_start, chunk_len = self.offsets_and_lengths[localized_chunk] + if (chunk_start, chunk_len) == (MAX_UINT_64, MAX_UINT_64): + return None + else: + return (int(chunk_start), int(chunk_start + chunk_len)) + + def set_chunk_slice( + self, chunk_coords: ChunkCoords, chunk_slice: Optional[slice] + ) -> None: + localized_chunk = self._localize_chunk(chunk_coords) + if chunk_slice is None: + self.offsets_and_lengths[localized_chunk] = (MAX_UINT_64, MAX_UINT_64) + else: + self.offsets_and_lengths[localized_chunk] = ( + chunk_slice.start, + chunk_slice.stop - chunk_slice.start, + ) + + def is_dense(self, chunk_byte_length: int) -> bool: + sorted_offsets_and_lengths = sorted( + [ + (offset, length) + for offset, length 
in self.offsets_and_lengths + if offset != MAX_UINT_64 + ], + key=lambda entry: entry[0], + ) + + # Are all non-empty offsets unique? + if len( + set( + offset + for offset, _ in sorted_offsets_and_lengths + if offset != MAX_UINT_64 + ) + ) != len(sorted_offsets_and_lengths): + return False + + return all( + offset % chunk_byte_length == 0 and length == chunk_byte_length + for offset, length in sorted_offsets_and_lengths + ) + + @classmethod + def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardIndex: + offsets_and_lengths = np.zeros(chunks_per_shard + (2,), dtype="<u8") + offsets_and_lengths.fill(MAX_UINT_64) + return cls(offsets_and_lengths) + + +class _ShardProxy(Mapping): + index: _ShardIndex + buf: BytesLike + + @classmethod + async def from_bytes(cls, buf: BytesLike, codec: ShardingCodec) -> _ShardProxy: + obj = cls() + obj.buf = memoryview(buf) + obj.index = await codec._decode_shard_index( + obj.buf[-codec._shard_index_size() :] + ) + return obj + + @classmethod + def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardProxy: + index = _ShardIndex.create_empty(chunks_per_shard) + obj = cls() + obj.buf = memoryview(b"") + obj.index = index + return obj + + def __getitem__(self, chunk_coords: ChunkCoords) -> Optional[BytesLike]: + chunk_byte_slice = self.index.get_chunk_slice(chunk_coords) + if chunk_byte_slice: + return self.buf[chunk_byte_slice[0] : chunk_byte_slice[1]] + return None + + def __len__(self) -> int: + return int(self.index.offsets_and_lengths.size / 2) + + def __iter__(self) -> Iterator[ChunkCoords]: + return c_order_iter(self.index.offsets_and_lengths.shape[:-1]) + + +class _ShardBuilder(_ShardProxy): + buf: bytearray + index: _ShardIndex + + @classmethod + def merge_with_morton_order( + cls, + chunks_per_shard: ChunkCoords, + tombstones: Set[ChunkCoords], + *shard_dicts: Mapping[ChunkCoords, BytesLike], + ) -> _ShardBuilder: + obj = cls.create_empty(chunks_per_shard) + for chunk_coords in morton_order_iter(chunks_per_shard): + if tombstones is not None and chunk_coords in tombstones: + continue + for shard_dict in shard_dicts: + maybe_value = shard_dict.get(chunk_coords, None) + if maybe_value is not None: + obj.append(chunk_coords, maybe_value) + break + return obj + + @classmethod + def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardBuilder: + obj = cls() + obj.buf = bytearray() + obj.index = _ShardIndex.create_empty(chunks_per_shard) + return obj + + def append(self, chunk_coords: ChunkCoords, value: BytesLike): + chunk_start = len(self.buf) + chunk_length = len(value) + self.buf.extend(value) + self.index.set_chunk_slice( + chunk_coords, slice(chunk_start, chunk_start + chunk_length) + ) + + def finalize(self, index_bytes: BytesLike) -> BytesLike: + self.buf.extend(index_bytes) + return self.buf + + +@frozen +class ShardingCodec(ArrayBytesCodec): + array_metadata: CoreArrayMetadata + configuration: ShardingCodecConfigurationMetadata + codec_pipeline: CodecPipeline + index_codec_pipeline: CodecPipeline + chunks_per_shard: Tuple[int, ...]
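For intuition (not part of the patch): the sharding codec packs many small inner chunks into one outer chunk, and `from_metadata` below derives the shard geometry from the two chunk shapes. A sketch of that arithmetic with hypothetical shapes:

    # A (64, 64) outer chunk (the shard) holding (16, 16) inner chunks.
    shard_shape = (64, 64)
    inner_chunk_shape = (16, 16)
    chunks_per_shard = tuple(s // c for s, c in zip(shard_shape, inner_chunk_shape))
    assert chunks_per_shard == (4, 4)

    # The shard index keeps one uint64 (offset, length) pair per inner chunk,
    # i.e. 16 bytes each before the index codecs run (cf. _shard_index_size).
    assert 16 * 4 * 4 == 256  # raw index bytes for this shard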
+ + @classmethod + def from_metadata( + cls, + codec_metadata: ShardingCodecMetadata, + array_metadata: CoreArrayMetadata, + ) -> ShardingCodec: + chunks_per_shard = tuple( + s // c + for s, c in zip( + array_metadata.chunk_shape, + codec_metadata.configuration.chunk_shape, + ) + ) + # rewriting the metadata to scope it to the shard + shard_metadata = CoreArrayMetadata( + shape=array_metadata.chunk_shape, + chunk_shape=codec_metadata.configuration.chunk_shape, + data_type=array_metadata.data_type, + fill_value=array_metadata.fill_value, + runtime_configuration=array_metadata.runtime_configuration, + ) + codec_pipeline = CodecPipeline.from_metadata( + codec_metadata.configuration.codecs, shard_metadata + ) + index_codec_pipeline = CodecPipeline.from_metadata( + codec_metadata.configuration.index_codecs, + CoreArrayMetadata( + shape=chunks_per_shard + (2,), + chunk_shape=chunks_per_shard + (2,), + data_type=DataType.uint64, + fill_value=MAX_UINT_64, + runtime_configuration=array_metadata.runtime_configuration, + ), + ) + return cls( + array_metadata=array_metadata, + configuration=codec_metadata.configuration, + codec_pipeline=codec_pipeline, + index_codec_pipeline=index_codec_pipeline, + chunks_per_shard=chunks_per_shard, + ) + + async def decode( + self, + shard_bytes: BytesLike, + ) -> np.ndarray: + # print("decode") + shard_shape = self.array_metadata.chunk_shape + chunk_shape = self.configuration.chunk_shape + + indexer = BasicIndexer( + tuple(slice(0, s) for s in shard_shape), + shape=shard_shape, + chunk_shape=chunk_shape, + ) + + # setup output array + out = np.zeros( + shard_shape, + dtype=self.array_metadata.dtype, + order=self.array_metadata.runtime_configuration.order, + ) + shard_dict = await _ShardProxy.from_bytes(shard_bytes, self) + + if shard_dict.index.is_all_empty(): + out.fill(self.array_metadata.fill_value) + return out + + # decoding chunks and writing them into the output buffer + await concurrent_map( + [ + ( + shard_dict, + chunk_coords, + chunk_selection, + out_selection, + out, + ) + for chunk_coords, chunk_selection, out_selection in indexer + ], + self._read_chunk, + self.array_metadata.runtime_configuration.concurrency, + ) + + return out + + async def decode_partial( + self, + store_path: StorePath, + selection: SliceSelection, + ) -> Optional[np.ndarray]: + # print("decode_partial") + shard_shape = self.array_metadata.chunk_shape + chunk_shape = self.configuration.chunk_shape + + indexer = BasicIndexer( + selection, + shape=shard_shape, + chunk_shape=chunk_shape, + ) + + # setup output array + out = np.zeros( + indexer.shape, + dtype=self.array_metadata.dtype, + order=self.array_metadata.runtime_configuration.order, + ) + + indexed_chunks = list(indexer) + all_chunk_coords = set(chunk_coords for chunk_coords, _, _ in indexed_chunks) + + # reading bytes of all requested chunks + shard_dict: Mapping[ChunkCoords, BytesLike] = {} + if self._is_total_shard(all_chunk_coords): + # read entire shard + shard_dict_maybe = await self._load_full_shard_maybe(store_path) + if shard_dict_maybe is None: + return None + shard_dict = shard_dict_maybe + else: + # read some chunks within the shard + shard_index = await self._load_shard_index_maybe(store_path) + if shard_index is None: + return None + shard_dict = {} + for chunk_coords in all_chunk_coords: + chunk_byte_slice = shard_index.get_chunk_slice(chunk_coords) + if chunk_byte_slice: + chunk_bytes = await store_path.get_async(chunk_byte_slice) + if chunk_bytes: + shard_dict[chunk_coords] = chunk_bytes + + # decoding chunks 
and writing them into the output buffer + await concurrent_map( + [ + ( + shard_dict, + chunk_coords, + chunk_selection, + out_selection, + out, + ) + for chunk_coords, chunk_selection, out_selection in indexed_chunks + ], + self._read_chunk, + self.array_metadata.runtime_configuration.concurrency, + ) + + return out + + async def _read_chunk( + self, + shard_dict: Mapping[ChunkCoords, Optional[BytesLike]], + chunk_coords: ChunkCoords, + chunk_selection: SliceSelection, + out_selection: SliceSelection, + out: np.ndarray, + ): + chunk_bytes = shard_dict.get(chunk_coords, None) + if chunk_bytes is not None: + chunk_array = await self.codec_pipeline.decode(chunk_bytes) + tmp = chunk_array[chunk_selection] + out[out_selection] = tmp + else: + out[out_selection] = self.array_metadata.fill_value + + async def encode( + self, + shard_array: np.ndarray, + ) -> Optional[BytesLike]: + shard_shape = self.array_metadata.chunk_shape + chunk_shape = self.configuration.chunk_shape + + indexer = list( + BasicIndexer( + tuple(slice(0, s) for s in shard_shape), + shape=shard_shape, + chunk_shape=chunk_shape, + ) + ) + + async def _write_chunk( + shard_array: np.ndarray, + chunk_coords: ChunkCoords, + chunk_selection: SliceSelection, + out_selection: SliceSelection, + ) -> Tuple[ChunkCoords, Optional[BytesLike]]: + if is_total_slice(chunk_selection, chunk_shape): + chunk_array = shard_array[out_selection] + else: + # handling writing partial chunks + chunk_array = np.empty( + chunk_shape, + dtype=self.array_metadata.dtype, + ) + chunk_array.fill(self.array_metadata.fill_value) + chunk_array[chunk_selection] = shard_array[out_selection] + if not np.array_equiv(chunk_array, self.array_metadata.fill_value): + return ( + chunk_coords, + await self.codec_pipeline.encode(chunk_array), + ) + return (chunk_coords, None) + + # assembling and encoding chunks within the shard + encoded_chunks: List[ + Tuple[ChunkCoords, Optional[BytesLike]] + ] = await concurrent_map( + [ + (shard_array, chunk_coords, chunk_selection, out_selection) + for chunk_coords, chunk_selection, out_selection in indexer + ], + _write_chunk, + self.array_metadata.runtime_configuration.concurrency, + ) + if len(encoded_chunks) == 0: + return None + + shard_builder = _ShardBuilder.create_empty(self.chunks_per_shard) + for chunk_coords, chunk_bytes in encoded_chunks: + if chunk_bytes is not None: + shard_builder.append(chunk_coords, chunk_bytes) + + return shard_builder.finalize( + await self._encode_shard_index(shard_builder.index) + ) + + async def encode_partial( + self, + store_path: StorePath, + shard_array: np.ndarray, + selection: SliceSelection, + ) -> None: + # print("encode_partial") + shard_shape = self.array_metadata.chunk_shape + chunk_shape = self.configuration.chunk_shape + + old_shard_dict = ( + await self._load_full_shard_maybe(store_path) + ) or _ShardProxy.create_empty(self.chunks_per_shard) + new_shard_builder = _ShardBuilder.create_empty(self.chunks_per_shard) + tombstones: Set[ChunkCoords] = set() + + indexer = list( + BasicIndexer( + selection, + shape=shard_shape, + chunk_shape=chunk_shape, + ) + ) + + async def _write_chunk( + chunk_coords: ChunkCoords, + chunk_selection: SliceSelection, + out_selection: SliceSelection, + ) -> Tuple[ChunkCoords, Optional[BytesLike]]: + chunk_array = None + if is_total_slice(chunk_selection, self.configuration.chunk_shape): + chunk_array = shard_array[out_selection] + else: + # handling writing partial chunks + # read chunk first + chunk_bytes = old_shard_dict.get(chunk_coords, None) + + # 
merge new value + if chunk_bytes is None: + chunk_array = np.empty( + self.configuration.chunk_shape, + dtype=self.array_metadata.dtype, + ) + chunk_array.fill(self.array_metadata.fill_value) + else: + chunk_array = ( + await self.codec_pipeline.decode(chunk_bytes) + ).copy() # make a writable copy + chunk_array[chunk_selection] = shard_array[out_selection] + + if not np.array_equiv(chunk_array, self.array_metadata.fill_value): + return ( + chunk_coords, + await self.codec_pipeline.encode(chunk_array), + ) + else: + return (chunk_coords, None) + + encoded_chunks: List[ + Tuple[ChunkCoords, Optional[BytesLike]] + ] = await concurrent_map( + [ + ( + chunk_coords, + chunk_selection, + out_selection, + ) + for chunk_coords, chunk_selection, out_selection in indexer + ], + _write_chunk, + self.array_metadata.runtime_configuration.concurrency, + ) + + for chunk_coords, chunk_bytes in encoded_chunks: + if chunk_bytes is not None: + new_shard_builder.append(chunk_coords, chunk_bytes) + else: + tombstones.add(chunk_coords) + + shard_builder = _ShardBuilder.merge_with_morton_order( + self.chunks_per_shard, tombstones, new_shard_builder, old_shard_dict + ) + + if shard_builder.index.is_all_empty(): + await store_path.delete_async() + else: + await store_path.set_async( + shard_builder.finalize( + await self._encode_shard_index(shard_builder.index) + ) + ) + + def _is_total_shard(self, all_chunk_coords: Set[ChunkCoords]) -> bool: + return len(all_chunk_coords) == product(self.chunks_per_shard) and all( + chunk_coords in all_chunk_coords + for chunk_coords in c_order_iter(self.chunks_per_shard) + ) + + async def _decode_shard_index(self, index_bytes: BytesLike) -> _ShardIndex: + return _ShardIndex(await self.index_codec_pipeline.decode(index_bytes)) + + async def _encode_shard_index(self, index: _ShardIndex) -> BytesLike: + index_bytes = await self.index_codec_pipeline.encode(index.offsets_and_lengths) + assert index_bytes is not None + return index_bytes + + def _shard_index_size(self) -> int: + return self.index_codec_pipeline.compute_encoded_size( + 16 * product(self.chunks_per_shard) + ) + + async def _load_shard_index_maybe( + self, store_path: StorePath + ) -> Optional[_ShardIndex]: + index_bytes = await store_path.get_async((-self._shard_index_size(), None)) + if index_bytes is not None: + return await self._decode_shard_index(index_bytes) + return None + + async def _load_shard_index(self, store_path: StorePath) -> _ShardIndex: + return ( + await self._load_shard_index_maybe(store_path) + ) or _ShardIndex.create_empty(self.chunks_per_shard) + + async def _load_full_shard_maybe( + self, store_path: StorePath + ) -> Optional[_ShardProxy]: + shard_bytes = await store_path.get_async() + + return await _ShardProxy.from_bytes(shard_bytes, self) if shard_bytes else None + + def compute_encoded_size(self, input_byte_length: int) -> int: + return input_byte_length + self._shard_index_size() diff --git a/zarr/zarrita/store.py b/zarr/zarrita/store.py new file mode 100644 index 0000000000..119650113d --- /dev/null +++ b/zarr/zarrita/store.py @@ -0,0 +1,307 @@ +from __future__ import annotations + +import asyncio +import io +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union + +import fsspec +from fsspec.asyn import AsyncFileSystem + +from zarrita.common import BytesLike, to_thread + +if TYPE_CHECKING: + from upath import UPath + + +def _dereference_path(root: str, path: str) -> str: + assert isinstance(root, str) + assert isinstance(path, str) + root = 
root.rstrip("/") + path = f"{root}/{path}" if root != "" else path + path = path.rstrip("/") + return path + + +class StorePath: + store: Store + path: str + + def __init__(self, store: Store, path: Optional[str] = None): + self.store = store + self.path = path or "" + + @classmethod + def from_path(cls, pth: Path) -> StorePath: + return cls(Store.from_path(pth)) + + async def get_async( + self, byte_range: Optional[Tuple[int, Optional[int]]] = None + ) -> Optional[BytesLike]: + return await self.store.get_async(self.path, byte_range) + + async def set_async( + self, value: BytesLike, byte_range: Optional[Tuple[int, int]] = None + ) -> None: + await self.store.set_async(self.path, value, byte_range) + + async def delete_async(self) -> None: + await self.store.delete_async(self.path) + + async def exists_async(self) -> bool: + return await self.store.exists_async(self.path) + + def __truediv__(self, other: str) -> StorePath: + return self.__class__(self.store, _dereference_path(self.path, other)) + + def __str__(self) -> str: + return _dereference_path(str(self.store), self.path) + + def __repr__(self) -> str: + return f"StorePath({self.store.__class__.__name__}, {repr(str(self))})" + + +class Store: + supports_partial_writes = False + + @classmethod + def from_path(cls, pth: Path) -> Store: + try: + from upath import UPath + from upath.implementations.local import PosixUPath, WindowsUPath + + if isinstance(pth, UPath) and not isinstance( + pth, (PosixUPath, WindowsUPath) + ): + storage_options = pth._kwargs.copy() + storage_options.pop("_url", None) + return RemoteStore(str(pth), **storage_options) + except ImportError: + pass + + return LocalStore(pth) + + async def multi_get_async( + self, keys: List[Tuple[str, Optional[Tuple[int, int]]]] + ) -> List[Optional[BytesLike]]: + return await asyncio.gather( + *[self.get_async(key, byte_range) for key, byte_range in keys] + ) + + async def get_async( + self, key: str, byte_range: Optional[Tuple[int, Optional[int]]] = None + ) -> Optional[BytesLike]: + raise NotImplementedError + + async def multi_set_async( + self, key_values: List[Tuple[str, BytesLike, Optional[Tuple[int, int]]]] + ) -> None: + await asyncio.gather( + *[ + self.set_async(key, value, byte_range) + for key, value, byte_range in key_values + ] + ) + + async def set_async( + self, key: str, value: BytesLike, byte_range: Optional[Tuple[int, int]] = None + ) -> None: + raise NotImplementedError + + async def delete_async(self, key: str) -> None: + raise NotImplementedError + + async def exists_async(self, key: str) -> bool: + raise NotImplementedError + + def __truediv__(self, other: str) -> StorePath: + return StorePath(self, other) + + +class LocalStore(Store): + supports_partial_writes = True + root: Path + auto_mkdir: bool + + def __init__(self, root: Union[Path, str], auto_mkdir: bool = True): + if isinstance(root, str): + root = Path(root) + assert isinstance(root, Path) + + self.root = root + self.auto_mkdir = auto_mkdir + + def _cat_file( + self, path: Path, start: Optional[int] = None, end: Optional[int] = None + ) -> BytesLike: + if start is None and end is None: + return path.read_bytes() + with path.open("rb") as f: + size = f.seek(0, io.SEEK_END) + if start is not None: + if start >= 0: + f.seek(start) + else: + f.seek(max(0, size + start)) + if end is not None: + if end < 0: + end = size + end + return f.read(end - f.tell()) + return f.read() + + def _put_file( + self, + path: Path, + value: BytesLike, + start: Optional[int] = None, + ): + if self.auto_mkdir: + 
path.parent.mkdir(parents=True, exist_ok=True) + if start is not None: + with path.open("r+b") as f: + f.seek(start) + f.write(value) + else: + return path.write_bytes(value) + + async def get_async( + self, key: str, byte_range: Optional[Tuple[int, Optional[int]]] = None + ) -> Optional[BytesLike]: + assert isinstance(key, str) + path = self.root / key + + try: + value = await ( + to_thread(self._cat_file, path, byte_range[0], byte_range[1]) + if byte_range is not None + else to_thread(self._cat_file, path) + ) + except (FileNotFoundError, IsADirectoryError, NotADirectoryError): + return None + + return value + + async def set_async( + self, key: str, value: BytesLike, byte_range: Optional[Tuple[int, int]] = None + ) -> None: + assert isinstance(key, str) + path = self.root / key + + if byte_range is not None: + await to_thread(self._put_file, path, value, byte_range[0]) + else: + await to_thread(self._put_file, path, value) + + async def delete_async(self, key: str) -> None: + path = self.root / key + await to_thread(path.unlink, True) + + async def exists_async(self, key: str) -> bool: + path = self.root / key + return await to_thread(path.exists) + + def __str__(self) -> str: + return f"file://{self.root}" + + def __repr__(self) -> str: + return f"LocalStore({repr(str(self))})" + + +class RemoteStore(Store): + root: UPath + + def __init__(self, url: Union[UPath, str], **storage_options: Dict[str, Any]): + from upath import UPath + + if isinstance(url, str): + self.root = UPath(url, **storage_options) + else: + assert len(storage_options) == 0, ( + "If constructed with a UPath object, no additional " + + "storage_options are allowed." + ) + self.root = url.rstrip("/") + # test instantiate file system + fs, _ = fsspec.core.url_to_fs( + str(self.root), asynchronous=True, **self.root._kwargs + ) + assert fs.__class__.async_impl, "FileSystem needs to support async operations." + + def make_fs(self) -> Tuple[AsyncFileSystem, str]: + storage_options = self.root._kwargs.copy() + storage_options.pop("_url", None) + fs, root = fsspec.core.url_to_fs( + str(self.root), asynchronous=True, **self.root._kwargs + ) + assert fs.__class__.async_impl, "FileSystem needs to support async operations." 
+ return fs, root + + async def get_async( + self, key: str, byte_range: Optional[Tuple[int, Optional[int]]] = None + ) -> Optional[BytesLike]: + assert isinstance(key, str) + fs, root = self.make_fs() + path = _dereference_path(root, key) + + try: + value = await ( + fs._cat_file(path, start=byte_range[0], end=byte_range[1]) + if byte_range + else fs._cat_file(path) + ) + except (FileNotFoundError, IsADirectoryError, NotADirectoryError): + return None + + return value + + async def set_async( + self, key: str, value: BytesLike, byte_range: Optional[Tuple[int, int]] = None + ) -> None: + assert isinstance(key, str) + fs, root = self.make_fs() + path = _dereference_path(root, key) + + # write data + if byte_range: + with fs._open(path, "r+b") as f: + f.seek(byte_range[0]) + f.write(value) + else: + await fs._pipe_file(path, value) + + async def delete_async(self, key: str) -> None: + fs, root = self.make_fs() + path = _dereference_path(root, key) + if await fs._exists(path): + await fs._rm(path) + + async def exists_async(self, key: str) -> bool: + fs, root = self.make_fs() + path = _dereference_path(root, key) + return await fs._exists(path) + + def __str__(self) -> str: + return str(self.root) + + def __repr__(self) -> str: + return f"RemoteStore({repr(str(self))})" + + +StoreLike = Union[Store, StorePath, Path, str] + + +def make_store_path(store_like: StoreLike) -> StorePath: + if isinstance(store_like, StorePath): + return store_like + elif isinstance(store_like, Store): + return StorePath(store_like) + elif isinstance(store_like, Path): + return StorePath(Store.from_path(store_like)) + elif isinstance(store_like, str): + try: + from upath import UPath + + return StorePath(Store.from_path(UPath(store_like))) + except ImportError: + return StorePath(LocalStore(Path(store_like))) + raise TypeError diff --git a/zarr/zarrita/sync.py b/zarr/zarrita/sync.py new file mode 100644 index 0000000000..6f33dd925b --- /dev/null +++ b/zarr/zarrita/sync.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +import asyncio +import threading +from typing import Any, Coroutine, List, Optional + +# From https://github.com/fsspec/filesystem_spec/blob/master/fsspec/asyn.py + +iothread: List[Optional[threading.Thread]] = [None] # dedicated IO thread +loop: List[Optional[asyncio.AbstractEventLoop]] = [ + None +] # global event loop for any non-async instance +_lock: Optional[threading.Lock] = None # global lock placeholder +get_running_loop = asyncio.get_running_loop + + +def _get_lock() -> threading.Lock: + """Allocate or return a threading lock. + + The lock is allocated on first use to allow setting one lock per forked process. + """ + global _lock + if not _lock: + _lock = threading.Lock() + return _lock + + +async def _runner( + event: threading.Event, coro: Coroutine, result_box: List[Optional[Any]] +): + try: + result_box[0] = await coro + except Exception as ex: + result_box[0] = ex + finally: + event.set() + + +def sync(coro: Coroutine, loop: Optional[asyncio.AbstractEventLoop] = None): + """ + Make loop run coroutine until it returns. 
Runs in other thread + + Examples + -------- + >>> sync(async_function(), existing_loop) + """ + if loop is None: + # NB: if the loop is not running *yet*, it is OK to submit work + # and we will wait for it + loop = _get_loop() + if loop is None or loop.is_closed(): + raise RuntimeError("Loop is not running") + try: + loop0 = asyncio.events.get_running_loop() + if loop0 is loop: + raise NotImplementedError("Calling sync() from within a running loop") + except RuntimeError: + pass + result_box: List[Optional[Any]] = [None] + event = threading.Event() + asyncio.run_coroutine_threadsafe(_runner(event, coro, result_box), loop) + while True: + # this loops allows thread to get interrupted + if event.wait(1): + break + + return_result = result_box[0] + if isinstance(return_result, BaseException): + raise return_result + else: + return return_result + + +def _get_loop(): + """Create or return the default fsspec IO loop + + The loop will be running on a separate thread. + """ + if loop[0] is None: + with _get_lock(): + # repeat the check just in case the loop got filled between the + # previous two calls from another thread + if loop[0] is None: + loop[0] = asyncio.new_event_loop() + th = threading.Thread(target=loop[0].run_forever, name="zarritaIO") + th.daemon = True + th.start() + iothread[0] = th + return loop[0] From a31a4e52e066abbbb77180ba1ca934440151cba5 Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Fri, 3 Nov 2023 12:38:35 -0700 Subject: [PATCH 02/12] apply zarr lint rules --- zarr/zarrita/__init__.py | 4 +-- zarr/zarrita/array.py | 16 +++-------- zarr/zarrita/array_v2.py | 53 +++++++++--------------------------- zarr/zarrita/codecs.py | 59 ++++++++++------------------------------ zarr/zarrita/common.py | 11 ++------ zarr/zarrita/group.py | 16 +++-------- zarr/zarrita/group_v2.py | 20 ++++---------- zarr/zarrita/indexing.py | 21 ++++---------- zarr/zarrita/metadata.py | 11 ++------ zarr/zarrita/sharding.py | 55 ++++++++++--------------------------- zarr/zarrita/store.py | 21 ++++---------- zarr/zarrita/sync.py | 4 +-- 12 files changed, 74 insertions(+), 217 deletions(-) diff --git a/zarr/zarrita/__init__.py b/zarr/zarrita/__init__.py index 22a96e0a75..bd65411825 100644 --- a/zarr/zarrita/__init__.py +++ b/zarr/zarrita/__init__.py @@ -25,9 +25,7 @@ async def open_auto_async( ) -> Union[Array, ArrayV2, Group, GroupV2]: store_path = make_store_path(store) try: - return await Group.open_or_array( - store_path, runtime_configuration=runtime_configuration_ - ) + return await Group.open_or_array(store_path, runtime_configuration=runtime_configuration_) except KeyError: return await GroupV2.open_or_array(store_path, runtime_configuration_) diff --git a/zarr/zarrita/array.py b/zarr/zarrita/array.py index 6ffd0b01e0..2c0fef0d61 100644 --- a/zarr/zarrita/array.py +++ b/zarr/zarrita/array.py @@ -90,9 +90,7 @@ async def create_async( assert not await (store_path / ZARR_JSON).exists_async() data_type = ( - DataType[dtype] - if isinstance(dtype, str) - else DataType[dtype_to_data_type[dtype.str]] + DataType[dtype] if isinstance(dtype, str) else DataType[dtype_to_data_type[dtype.str]] ) codecs = list(codecs) if codecs is not None else [bytes_codec()] @@ -107,9 +105,7 @@ async def create_async( shape=shape, data_type=data_type, chunk_grid=RegularChunkGridMetadata( - configuration=RegularChunkGridConfigurationMetadata( - chunk_shape=chunk_shape - ) + configuration=RegularChunkGridConfigurationMetadata(chunk_shape=chunk_shape) ), chunk_key_encoding=( V2ChunkKeyEncodingMetadata( @@ -437,9 +433,7 @@ 
async def _write_chunk( await self._write_chunk_to_store(store_path, chunk_array) - async def _write_chunk_to_store( - self, store_path: StorePath, chunk_array: np.ndarray - ): + async def _write_chunk_to_store(self, store_path: StorePath, chunk_array: np.ndarray): if np.all(chunk_array == self.metadata.fill_value): # chunks that only contain fill_value will be removed await store_path.delete_async() @@ -477,9 +471,7 @@ async def _delete_key(key: str) -> None: return evolve(self, metadata=new_metadata) def resize(self, new_shape: ChunkCoords) -> Array: - return sync( - self.resize_async(new_shape), self.runtime_configuration.asyncio_loop - ) + return sync(self.resize_async(new_shape), self.runtime_configuration.asyncio_loop) async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> Array: new_metadata = evolve(self.metadata, attributes=new_attributes) diff --git a/zarr/zarrita/array_v2.py b/zarr/zarrita/array_v2.py index b9ce51b29b..119a4920da 100644 --- a/zarr/zarrita/array_v2.py +++ b/zarr/zarrita/array_v2.py @@ -261,17 +261,13 @@ async def _read_chunk( else: out[out_selection] = self.metadata.fill_value - async def _decode_chunk( - self, chunk_bytes: Optional[BytesLike] - ) -> Optional[np.ndarray]: + async def _decode_chunk(self, chunk_bytes: Optional[BytesLike]) -> Optional[np.ndarray]: if chunk_bytes is None: return None if self.metadata.compressor is not None: compressor = numcodecs.get_codec(self.metadata.compressor) - chunk_array = ensure_ndarray( - await to_thread(compressor.decode, chunk_bytes) - ) + chunk_array = ensure_ndarray(await to_thread(compressor.decode, chunk_bytes)) else: chunk_array = ensure_ndarray(chunk_bytes) @@ -377,9 +373,7 @@ async def _write_chunk( await self._write_chunk_to_store(store_path, chunk_array) - async def _write_chunk_to_store( - self, store_path: StorePath, chunk_array: np.ndarray - ): + async def _write_chunk_to_store(self, store_path: StorePath, chunk_array: np.ndarray): chunk_bytes: Optional[BytesLike] if np.all(chunk_array == self.metadata.fill_value): # chunks that only contain fill_value will be removed @@ -401,23 +395,16 @@ async def _encode_chunk(self, chunk_array: np.ndarray) -> Optional[BytesLike]: if self.metadata.compressor is not None: compressor = numcodecs.get_codec(self.metadata.compressor) - if ( - not chunk_array.flags.c_contiguous - and not chunk_array.flags.f_contiguous - ): + if not chunk_array.flags.c_contiguous and not chunk_array.flags.f_contiguous: chunk_array = chunk_array.copy(order="A") - encoded_chunk_bytes = ensure_bytes( - await to_thread(compressor.encode, chunk_array) - ) + encoded_chunk_bytes = ensure_bytes(await to_thread(compressor.encode, chunk_array)) else: encoded_chunk_bytes = ensure_bytes(chunk_array) return encoded_chunk_bytes def _encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: - chunk_identifier = self.metadata.dimension_separator.join( - map(str, chunk_coords) - ) + chunk_identifier = self.metadata.dimension_separator.join(map(str, chunk_coords)) return "0" if chunk_identifier == "" else chunk_identifier async def resize_async(self, new_shape: ChunkCoords) -> ArrayV2: @@ -445,9 +432,7 @@ async def _delete_key(key: str) -> None: return evolve(self, metadata=new_metadata) def resize(self, new_shape: ChunkCoords) -> ArrayV2: - return sync( - self.resize_async(new_shape), self.runtime_configuration.asyncio_loop - ) + return sync(self.resize_async(new_shape), self.runtime_configuration.asyncio_loop) async def convert_to_v3_async(self) -> Array: from sys import byteorder as 
 sys_byteorder @@ -491,14 +476,10 @@ async def convert_to_v3_async(self) -> Array: if self.metadata.order == "F": codecs.append( - TransposeCodecMetadata( - configuration=TransposeCodecConfigurationMetadata(order="F") - ) + TransposeCodecMetadata(configuration=TransposeCodecConfigurationMetadata(order="F")) ) codecs.append( - BytesCodecMetadata( - configuration=BytesCodecConfigurationMetadata(endian=endian) - ) + BytesCodecMetadata(configuration=BytesCodecConfigurationMetadata(endian=endian)) ) if self.metadata.compressor is not None: @@ -523,9 +504,7 @@ async def convert_to_v3_async(self) -> Array: elif v2_codec["id"] == "gzip": codecs.append( GzipCodecMetadata( - configuration=GzipCodecConfigurationMetadata( - level=v2_codec.get("level", 5) - ) + configuration=GzipCodecConfigurationMetadata(level=v2_codec.get("level", 5)) ) ) @@ -537,9 +516,7 @@ async def convert_to_v3_async(self) -> Array: ) ), data_type=data_type, - fill_value=0 - if self.metadata.fill_value is None - else self.metadata.fill_value, + fill_value=0 if self.metadata.fill_value is None else self.metadata.fill_value, chunk_key_encoding=V2ChunkKeyEncodingMetadata( configuration=V2ChunkKeyEncodingConfigurationMetadata( separator=self.metadata.dimension_separator @@ -559,9 +536,7 @@ async def convert_to_v3_async(self) -> Array: ) async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> ArrayV2: - await (self.store_path / ZATTRS_JSON).set_async( - json.dumps(new_attributes).encode() - ) + await (self.store_path / ZATTRS_JSON).set_async(json.dumps(new_attributes).encode()) return evolve(self, attributes=new_attributes) def update_attributes(self, new_attributes: Dict[str, Any]) -> ArrayV2: @@ -571,9 +546,7 @@ def update_attributes(self, new_attributes: Dict[str, Any]) -> ArrayV2: ) def convert_to_v3(self) -> Array: - return sync( - self.convert_to_v3_async(), loop=self.runtime_configuration.asyncio_loop - ) + return sync(self.convert_to_v3_async(), loop=self.runtime_configuration.asyncio_loop) def __repr__(self): return f"<Array_v2 {self.store_path}>" diff --git a/zarr/zarrita/codecs.py b/zarr/zarrita/codecs.py index 56f99e3e06..5dbb26e137 100644 --- a/zarr/zarrita/codecs.py +++ b/zarr/zarrita/codecs.py @@ -141,9 +141,7 @@ def from_metadata( return cls(out) @staticmethod - def _validate_codecs( - codecs: List[Codec], array_metadata: CoreArrayMetadata - ) -> None: + def _validate_codecs(codecs: List[Codec], array_metadata: CoreArrayMetadata) -> None: from zarrita.sharding import ShardingCodec assert any( @@ -180,9 +178,7 @@ def _validate_codecs( ) if isinstance(codec, ShardingCodec): - assert len(codec.configuration.chunk_shape) == len( - array_metadata.shape - ), ( + assert len(codec.configuration.chunk_shape) == len(array_metadata.shape), ( "The shard's `chunk_shape` and array's `shape` need to have the " + "same number of dimensions." ) @@ -198,10 +194,7 @@ def _validate_codecs( ) prev_codec = codec - if ( - any(isinstance(codec, ShardingCodec) for codec in codecs) - and len(codecs) > 1 - ): + if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(codecs) > 1: warn( "Combining a `sharding_indexed` codec disables partial reads and " + "writes, which may lead to inefficient performance."
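To make the validation rules above concrete (not part of the patch): a sketch of a codec metadata list assembled with the helper constructors that appear further down in this file (`transpose_codec`, `bytes_codec`, `gzip_codec`):

    # _validate_codecs enforces this ordering: array->array codecs first,
    # then exactly one array->bytes codec, then bytes->bytes codecs.
    codecs = [
        transpose_codec("F"),   # array->array: store in Fortran order
        bytes_codec("little"),  # array->bytes: fix the byte order
        gzip_codec(level=5),    # bytes->bytes: compress the encoded bytes
    ]

Such a list, together with the core array metadata, is what `CodecPipeline.from_metadata` consumes.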
@@ -211,9 +204,7 @@ def _array_array_codecs(self) -> List[ArrayArrayCodec]: return [codec for codec in self.codecs if isinstance(codec, ArrayArrayCodec)] def _array_bytes_codec(self) -> ArrayBytesCodec: - return next( - codec for codec in self.codecs if isinstance(codec, ArrayBytesCodec) - ) + return next(codec for codec in self.codecs if isinstance(codec, ArrayBytesCodec)) def _bytes_bytes_codecs(self) -> List[BytesBytesCodec]: return [codec for codec in self.codecs if isinstance(codec, BytesBytesCodec)] @@ -250,9 +241,7 @@ async def encode(self, chunk_array: np.ndarray) -> Optional[BytesLike]: return chunk_bytes def compute_encoded_size(self, byte_length: int) -> int: - return reduce( - lambda acc, codec: codec.compute_encoded_size(acc), self.codecs, byte_length - ) + return reduce(lambda acc, codec: codec.compute_encoded_size(acc), self.codecs, byte_length) @frozen @@ -268,9 +257,7 @@ def from_metadata( ) -> BloscCodec: configuration = codec_metadata.configuration if configuration.typesize == 0: - configuration = evolve( - configuration, typesize=array_metadata.data_type.byte_count - ) + configuration = evolve(configuration, typesize=array_metadata.data_type.byte_count) config_dict = asdict(codec_metadata.configuration) config_dict.pop("typesize", None) map_shuffle_str_to_int = {"noshuffle": 0, "shuffle": 1, "bitshuffle": 2} @@ -309,8 +296,7 @@ def from_metadata( cls, codec_metadata: BytesCodecMetadata, array_metadata: CoreArrayMetadata ) -> BytesCodec: assert ( - array_metadata.dtype.itemsize == 1 - or codec_metadata.configuration.endian is not None + array_metadata.dtype.itemsize == 1 or codec_metadata.configuration.endian is not None ), "The `endian` configuration needs to be specified for multi-byte data types." return cls( array_metadata=array_metadata, @@ -336,9 +322,7 @@ async def decode( prefix = "<" else: prefix = ">" - dtype = np.dtype( - f"{prefix}{self.array_metadata.data_type.to_numpy_shortname()}" - ) + dtype = np.dtype(f"{prefix}{self.array_metadata.data_type.to_numpy_shortname()}") else: dtype = np.dtype(f"|{self.array_metadata.data_type.to_numpy_shortname()}") chunk_array = np.frombuffer(chunk_bytes, dtype) @@ -377,9 +361,7 @@ def from_metadata( ) -> TransposeCodec: configuration = codec_metadata.configuration if configuration.order == "F": - order = tuple( - array_metadata.ndim - x - 1 for x in range(array_metadata.ndim) - ) + order = tuple(array_metadata.ndim - x - 1 for x in range(array_metadata.ndim)) elif configuration.order == "C": order = tuple(range(array_metadata.ndim)) @@ -409,8 +391,7 @@ def resolve_metadata(self) -> CoreArrayMetadata: return CoreArrayMetadata( shape=tuple( - self.array_metadata.shape[self.order[i]] - for i in range(self.array_metadata.ndim) + self.array_metadata.shape[self.order[i]] for i in range(self.array_metadata.ndim) ), chunk_shape=tuple( self.array_metadata.chunk_shape[self.order[i]] @@ -563,18 +544,12 @@ def blosc_codec( ) -def bytes_codec( - endian: Optional[Literal["big", "little"]] = "little" -) -> BytesCodecMetadata: +def bytes_codec(endian: Optional[Literal["big", "little"]] = "little") -> BytesCodecMetadata: return BytesCodecMetadata(configuration=BytesCodecConfigurationMetadata(endian)) -def transpose_codec( - order: Union[Tuple[int, ...], Literal["C", "F"]] -) -> TransposeCodecMetadata: - return TransposeCodecMetadata( - configuration=TransposeCodecConfigurationMetadata(order) - ) +def transpose_codec(order: Union[Tuple[int, ...], Literal["C", "F"]]) -> TransposeCodecMetadata: + return 
TransposeCodecMetadata(configuration=TransposeCodecConfigurationMetadata(order)) def gzip_codec(level: int = 5) -> GzipCodecMetadata: @@ -582,9 +557,7 @@ def gzip_codec(level: int = 5) -> GzipCodecMetadata: def zstd_codec(level: int = 0, checksum: bool = False) -> ZstdCodecMetadata: - return ZstdCodecMetadata( - configuration=ZstdCodecConfigurationMetadata(level, checksum) - ) + return ZstdCodecMetadata(configuration=ZstdCodecConfigurationMetadata(level, checksum)) def crc32c_codec() -> Crc32cCodecMetadata: @@ -599,7 +572,5 @@ def sharding_codec( codecs = codecs or [bytes_codec()] index_codecs = index_codecs or [bytes_codec(), crc32c_codec()] return ShardingCodecMetadata( - configuration=ShardingCodecConfigurationMetadata( - chunk_shape, codecs, index_codecs - ) + configuration=ShardingCodecConfigurationMetadata(chunk_shape, codecs, index_codecs) ) diff --git a/zarr/zarrita/common.py b/zarr/zarrita/common.py index 99b925bdaa..e906c50b9e 100644 --- a/zarr/zarrita/common.py +++ b/zarr/zarrita/common.py @@ -47,9 +47,7 @@ def make_cattr(): converter = Converter() - def _structure_chunk_key_encoding_metadata( - d: Dict[str, Any], _t - ) -> ChunkKeyEncodingMetadata: + def _structure_chunk_key_encoding_metadata(d: Dict[str, Any], _t) -> ChunkKeyEncodingMetadata: if d["name"] == "default": return converter.structure(d, DefaultChunkKeyEncodingMetadata) if d["name"] == "v2": @@ -97,8 +95,7 @@ def _structure_order(d: Any, _t=None) -> Union[Literal["C", "F"], Tuple[int, ... raise KeyError converter.register_structure_hook_factory( - lambda t: str(t) - == "typing.Union[typing.Literal['C', 'F'], typing.Tuple[int, ...]]", + lambda t: str(t) == "typing.Union[typing.Literal['C', 'F'], typing.Tuple[int, ...]]", lambda t: _structure_order, ) @@ -151,9 +148,7 @@ async def run(item): async with sem: return await func(*item) - return await asyncio.gather( - *[asyncio.ensure_future(run(item)) for item in items] - ) + return await asyncio.gather(*[asyncio.ensure_future(run(item)) for item in items]) async def to_thread(func, /, *args, **kwargs): diff --git a/zarr/zarrita/group.py b/zarr/zarrita/group.py index 6842589f58..ed5faca911 100644 --- a/zarr/zarrita/group.py +++ b/zarr/zarrita/group.py @@ -80,9 +80,7 @@ async def open_async( store_path = make_store_path(store) zarr_json_bytes = await (store_path / ZARR_JSON).get_async() assert zarr_json_bytes is not None - return cls.from_json( - store_path, json.loads(zarr_json_bytes), runtime_configuration - ) + return cls.from_json(store_path, json.loads(zarr_json_bytes), runtime_configuration) @classmethod def open( @@ -140,9 +138,7 @@ def __getitem__(self, path: str) -> Union[Array, Group]: return sync(self.get_async(path), self.runtime_configuration.asyncio_loop) async def create_group_async(self, path: str, **kwargs) -> Group: - runtime_configuration = kwargs.pop( - "runtime_configuration", self.runtime_configuration - ) + runtime_configuration = kwargs.pop("runtime_configuration", self.runtime_configuration) return await self.__class__.create_async( self.store_path / path, runtime_configuration=runtime_configuration, @@ -150,14 +146,10 @@ async def create_group_async(self, path: str, **kwargs) -> Group: ) def create_group(self, path: str, **kwargs) -> Group: - return sync( - self.create_group_async(path), self.runtime_configuration.asyncio_loop - ) + return sync(self.create_group_async(path), self.runtime_configuration.asyncio_loop) async def create_array_async(self, path: str, **kwargs) -> Array: - runtime_configuration = kwargs.pop( - "runtime_configuration", 
self.runtime_configuration - ) + runtime_configuration = kwargs.pop("runtime_configuration", self.runtime_configuration) return await Array.create_async( self.store_path / path, runtime_configuration=runtime_configuration, diff --git a/zarr/zarrita/group_v2.py b/zarr/zarrita/group_v2.py index c4d0a4be12..e8380c9c9b 100644 --- a/zarr/zarrita/group_v2.py +++ b/zarr/zarrita/group_v2.py @@ -162,9 +162,7 @@ def __getitem__(self, path: str) -> Union[ArrayV2, GroupV2]: return sync(self.get_async(path), self.runtime_configuration.asyncio_loop) async def create_group_async(self, path: str, **kwargs) -> GroupV2: - runtime_configuration = kwargs.pop( - "runtime_configuration", self.runtime_configuration - ) + runtime_configuration = kwargs.pop("runtime_configuration", self.runtime_configuration) return await self.__class__.create_async( self.store_path / path, runtime_configuration=runtime_configuration, @@ -172,14 +170,10 @@ async def create_group_async(self, path: str, **kwargs) -> GroupV2: ) def create_group(self, path: str, **kwargs) -> GroupV2: - return sync( - self.create_group_async(path), self.runtime_configuration.asyncio_loop - ) + return sync(self.create_group_async(path), self.runtime_configuration.asyncio_loop) async def create_array_async(self, path: str, **kwargs) -> ArrayV2: - runtime_configuration = kwargs.pop( - "runtime_configuration", self.runtime_configuration - ) + runtime_configuration = kwargs.pop("runtime_configuration", self.runtime_configuration) return await ArrayV2.create_async( self.store_path / path, runtime_configuration=runtime_configuration, @@ -208,9 +202,7 @@ async def convert_to_v3_async(self) -> Group: ) async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> GroupV2: - await (self.store_path / ZATTRS_JSON).set_async( - json.dumps(new_attributes).encode() - ) + await (self.store_path / ZATTRS_JSON).set_async(json.dumps(new_attributes).encode()) return evolve(self, attributes=new_attributes) def update_attributes(self, new_attributes: Dict[str, Any]) -> GroupV2: @@ -220,9 +212,7 @@ def update_attributes(self, new_attributes: Dict[str, Any]) -> GroupV2: ) def convert_to_v3(self) -> Group: - return sync( - self.convert_to_v3_async(), loop=self.runtime_configuration.asyncio_loop - ) + return sync(self.convert_to_v3_async(), loop=self.runtime_configuration.asyncio_loop) def __repr__(self): return f"" diff --git a/zarr/zarrita/indexing.py b/zarr/zarrita/indexing.py index ce814f5c64..df147a843c 100644 --- a/zarr/zarrita/indexing.py +++ b/zarr/zarrita/indexing.py @@ -15,9 +15,7 @@ def _ensure_tuple(v: Selection) -> SliceSelection: def _err_too_many_indices(selection: SliceSelection, shape: ChunkCoords): raise IndexError( - "too many indices for array; expected {}, got {}".format( - len(shape), len(selection) - ) + "too many indices for array; expected {}, got {}".format(len(shape), len(selection)) ) @@ -113,9 +111,7 @@ def __iter__(self) -> Iterator[_ChunkDimProjection]: dim_chunk_sel_stop = self.stop - dim_offset dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step) - dim_chunk_nitems = _ceildiv( - (dim_chunk_sel_stop - dim_chunk_sel_start), self.step - ) + dim_chunk_nitems = _ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start), self.step) dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) yield _ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) @@ -199,10 +195,7 @@ def is_total_slice(item: Selection, shape: ChunkCoords): isinstance(dim_sel, slice) and ( (dim_sel == slice(None)) - or ( - (dim_sel.stop 
- dim_sel.start == dim_len) - and (dim_sel.step in [1, None]) - ) + or ((dim_sel.stop - dim_sel.start == dim_len) and (dim_sel.step in [1, None])) ) ) for dim_sel, dim_len in zip(item, shape) @@ -211,9 +204,5 @@ def is_total_slice(item: Selection, shape: ChunkCoords): raise TypeError("expected slice or tuple of slices, found %r" % item) -def all_chunk_coords( - shape: ChunkCoords, chunk_shape: ChunkCoords -) -> Iterator[ChunkCoords]: - return itertools.product( - *(range(0, _ceildiv(s, c)) for s, c in zip(shape, chunk_shape)) - ) +def all_chunk_coords(shape: ChunkCoords, chunk_shape: ChunkCoords) -> Iterator[ChunkCoords]: + return itertools.product(*(range(0, _ceildiv(s, c)) for s, c in zip(shape, chunk_shape))) diff --git a/zarr/zarrita/metadata.py b/zarr/zarrita/metadata.py index 45922e1edd..80f305c0a4 100644 --- a/zarr/zarrita/metadata.py +++ b/zarr/zarrita/metadata.py @@ -139,9 +139,7 @@ def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: return "0" if chunk_identifier == "" else chunk_identifier -ChunkKeyEncodingMetadata = Union[ - DefaultChunkKeyEncodingMetadata, V2ChunkKeyEncodingMetadata -] +ChunkKeyEncodingMetadata = Union[DefaultChunkKeyEncodingMetadata, V2ChunkKeyEncodingMetadata] BloscShuffle = Literal["noshuffle", "shuffle", "bitshuffle"] @@ -281,9 +279,7 @@ def dtype(self) -> np.dtype: def ndim(self) -> int: return len(self.shape) - def get_core_metadata( - self, runtime_configuration: RuntimeConfiguration - ) -> CoreArrayMetadata: + def get_core_metadata(self, runtime_configuration: RuntimeConfiguration) -> CoreArrayMetadata: return CoreArrayMetadata( shape=self.shape, chunk_shape=self.chunk_grid.configuration.chunk_shape, @@ -301,8 +297,7 @@ def _json_convert(o): return json.dumps( asdict( self, - filter=lambda attr, value: attr.name != "dimension_names" - or value is not None, + filter=lambda attr, value: attr.name != "dimension_names" or value is not None, ), default=_json_convert, ).encode() diff --git a/zarr/zarrita/sharding.py b/zarr/zarrita/sharding.py index 283f06ea35..363d62f1af 100644 --- a/zarr/zarrita/sharding.py +++ b/zarr/zarrita/sharding.py @@ -51,9 +51,7 @@ def get_chunk_slice(self, chunk_coords: ChunkCoords) -> Optional[Tuple[int, int] else: return (int(chunk_start), int(chunk_start + chunk_len)) - def set_chunk_slice( - self, chunk_coords: ChunkCoords, chunk_slice: Optional[slice] - ) -> None: + def set_chunk_slice(self, chunk_coords: ChunkCoords, chunk_slice: Optional[slice]) -> None: localized_chunk = self._localize_chunk(chunk_coords) if chunk_slice is None: self.offsets_and_lengths[localized_chunk] = (MAX_UINT_64, MAX_UINT_64) @@ -75,11 +73,7 @@ def is_dense(self, chunk_byte_length: int) -> bool: # Are all non-empty offsets unique? 
if len( - set( - offset - for offset, _ in sorted_offsets_and_lengths - if offset != MAX_UINT_64 - ) + set(offset for offset, _ in sorted_offsets_and_lengths if offset != MAX_UINT_64) ) != len(sorted_offsets_and_lengths): return False @@ -103,9 +97,7 @@ class _ShardProxy(Mapping): async def from_bytes(cls, buf: BytesLike, codec: ShardingCodec) -> _ShardProxy: obj = cls() obj.buf = memoryview(buf) - obj.index = await codec._decode_shard_index( - obj.buf[-codec._shard_index_size() :] - ) + obj.index = await codec._decode_shard_index(obj.buf[-codec._shard_index_size() :]) return obj @classmethod @@ -162,9 +154,7 @@ def append(self, chunk_coords: ChunkCoords, value: BytesLike): chunk_start = len(self.buf) chunk_length = len(value) self.buf.extend(value) - self.index.set_chunk_slice( - chunk_coords, slice(chunk_start, chunk_start + chunk_length) - ) + self.index.set_chunk_slice(chunk_coords, slice(chunk_start, chunk_start + chunk_length)) def finalize(self, index_bytes: BytesLike) -> BytesLike: self.buf.extend(index_bytes) @@ -384,9 +374,7 @@ async def _write_chunk( return (chunk_coords, None) # assembling and encoding chunks within the shard - encoded_chunks: List[ - Tuple[ChunkCoords, Optional[BytesLike]] - ] = await concurrent_map( + encoded_chunks: List[Tuple[ChunkCoords, Optional[BytesLike]]] = await concurrent_map( [ (shard_array, chunk_coords, chunk_selection, out_selection) for chunk_coords, chunk_selection, out_selection in indexer @@ -402,9 +390,7 @@ async def _write_chunk( if chunk_bytes is not None: shard_builder.append(chunk_coords, chunk_bytes) - return shard_builder.finalize( - await self._encode_shard_index(shard_builder.index) - ) + return shard_builder.finalize(await self._encode_shard_index(shard_builder.index)) async def encode_partial( self, @@ -464,9 +450,7 @@ async def _write_chunk( else: return (chunk_coords, None) - encoded_chunks: List[ - Tuple[ChunkCoords, Optional[BytesLike]] - ] = await concurrent_map( + encoded_chunks: List[Tuple[ChunkCoords, Optional[BytesLike]]] = await concurrent_map( [ ( chunk_coords, @@ -493,15 +477,12 @@ async def _write_chunk( await store_path.delete_async() else: await store_path.set_async( - shard_builder.finalize( - await self._encode_shard_index(shard_builder.index) - ) + shard_builder.finalize(await self._encode_shard_index(shard_builder.index)) ) def _is_total_shard(self, all_chunk_coords: Set[ChunkCoords]) -> bool: return len(all_chunk_coords) == product(self.chunks_per_shard) and all( - chunk_coords in all_chunk_coords - for chunk_coords in c_order_iter(self.chunks_per_shard) + chunk_coords in all_chunk_coords for chunk_coords in c_order_iter(self.chunks_per_shard) ) async def _decode_shard_index(self, index_bytes: BytesLike) -> _ShardIndex: @@ -513,26 +494,20 @@ async def _encode_shard_index(self, index: _ShardIndex) -> BytesLike: return index_bytes def _shard_index_size(self) -> int: - return self.index_codec_pipeline.compute_encoded_size( - 16 * product(self.chunks_per_shard) - ) + return self.index_codec_pipeline.compute_encoded_size(16 * product(self.chunks_per_shard)) - async def _load_shard_index_maybe( - self, store_path: StorePath - ) -> Optional[_ShardIndex]: + async def _load_shard_index_maybe(self, store_path: StorePath) -> Optional[_ShardIndex]: index_bytes = await store_path.get_async((-self._shard_index_size(), None)) if index_bytes is not None: return await self._decode_shard_index(index_bytes) return None async def _load_shard_index(self, store_path: StorePath) -> _ShardIndex: - return ( - await 
self._load_shard_index_maybe(store_path) - ) or _ShardIndex.create_empty(self.chunks_per_shard) + return (await self._load_shard_index_maybe(store_path)) or _ShardIndex.create_empty( + self.chunks_per_shard + ) - async def _load_full_shard_maybe( - self, store_path: StorePath - ) -> Optional[_ShardProxy]: + async def _load_full_shard_maybe(self, store_path: StorePath) -> Optional[_ShardProxy]: shard_bytes = await store_path.get_async() return await _ShardProxy.from_bytes(shard_bytes, self) if shard_bytes else None diff --git a/zarr/zarrita/store.py b/zarr/zarrita/store.py index 119650113d..280c746d49 100644 --- a/zarr/zarrita/store.py +++ b/zarr/zarrita/store.py @@ -70,9 +70,7 @@ def from_path(cls, pth: Path) -> Store: from upath import UPath from upath.implementations.local import PosixUPath, WindowsUPath - if isinstance(pth, UPath) and not isinstance( - pth, (PosixUPath, WindowsUPath) - ): + if isinstance(pth, UPath) and not isinstance(pth, (PosixUPath, WindowsUPath)): storage_options = pth._kwargs.copy() storage_options.pop("_url", None) return RemoteStore(str(pth), **storage_options) @@ -84,9 +82,7 @@ def from_path(cls, pth: Path) -> Store: async def multi_get_async( self, keys: List[Tuple[str, Optional[Tuple[int, int]]]] ) -> List[Optional[BytesLike]]: - return await asyncio.gather( - *[self.get_async(key, byte_range) for key, byte_range in keys] - ) + return await asyncio.gather(*[self.get_async(key, byte_range) for key, byte_range in keys]) async def get_async( self, key: str, byte_range: Optional[Tuple[int, Optional[int]]] = None @@ -97,10 +93,7 @@ async def multi_set_async( self, key_values: List[Tuple[str, BytesLike, Optional[Tuple[int, int]]]] ) -> None: await asyncio.gather( - *[ - self.set_async(key, value, byte_range) - for key, value, byte_range in key_values - ] + *[self.set_async(key, value, byte_range) for key, value, byte_range in key_values] ) async def set_async( @@ -222,17 +215,13 @@ def __init__(self, url: Union[UPath, str], **storage_options: Dict[str, Any]): ) self.root = url.rstrip("/") # test instantiate file system - fs, _ = fsspec.core.url_to_fs( - str(self.root), asynchronous=True, **self.root._kwargs - ) + fs, _ = fsspec.core.url_to_fs(str(self.root), asynchronous=True, **self.root._kwargs) assert fs.__class__.async_impl, "FileSystem needs to support async operations." def make_fs(self) -> Tuple[AsyncFileSystem, str]: storage_options = self.root._kwargs.copy() storage_options.pop("_url", None) - fs, root = fsspec.core.url_to_fs( - str(self.root), asynchronous=True, **self.root._kwargs - ) + fs, root = fsspec.core.url_to_fs(str(self.root), asynchronous=True, **self.root._kwargs) assert fs.__class__.async_impl, "FileSystem needs to support async operations." 
return fs, root diff --git a/zarr/zarrita/sync.py b/zarr/zarrita/sync.py index 6f33dd925b..32b0d7a65a 100644 --- a/zarr/zarrita/sync.py +++ b/zarr/zarrita/sync.py @@ -25,9 +25,7 @@ def _get_lock() -> threading.Lock: return _lock -async def _runner( - event: threading.Event, coro: Coroutine, result_box: List[Optional[Any]] -): +async def _runner(event: threading.Event, coro: Coroutine, result_box: List[Optional[Any]]): try: result_box[0] = await coro except Exception as ex: From 15f667cf87a29e15adc3b9302376b1d61aef2dcb Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Fri, 3 Nov 2023 12:43:08 -0700 Subject: [PATCH 03/12] zarrita -> v3 --- zarr/{zarrita => v3}/__init__.py | 16 ++++++++-------- zarr/{zarrita => v3}/array.py | 16 ++++++++-------- zarr/{zarrita => v3}/array_v2.py | 18 +++++++++--------- zarr/{zarrita => v3}/codecs.py | 12 ++++++------ zarr/{zarrita => v3}/common.py | 2 +- zarr/{zarrita => v3}/group.py | 10 +++++----- zarr/{zarrita => v3}/group_v2.py | 16 ++++++++-------- zarr/{zarrita => v3}/indexing.py | 2 +- zarr/{zarrita => v3}/metadata.py | 2 +- zarr/{zarrita => v3}/sharding.py | 10 +++++----- zarr/{zarrita => v3}/store.py | 2 +- zarr/{zarrita => v3}/sync.py | 2 +- 12 files changed, 54 insertions(+), 54 deletions(-) rename zarr/{zarrita => v3}/__init__.py (70%) rename zarr/{zarrita => v3}/array.py (97%) rename zarr/{zarrita => v3}/array_v2.py (97%) rename zarr/{zarrita => v3}/codecs.py (98%) rename zarr/{zarrita => v3}/common.py (99%) rename zarr/{zarrita => v3}/group.py (96%) rename zarr/{zarrita => v3}/group_v2.py (94%) rename zarr/{zarrita => v3}/indexing.py (99%) rename zarr/{zarrita => v3}/metadata.py (99%) rename zarr/{zarrita => v3}/sharding.py (98%) rename zarr/{zarrita => v3}/store.py (99%) rename zarr/{zarrita => v3}/sync.py (99%) diff --git a/zarr/zarrita/__init__.py b/zarr/v3/__init__.py similarity index 70% rename from zarr/zarrita/__init__.py rename to zarr/v3/__init__.py index bd65411825..bbf5aa0359 100644 --- a/zarr/zarrita/__init__.py +++ b/zarr/v3/__init__.py @@ -2,13 +2,13 @@ from typing import Union -import zarrita.codecs # noqa: F401 -from zarrita.array import Array # noqa: F401 -from zarrita.array_v2 import ArrayV2 # noqa: F401 -from zarrita.group import Group # noqa: F401 -from zarrita.group_v2 import GroupV2 # noqa: F401 -from zarrita.metadata import RuntimeConfiguration, runtime_configuration # noqa: F401 -from zarrita.store import ( # noqa: F401 +import zarr.v3.codecs # noqa: F401 +from zarr.v3.array import Array # noqa: F401 +from zarr.v3.array_v2 import ArrayV2 # noqa: F401 +from zarr.v3.group import Group # noqa: F401 +from zarr.v3.group_v2 import GroupV2 # noqa: F401 +from zarr.v3.metadata import RuntimeConfiguration, runtime_configuration # noqa: F401 +from zarr.v3.store import ( # noqa: F401 LocalStore, RemoteStore, Store, @@ -16,7 +16,7 @@ StorePath, make_store_path, ) -from zarrita.sync import sync as _sync +from zarr.v3.sync import sync as _sync async def open_auto_async( diff --git a/zarr/zarrita/array.py b/zarr/v3/array.py similarity index 97% rename from zarr/zarrita/array.py rename to zarr/v3/array.py index 2c0fef0d61..e69f306eca 100644 --- a/zarr/zarrita/array.py +++ b/zarr/v3/array.py @@ -6,17 +6,17 @@ import numpy as np from attr import evolve, frozen -from zarrita.array_v2 import ArrayV2 -from zarrita.codecs import CodecMetadata, CodecPipeline, bytes_codec -from zarrita.common import ( +from zarr.v3.array_v2 import ArrayV2 +from zarr.v3.codecs import CodecMetadata, CodecPipeline, bytes_codec +from zarr.v3.common import ( 
ZARR_JSON, ChunkCoords, Selection, SliceSelection, concurrent_map, ) -from zarrita.indexing import BasicIndexer, all_chunk_coords, is_total_slice -from zarrita.metadata import ( +from zarr.v3.indexing import BasicIndexer, all_chunk_coords, is_total_slice +from zarr.v3.metadata import ( ArrayMetadata, DataType, DefaultChunkKeyEncodingConfigurationMetadata, @@ -28,9 +28,9 @@ V2ChunkKeyEncodingMetadata, dtype_to_data_type, ) -from zarrita.sharding import ShardingCodec -from zarrita.store import StoreLike, StorePath, make_store_path -from zarrita.sync import sync +from zarr.v3.sharding import ShardingCodec +from zarr.v3.store import StoreLike, StorePath, make_store_path +from zarr.v3.sync import sync @frozen diff --git a/zarr/zarrita/array_v2.py b/zarr/v3/array_v2.py similarity index 97% rename from zarr/zarrita/array_v2.py rename to zarr/v3/array_v2.py index 119a4920da..a2f26f01b0 100644 --- a/zarr/zarrita/array_v2.py +++ b/zarr/v3/array_v2.py @@ -9,7 +9,7 @@ from attr import evolve, frozen from numcodecs.compat import ensure_bytes, ensure_ndarray -from zarrita.common import ( +from zarr.v3.common import ( ZARRAY_JSON, ZATTRS_JSON, BytesLike, @@ -19,13 +19,13 @@ concurrent_map, to_thread, ) -from zarrita.indexing import BasicIndexer, all_chunk_coords, is_total_slice -from zarrita.metadata import ArrayV2Metadata, RuntimeConfiguration -from zarrita.store import StoreLike, StorePath, make_store_path -from zarrita.sync import sync +from zarr.v3.indexing import BasicIndexer, all_chunk_coords, is_total_slice +from zarr.v3.metadata import ArrayV2Metadata, RuntimeConfiguration +from zarr.v3.store import StoreLike, StorePath, make_store_path +from zarr.v3.sync import sync if TYPE_CHECKING: - from zarrita.array import Array + from zarr.v3.array import Array @frozen @@ -437,9 +437,9 @@ def resize(self, new_shape: ChunkCoords) -> ArrayV2: async def convert_to_v3_async(self) -> Array: from sys import byteorder as sys_byteorder - from zarrita.array import Array - from zarrita.common import ZARR_JSON - from zarrita.metadata import ( + from zarr.v3.array import Array + from zarr.v3.common import ZARR_JSON + from zarr.v3.metadata import ( ArrayMetadata, BloscCodecConfigurationMetadata, BloscCodecMetadata, diff --git a/zarr/zarrita/codecs.py b/zarr/v3/codecs.py similarity index 98% rename from zarr/zarrita/codecs.py rename to zarr/v3/codecs.py index 5dbb26e137..ff15f2ebf9 100644 --- a/zarr/zarrita/codecs.py +++ b/zarr/v3/codecs.py @@ -13,8 +13,8 @@ from numcodecs.gzip import GZip from zstandard import ZstdCompressor, ZstdDecompressor -from zarrita.common import BytesLike, to_thread -from zarrita.metadata import ( +from zarr.v3.common import BytesLike, to_thread +from zarr.v3.metadata import ( BloscCodecConfigurationMetadata, BloscCodecMetadata, BytesCodecConfigurationMetadata, @@ -32,7 +32,7 @@ ) if TYPE_CHECKING: - from zarrita.metadata import CoreArrayMetadata + from zarr.v3.metadata import CoreArrayMetadata # See https://zarr.readthedocs.io/en/stable/tutorial.html#configuring-blosc numcodecs.blosc.use_threads = False @@ -129,7 +129,7 @@ def from_metadata( elif codec_metadata.name == "crc32c": codec = Crc32cCodec.from_metadata(codec_metadata, array_metadata) elif codec_metadata.name == "sharding_indexed": - from zarrita.sharding import ShardingCodec + from zarr.v3.sharding import ShardingCodec codec = ShardingCodec.from_metadata(codec_metadata, array_metadata) else: @@ -142,7 +142,7 @@ def from_metadata( @staticmethod def _validate_codecs(codecs: List[Codec], array_metadata: CoreArrayMetadata) -> None: - 
from zarrita.sharding import ShardingCodec + from zarr.v3.sharding import ShardingCodec assert any( isinstance(codec, ArrayBytesCodec) for codec in codecs @@ -387,7 +387,7 @@ def from_metadata( ) def resolve_metadata(self) -> CoreArrayMetadata: - from zarrita.metadata import CoreArrayMetadata + from zarr.v3.metadata import CoreArrayMetadata return CoreArrayMetadata( shape=tuple( diff --git a/zarr/zarrita/common.py b/zarr/v3/common.py similarity index 99% rename from zarr/zarrita/common.py rename to zarr/v3/common.py index e906c50b9e..0e55a7c1fd 100644 --- a/zarr/zarrita/common.py +++ b/zarr/v3/common.py @@ -31,7 +31,7 @@ def make_cattr(): - from zarrita.metadata import ( + from zarr.v3.metadata import ( BloscCodecMetadata, BytesCodecMetadata, ChunkKeyEncodingMetadata, diff --git a/zarr/zarrita/group.py b/zarr/v3/group.py similarity index 96% rename from zarr/zarrita/group.py rename to zarr/v3/group.py index ed5faca911..aa43c706a5 100644 --- a/zarr/zarrita/group.py +++ b/zarr/v3/group.py @@ -5,11 +5,11 @@ from attr import asdict, evolve, field, frozen -from zarrita.array import Array -from zarrita.common import ZARR_JSON, make_cattr -from zarrita.metadata import RuntimeConfiguration -from zarrita.store import StoreLike, StorePath, make_store_path -from zarrita.sync import sync +from zarr.v3.array import Array +from zarr.v3.common import ZARR_JSON, make_cattr +from zarr.v3.metadata import RuntimeConfiguration +from zarr.v3.store import StoreLike, StorePath, make_store_path +from zarr.v3.sync import sync @frozen diff --git a/zarr/zarrita/group_v2.py b/zarr/v3/group_v2.py similarity index 94% rename from zarr/zarrita/group_v2.py rename to zarr/v3/group_v2.py index e8380c9c9b..3b1a369ae2 100644 --- a/zarr/zarrita/group_v2.py +++ b/zarr/v3/group_v2.py @@ -6,14 +6,14 @@ from attr import asdict, evolve, frozen -from zarrita.array_v2 import ArrayV2 -from zarrita.common import ZARRAY_JSON, ZATTRS_JSON, ZGROUP_JSON, make_cattr -from zarrita.metadata import RuntimeConfiguration -from zarrita.store import StoreLike, StorePath, make_store_path -from zarrita.sync import sync +from zarr.v3.array_v2 import ArrayV2 +from zarr.v3.common import ZARRAY_JSON, ZATTRS_JSON, ZGROUP_JSON, make_cattr +from zarr.v3.metadata import RuntimeConfiguration +from zarr.v3.store import StoreLike, StorePath, make_store_path +from zarr.v3.sync import sync if TYPE_CHECKING: - from zarrita.group import Group + from zarr.v3.group import Group @frozen @@ -187,8 +187,8 @@ def create_array(self, path: str, **kwargs) -> ArrayV2: ) async def convert_to_v3_async(self) -> Group: - from zarrita.common import ZARR_JSON - from zarrita.group import Group, GroupMetadata + from zarr.v3.common import ZARR_JSON + from zarr.v3.group import Group, GroupMetadata new_metadata = GroupMetadata(attributes=self.attributes or {}) new_metadata_bytes = new_metadata.to_bytes() diff --git a/zarr/zarrita/indexing.py b/zarr/v3/indexing.py similarity index 99% rename from zarr/zarrita/indexing.py rename to zarr/v3/indexing.py index df147a843c..15adad111d 100644 --- a/zarr/zarrita/indexing.py +++ b/zarr/v3/indexing.py @@ -4,7 +4,7 @@ import math from typing import Iterator, List, NamedTuple, Optional, Tuple -from zarrita.common import ChunkCoords, Selection, SliceSelection, product +from zarr.v3.common import ChunkCoords, Selection, SliceSelection, product def _ensure_tuple(v: Selection) -> SliceSelection: diff --git a/zarr/zarrita/metadata.py b/zarr/v3/metadata.py similarity index 99% rename from zarr/zarrita/metadata.py rename to zarr/v3/metadata.py index 
80f305c0a4..1fc43b19f0 100644 --- a/zarr/zarrita/metadata.py +++ b/zarr/v3/metadata.py @@ -8,7 +8,7 @@ import numpy as np from attr import asdict, field, frozen -from zarrita.common import ChunkCoords, make_cattr +from zarr.v3.common import ChunkCoords, make_cattr @frozen diff --git a/zarr/zarrita/sharding.py b/zarr/v3/sharding.py similarity index 98% rename from zarr/zarrita/sharding.py rename to zarr/v3/sharding.py index 363d62f1af..3c5b4bd12d 100644 --- a/zarr/zarrita/sharding.py +++ b/zarr/v3/sharding.py @@ -5,27 +5,27 @@ import numpy as np from attrs import frozen -from zarrita.codecs import ArrayBytesCodec, CodecPipeline -from zarrita.common import ( +from zarr.v3.codecs import ArrayBytesCodec, CodecPipeline +from zarr.v3.common import ( BytesLike, ChunkCoords, SliceSelection, concurrent_map, product, ) -from zarrita.indexing import ( +from zarr.v3.indexing import ( BasicIndexer, c_order_iter, is_total_slice, morton_order_iter, ) -from zarrita.metadata import ( +from zarr.v3.metadata import ( CoreArrayMetadata, DataType, ShardingCodecConfigurationMetadata, ShardingCodecMetadata, ) -from zarrita.store import StorePath +from zarr.v3.store import StorePath MAX_UINT_64 = 2**64 - 1 diff --git a/zarr/zarrita/store.py b/zarr/v3/store.py similarity index 99% rename from zarr/zarrita/store.py rename to zarr/v3/store.py index 280c746d49..67e54340d0 100644 --- a/zarr/zarrita/store.py +++ b/zarr/v3/store.py @@ -8,7 +8,7 @@ import fsspec from fsspec.asyn import AsyncFileSystem -from zarrita.common import BytesLike, to_thread +from zarr.v3.common import BytesLike, to_thread if TYPE_CHECKING: from upath import UPath diff --git a/zarr/zarrita/sync.py b/zarr/v3/sync.py similarity index 99% rename from zarr/zarrita/sync.py rename to zarr/v3/sync.py index 32b0d7a65a..ef3a6e08c0 100644 --- a/zarr/zarrita/sync.py +++ b/zarr/v3/sync.py @@ -80,7 +80,7 @@ def _get_loop(): # previous two calls from another thread if loop[0] is None: loop[0] = asyncio.new_event_loop() - th = threading.Thread(target=loop[0].run_forever, name="zarritaIO") + th = threading.Thread(target=loop[0].run_forever, name="zarrIO") th.daemon = True th.start() iothread[0] = th From c3f87644c41d894819b12f78be541d364baac661 Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Fri, 3 Nov 2023 14:32:06 -0700 Subject: [PATCH 04/12] v3/abc [wip] --- zarr/v3/abc/__init__.py | 0 zarr/v3/abc/array.py | 140 ++++++++++++++++++++++++++++++++++++++++ zarr/v3/abc/codec.py | 74 +++++++++++++++++++++ zarr/v3/abc/group.py | 86 ++++++++++++++++++++++++ zarr/v3/abc/store.py | 115 +++++++++++++++++++++++++++++++++ 5 files changed, 415 insertions(+) create mode 100644 zarr/v3/abc/__init__.py create mode 100644 zarr/v3/abc/array.py create mode 100644 zarr/v3/abc/codec.py create mode 100644 zarr/v3/abc/group.py create mode 100644 zarr/v3/abc/store.py diff --git a/zarr/v3/abc/__init__.py b/zarr/v3/abc/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/zarr/v3/abc/array.py b/zarr/v3/abc/array.py new file mode 100644 index 0000000000..976aa48618 --- /dev/null +++ b/zarr/v3/abc/array.py @@ -0,0 +1,140 @@ +from __future__ import annotations +from abc import abstractproperty, abstractmethod, ABC +from typing import Tuple, Any, Dict + +import numpy as np + +from zarr.v3.abc.store import ReadStore, WriteStore +from zarr.v3.common import Selection + + +class BaseArray(ABC): + @abstractproperty + def store_path(self) -> str: # TODO: rename to `path`? + """Path to this array in the underlying store.""" + ... 
+ + @abstractproperty + def dtype(self) -> np.dtype: + """Data type of the array elements. + + Returns + ------- + dtype + array data type + """ + ... + + @abstractproperty + def ndim(self) -> int: + """Number of array dimensions (axes). + + Returns + ------- + int + number of array dimensions (axes) + """ + ... + + @abstractproperty + def shape(self) -> Tuple[int, ...]: + """Array dimensions. + + Returns + ------- + tuple of int + array dimensions + """ + ... + + @abstractproperty + def size(self) -> int: + """Number of elements in the array. + + Returns + ------- + int + number of elements in an array. + """ + + @abstractproperty + def attrs(self) -> Dict[str, Any]: + """Array attributes. + + Returns + ------- + dict + user defined attributes + """ + ... + + @abstractproperty + def info(self) -> Any: + """Report some diagnostic information about the array. + + Returns + ------- + out + """ + ... + + +class AsynchronousArray(BaseArray): + """This class can be implemented as a v2 or v3 array""" + + @classmethod + @abstractmethod + async def from_json(cls, zarr_json: Any, store: ReadStore) -> AsynchronousArray: + ... + + @classmethod + @abstractmethod + async def open(cls, store: ReadStore) -> AsynchronousArray: + ... + + @classmethod + @abstractmethod + async def create(cls, store: WriteStore, *, shape, **kwargs) -> AsynchronousArray: + ... + + @abstractmethod + async def getitem(self, selection: Selection): + ... + + @abstractmethod + async def setitem(self, selection: Selection, value: np.ndarray) -> None: + ... + + +class SynchronousArray(BaseArray): + """ + This class can be implemented as a v2 or v3 array + """ + + @classmethod + @abstractmethod + def from_json(cls, zarr_json: Any, store: ReadStore) -> SynchronousArray: + ... + + @classmethod + @abstractmethod + def open(cls, store: ReadStore) -> SynchronousArray: + ... + + @classmethod + @abstractmethod + def create(cls, store: WriteStore, *, shape, **kwargs) -> SynchronousArray: + ... + + @abstractmethod + def __getitem__(self, selection: Selection): # TODO: type as np.ndarray | scalar + ... + + @abstractmethod + def __setitem__(self, selection: Selection, value: np.ndarray) -> None: + ... 
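The synchronous interface is meant to be implemented as a thin wrapper that drives an asynchronous counterpart on an event loop; the `Array`/`AsyncArray` split in a later commit follows exactly this shape. A minimal sketch of the wrapping pattern, with illustrative names and `asyncio.run` standing in for zarr's dedicated-loop `sync` helper:

    import asyncio

    class AsyncThing:
        async def getitem(self, key):
            # stand-in for real asynchronous store I/O
            return key * 2

    class SyncThing:
        def __init__(self, async_thing):
            self._async = async_thing

        def __getitem__(self, key):
            # block until the coroutine completes
            return asyncio.run(self._async.getitem(key))

    assert SyncThing(AsyncThing())[21] == 42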
+ + # some day ;) + # @property + # def __array_api_version__(self) -> str: + # return "2022.12" diff --git a/zarr/v3/abc/codec.py b/zarr/v3/abc/codec.py new file mode 100644 index 0000000000..383d26fd0d --- /dev/null +++ b/zarr/v3/abc/codec.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from abc import abstractmethod, ABC +from typing import TYPE_CHECKING, Optional + +import numpy as np + +from zarr.v3.common import BytesLike + + +if TYPE_CHECKING: + from zarr.v3.metadata import CoreArrayMetadata + + +class Codec(ABC): + supports_partial_decode: bool + supports_partial_encode: bool + is_fixed_size: bool + array_metadata: CoreArrayMetadata + + @abstractmethod + def compute_encoded_size(self, input_byte_length: int) -> int: + pass + + def resolve_metadata(self) -> CoreArrayMetadata: + return self.array_metadata + + +class ArrayArrayCodec(Codec): + @abstractmethod + async def decode( + self, + chunk_array: np.ndarray, + ) -> np.ndarray: + pass + + @abstractmethod + async def encode( + self, + chunk_array: np.ndarray, + ) -> Optional[np.ndarray]: + pass + + +class ArrayBytesCodec(Codec): + @abstractmethod + async def decode( + self, + chunk_array: BytesLike, + ) -> np.ndarray: + pass + + @abstractmethod + async def encode( + self, + chunk_array: np.ndarray, + ) -> Optional[BytesLike]: + pass + + +class BytesBytesCodec(Codec): + @abstractmethod + async def decode( + self, + chunk_array: BytesLike, + ) -> BytesLike: + pass + + @abstractmethod + async def encode( + self, + chunk_array: BytesLike, + ) -> Optional[BytesLike]: + pass diff --git a/zarr/v3/abc/group.py b/zarr/v3/abc/group.py new file mode 100644 index 0000000000..02de819894 --- /dev/null +++ b/zarr/v3/abc/group.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +from abc import abstractproperty, ABC +from collections.abc import MutableMapping +from typing import Dict, Any + + +class BaseGroup(ABC): + @abstractproperty + def attrs(self) -> Dict[str, Any]: + """User-defined attributes.""" + ... + + @abstractproperty + def info(self) -> Any: # TODO: type this later + """Return diagnostic information about the group.""" + ... + + +class AsynchronousGroup(BaseGroup): + pass + # TODO: (considering the following api) + # store_path (rename to path?) + # nchildren - number of child groups + arrays + # children (async iterator) + # contains - check if child exists + # getitem - get child + # group_keys (async iterator) + # groups (async iterator) + # array_keys (async iterator) + # arrays (async iterator) + # visit + # visitkeys + # visitvalues + # tree + # create_group + # require_group + # create_groups + # require_groups + # create_dataset + # require_dataset + # create + # empty + # zeros + # ones + # full + # array + # empty_like + # zeros_like + # ones_like + # full_like + # move + + +class SynchronousGroup(BaseGroup, MutableMapping): + # TODO - think about if we want to keep the MutableMapping abstraction or + pass + # store_path (rename to path?) 
+ # __enter__ + # __exit__ + # group_keys + # groups + # array_keys + # arrays + # visit + # visitkeys + # visitvalues + # visititems + # tree + # create_group + # require_group + # create_groups + # require_groups + # create_dataset + # require_dataset + # create + # empty + # zeros + # ones + # full + # array + # empty_like + # zeros_like + # ones_like + # full_like + # move diff --git a/zarr/v3/abc/store.py b/zarr/v3/abc/store.py new file mode 100644 index 0000000000..fc275801fa --- /dev/null +++ b/zarr/v3/abc/store.py @@ -0,0 +1,115 @@ +from abc import abstractmethod, ABC + +from typing import List, Tuple + + +class Store(ABC): + pass + + +class ReadStore(Store): + @abstractmethod + def get(self, key: str) -> bytes: + """Retrieve the value associated with a given key. + + Parameters + ---------- + key : str + + Returns + ------- + bytes + """ + ... + + @abstractmethod + def get_partial_values(self, key_ranges: List[Tuple[str, int]]) -> bytes: + """Retrieve possibly partial values from given key_ranges. + + Parameters + ---------- + key_ranges : list[tuple[str, int]] + Ordered set of key, range pairs, a key may occur multiple times with different ranges + + Returns + ------- + list[bytes] + list of values, in the order of the key_ranges, may contain null/none for missing keys + """ + ... + + +class WriteStore(ReadStore): + @abstractmethod + def set(self, key: str, value: bytes) -> None: + """Store a (key, value) pair. + + Parameters + ---------- + key : str + value : bytes + """ + ... + + @abstractmethod + def set_partial_values(self, key_start_values: List[Tuple[str, int, bytes]]) -> None: + """Store values at a given key, starting at byte range_start. + + Parameters + ---------- + key_start_values : list[tuple[str, int, bytes]] + set of key, range_start, values triples, a key may occur multiple times with different + range_starts, range_starts (considering the length of the respective values) must not + specify overlapping ranges for the same key + """ + ... + + +class ListMixin: + @abstractmethod + def list(self) -> List[str]: + """Retrieve all keys in the store. + + Returns + ------- + list[str] + """ + ... + + @abstractmethod + def list_prefix(self, prefix: str) -> List[str]: + """Retrieve all keys in the store. + + Parameters + ---------- + prefix : str + + Returns + ------- + list[str] + """ + ... + + @abstractmethod + def list_dir(self, prefix: str) -> List[str]: + """ + Retrieve all keys and prefixes with a given prefix and which do not contain the character + “/” after the given prefix. + + Parameters + ---------- + prefix : str + + Returns + ------- + list[str] + """ + ... + + +class ReadListStore(ReadStore, ListMixin): + pass + + +class WriteListStore(WriteStore, ListMixin): + pass From 08641872d5feb768aaba6beaff73eb5aab087241 Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Fri, 3 Nov 2023 16:39:33 -0700 Subject: [PATCH 05/12] use abcs plus implementation notes --- zarr/v3/abc/codec.py | 10 ++ zarr/v3/abc/store.py | 14 +- zarr/v3/array.py | 336 +++++++++++++++++++++++++------------------ zarr/v3/codecs.py | 64 +-------- zarr/v3/store.py | 14 +- 5 files changed, 227 insertions(+), 211 deletions(-) diff --git a/zarr/v3/abc/codec.py b/zarr/v3/abc/codec.py index 383d26fd0d..f84fc74af9 100644 --- a/zarr/v3/abc/codec.py +++ b/zarr/v3/abc/codec.py @@ -1,3 +1,13 @@ +# Notes: +# 1. These are missing methods described in the spec. 
I expected to see these method definitions: +# def compute_encoded_representation_type(self, decoded_representation_type): +# def encode(self, decoded_value): +# def decode(self, encoded_value, decoded_representation_type): +# def partial_decode(self, input_handle, decoded_representation_type, decoded_regions): +# def compute_encoded_size(self, input_size): +# 2. Understand why array metadata is included on all codecs + + from __future__ import annotations from abc import abstractmethod, ABC diff --git a/zarr/v3/abc/store.py b/zarr/v3/abc/store.py index fc275801fa..5469cafe6d 100644 --- a/zarr/v3/abc/store.py +++ b/zarr/v3/abc/store.py @@ -9,7 +9,7 @@ class Store(ABC): class ReadStore(Store): @abstractmethod - def get(self, key: str) -> bytes: + async def get(self, key: str) -> bytes: """Retrieve the value associated with a given key. Parameters @@ -23,7 +23,7 @@ def get(self, key: str) -> bytes: ... @abstractmethod - def get_partial_values(self, key_ranges: List[Tuple[str, int]]) -> bytes: + async def get_partial_values(self, key_ranges: List[Tuple[str, int]]) -> bytes: """Retrieve possibly partial values from given key_ranges. Parameters @@ -41,7 +41,7 @@ def get_partial_values(self, key_ranges: List[Tuple[str, int]]) -> bytes: class WriteStore(ReadStore): @abstractmethod - def set(self, key: str, value: bytes) -> None: + async def set(self, key: str, value: bytes) -> None: """Store a (key, value) pair. Parameters @@ -52,7 +52,7 @@ def set(self, key: str, value: bytes) -> None: ... @abstractmethod - def set_partial_values(self, key_start_values: List[Tuple[str, int, bytes]]) -> None: + async def set_partial_values(self, key_start_values: List[Tuple[str, int, bytes]]) -> None: """Store values at a given key, starting at byte range_start. Parameters @@ -67,7 +67,7 @@ def set_partial_values(self, key_start_values: List[Tuple[str, int, bytes]]) -> class ListMixin: @abstractmethod - def list(self) -> List[str]: + async def list(self) -> List[str]: """Retrieve all keys in the store. Returns @@ -77,7 +77,7 @@ def list(self) -> List[str]: ... @abstractmethod - def list_prefix(self, prefix: str) -> List[str]: + async def list_prefix(self, prefix: str) -> List[str]: """Retrieve all keys in the store. Parameters @@ -91,7 +91,7 @@ def list_prefix(self, prefix: str) -> List[str]: ... @abstractmethod - def list_dir(self, prefix: str) -> List[str]: + async def list_dir(self, prefix: str) -> List[str]: """ Retrieve all keys and prefixes with a given prefix and which do not contain the character “/” after the given prefix. diff --git a/zarr/v3/array.py b/zarr/v3/array.py index e69f306eca..3c0d7eba5c 100644 --- a/zarr/v3/array.py +++ b/zarr/v3/array.py @@ -1,3 +1,14 @@ +# Notes on what I've changed here: +# 1. Split Array into AsyncArray and Array +# 2. Inherit from abc (SynchronousArray, AsynchronousArray) +# 3. Added .size and .attrs methods +# 4. Temporarily disabled the creation of ArrayV2 +# 5. Added from_json to AsyncArray + +# Questions to consider: +# 1. Was splitting the array into two classes really necessary? +# 2. Do we really need runtime_configuration? 
Specifically, the asyncio_loop seems problematic + from __future__ import annotations import json @@ -6,7 +17,9 @@ import numpy as np from attr import evolve, frozen -from zarr.v3.array_v2 import ArrayV2 +from zarr.v3.abc.array import SynchronousArray, AsynchronousArray + +# from zarr.v3.array_v2 import ArrayV2 from zarr.v3.codecs import CodecMetadata, CodecPipeline, bytes_codec from zarr.v3.common import ( ZARR_JSON, @@ -34,40 +47,14 @@ @frozen -class _AsyncArrayProxy: - array: Array - - def __getitem__(self, selection: Selection) -> _AsyncArraySelectionProxy: - return _AsyncArraySelectionProxy(self.array, selection) - - -@frozen -class _AsyncArraySelectionProxy: - array: Array - selection: Selection - - async def get(self) -> np.ndarray: - return await self.array._get_async(self.selection) - - async def set(self, value: np.ndarray): - return await self.array._set_async(self.selection, value) - - -def _json_convert(o): - if isinstance(o, DataType): - return o.name - raise TypeError - - -@frozen -class Array: +class AsyncArray(AsynchronousArray): metadata: ArrayMetadata store_path: StorePath runtime_configuration: RuntimeConfiguration codec_pipeline: CodecPipeline @classmethod - async def create_async( + async def create( cls, store: StoreLike, *, @@ -84,7 +71,7 @@ async def create_async( attributes: Optional[Dict[str, Any]] = None, runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), exists_ok: bool = False, - ) -> Array: + ) -> AsyncArray: store_path = make_store_path(store) if not exists_ok: assert not await (store_path / ZARR_JSON).exists_async() @@ -140,92 +127,45 @@ async def create_async( return array @classmethod - def create( + def from_json( cls, - store: StoreLike, - *, - shape: ChunkCoords, - dtype: Union[str, np.dtype], - chunk_shape: ChunkCoords, - fill_value: Optional[Any] = None, - chunk_key_encoding: Union[ - Tuple[Literal["default"], Literal[".", "/"]], - Tuple[Literal["v2"], Literal[".", "/"]], - ] = ("default", "/"), - codecs: Optional[Iterable[CodecMetadata]] = None, - dimension_names: Optional[Iterable[str]] = None, - attributes: Optional[Dict[str, Any]] = None, - runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), - exists_ok: bool = False, - ) -> Array: - return sync( - cls.create_async( - store=store, - shape=shape, - dtype=dtype, - chunk_shape=chunk_shape, - fill_value=fill_value, - chunk_key_encoding=chunk_key_encoding, - codecs=codecs, - dimension_names=dimension_names, - attributes=attributes, - runtime_configuration=runtime_configuration, - exists_ok=exists_ok, + store_path: StorePath, + zarr_json: Any, + runtime_configuration: RuntimeConfiguration, + ) -> AsyncArray: + metadata = ArrayMetadata.from_json(zarr_json) + async_array = cls( + metadata=metadata, + store_path=store_path, + runtime_configuration=runtime_configuration, + codec_pipeline=CodecPipeline.from_metadata( + metadata.codecs, metadata.get_core_metadata(runtime_configuration) ), - runtime_configuration.asyncio_loop, ) + async_array._validate_metadata() + return async_array @classmethod - async def open_async( + async def open( cls, store: StoreLike, runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), - ) -> Array: + ) -> AsyncArray: store_path = make_store_path(store) zarr_json_bytes = await (store_path / ZARR_JSON).get_async() assert zarr_json_bytes is not None return cls.from_json( store_path, json.loads(zarr_json_bytes), - runtime_configuration=runtime_configuration or RuntimeConfiguration(), - ) - - @classmethod - def open( - cls, - store: 
StoreLike, - runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), - ) -> Array: - return sync( - cls.open_async(store, runtime_configuration=runtime_configuration), - runtime_configuration.asyncio_loop, - ) - - @classmethod - def from_json( - cls, - store_path: StorePath, - zarr_json: Any, - runtime_configuration: RuntimeConfiguration, - ) -> Array: - metadata = ArrayMetadata.from_json(zarr_json) - out = cls( - metadata=metadata, - store_path=store_path, runtime_configuration=runtime_configuration, - codec_pipeline=CodecPipeline.from_metadata( - metadata.codecs, metadata.get_core_metadata(runtime_configuration) - ), ) - out._validate_metadata() - return out @classmethod - async def open_auto_async( + async def open_auto( cls, store: StoreLike, runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), - ) -> Union[Array, ArrayV2]: + ) -> AsyncArray: # TODO: Union[AsyncArray, ArrayV2] store_path = make_store_path(store) v3_metadata_bytes = await (store_path / ZARR_JSON).get_async() if v3_metadata_bytes is not None: @@ -234,32 +174,9 @@ async def open_auto_async( json.loads(v3_metadata_bytes), runtime_configuration=runtime_configuration or RuntimeConfiguration(), ) - return await ArrayV2.open_async(store_path) - - @classmethod - def open_auto( - cls, - store: StoreLike, - runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), - ) -> Union[Array, ArrayV2]: - return sync( - cls.open_auto_async(store, runtime_configuration), - runtime_configuration.asyncio_loop, - ) - - async def _save_metadata(self) -> None: - self._validate_metadata() - - await (self.store_path / ZARR_JSON).set_async(self.metadata.to_bytes()) - - def _validate_metadata(self) -> None: - assert len(self.metadata.shape) == len( - self.metadata.chunk_grid.configuration.chunk_shape - ), "`chunk_shape` and `shape` need to have the same number of dimensions." - assert self.metadata.dimension_names is None or len(self.metadata.shape) == len( - self.metadata.dimension_names - ), "`dimension_names` and `shape` need to have the same number of dimensions." - assert self.metadata.fill_value is not None, "`fill_value` is required." + else: + raise ValueError("no v2 support yet") + # return await ArrayV2.open_async(store_path) @property def ndim(self) -> int: @@ -269,18 +186,19 @@ def ndim(self) -> int: def shape(self) -> ChunkCoords: return self.metadata.shape + @property + def size(self) -> int: + return np.prod(self.metadata.shape) + @property def dtype(self) -> np.dtype: return self.metadata.dtype @property - def async_(self) -> _AsyncArrayProxy: - return _AsyncArrayProxy(self) - - def __getitem__(self, selection: Selection): - return sync(self._get_async(selection), self.runtime_configuration.asyncio_loop) + def attrs(self) -> dict: + return self.metadata.attributes - async def _get_async(self, selection: Selection): + async def getitem(self, selection: Selection): indexer = BasicIndexer( selection, shape=self.metadata.shape, @@ -309,6 +227,20 @@ async def _get_async(self, selection: Selection): else: return out[()] + async def _save_metadata(self) -> None: + self._validate_metadata() + + await (self.store_path / ZARR_JSON).set_async(self.metadata.to_bytes()) + + def _validate_metadata(self) -> None: + assert len(self.metadata.shape) == len( + self.metadata.chunk_grid.configuration.chunk_shape + ), "`chunk_shape` and `shape` need to have the same number of dimensions." 
+ assert self.metadata.dimension_names is None or len(self.metadata.shape) == len( + self.metadata.dimension_names + ), "`dimension_names` and `shape` need to have the same number of dimensions." + assert self.metadata.fill_value is not None, "`fill_value` is required." + async def _read_chunk( self, chunk_coords: ChunkCoords, @@ -339,10 +271,7 @@ async def _read_chunk( else: out[out_selection] = self.metadata.fill_value - def __setitem__(self, selection: Selection, value: np.ndarray) -> None: - sync(self._set_async(selection, value), self.runtime_configuration.asyncio_loop) - - async def _set_async(self, selection: Selection, value: np.ndarray) -> None: + async def setitem(self, selection: Selection, value: np.ndarray) -> None: chunk_shape = self.metadata.chunk_grid.configuration.chunk_shape indexer = BasicIndexer( selection, @@ -444,7 +373,7 @@ async def _write_chunk_to_store(self, store_path: StorePath, chunk_array: np.nda else: await store_path.set_async(chunk_bytes) - async def resize_async(self, new_shape: ChunkCoords) -> Array: + async def resize(self, new_shape: ChunkCoords) -> Array: assert len(new_shape) == len(self.metadata.shape) new_metadata = evolve(self.metadata, shape=new_shape) @@ -470,21 +399,152 @@ async def _delete_key(key: str) -> None: await (self.store_path / ZARR_JSON).set_async(new_metadata.to_bytes()) return evolve(self, metadata=new_metadata) - def resize(self, new_shape: ChunkCoords) -> Array: - return sync(self.resize_async(new_shape), self.runtime_configuration.asyncio_loop) - - async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> Array: + async def update_attributes(self, new_attributes: Dict[str, Any]) -> Array: new_metadata = evolve(self.metadata, attributes=new_attributes) # Write new metadata await (self.store_path / ZARR_JSON).set_async(new_metadata.to_bytes()) return evolve(self, metadata=new_metadata) + def __repr__(self): + return f"" + + async def info(self): + return NotImplemented + + +@frozen +class Array(SynchronousArray): + _async_array: AsyncArray + + @classmethod + def create( + cls, + store: StoreLike, + *, + shape: ChunkCoords, + dtype: Union[str, np.dtype], + chunk_shape: ChunkCoords, + fill_value: Optional[Any] = None, + chunk_key_encoding: Union[ + Tuple[Literal["default"], Literal[".", "/"]], + Tuple[Literal["v2"], Literal[".", "/"]], + ] = ("default", "/"), + codecs: Optional[Iterable[CodecMetadata]] = None, + dimension_names: Optional[Iterable[str]] = None, + attributes: Optional[Dict[str, Any]] = None, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + exists_ok: bool = False, + ) -> Array: + async_array = sync( + AsyncArray.create( + store=store, + shape=shape, + dtype=dtype, + chunk_shape=chunk_shape, + fill_value=fill_value, + chunk_key_encoding=chunk_key_encoding, + codecs=codecs, + dimension_names=dimension_names, + attributes=attributes, + runtime_configuration=runtime_configuration, + exists_ok=exists_ok, + ), + runtime_configuration.asyncio_loop, + ) + return cls(async_array) + + @classmethod + def from_json( + cls, + store_path: StorePath, + zarr_json: Any, + runtime_configuration: RuntimeConfiguration, + ) -> Array: + async_array = AsyncArray.from_json( + store_path=store_path, zarr_json=zarr_json, runtime_configuration=runtime_configuration + ) + return cls(async_array) + + @classmethod + def open( + cls, + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Array: + + async_array = sync( + AsyncArray.open(store, 
runtime_configuration=runtime_configuration), + runtime_configuration.asyncio_loop, + ) + async_array._validate_metadata() + return cls(async_array) + + @classmethod + def open_auto( + cls, + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Array: # TODO: Union[Array, ArrayV2]: + async_array = sync( + AsyncArray.open_auto(store, runtime_configuration), + runtime_configuration.asyncio_loop, + ) + return cls(async_array) + + @property + def ndim(self) -> int: + return self._async_array.ndim + + @property + def shape(self) -> ChunkCoords: + return self._async_array.shape + + @property + def size(self) -> int: + return self._async_array.size + + @property + def dtype(self) -> np.dtype: + return self._async_array.dtype + + @property + def attrs(self) -> dict: + return self._async_array.attrs + + @property + def store_path(self) -> str: + return self._async_array.store_path + + def __getitem__(self, selection: Selection): + return sync( + self._async_array.getitem(selection), + self._async_array.runtime_configuration.asyncio_loop, + ) + + def __setitem__(self, selection: Selection, value: np.ndarray) -> None: + sync( + self._async_array.setitem(selection, value), + self._async_array.runtime_configuration.asyncio_loop, + ) + + def resize(self, new_shape: ChunkCoords) -> Array: + return sync( + self._async_array.resize(new_shape), + self._async_array.runtime_configuration.asyncio_loop, + ) + def update_attributes(self, new_attributes: Dict[str, Any]) -> Array: return sync( - self.update_attributes_async(new_attributes), - self.runtime_configuration.asyncio_loop, + self._async_array.update_attributes(new_attributes), + self._async_array.runtime_configuration.asyncio_loop, ) def __repr__(self): return f"" + + def info(self): + return sync( + self._async_array.info(), + self._async_array.runtime_configuration.asyncio_loop, + ) diff --git a/zarr/v3/codecs.py b/zarr/v3/codecs.py index ff15f2ebf9..ff913c42b2 100644 --- a/zarr/v3/codecs.py +++ b/zarr/v3/codecs.py @@ -1,6 +1,5 @@ from __future__ import annotations -from abc import ABC, abstractmethod from functools import reduce from typing import TYPE_CHECKING, Iterable, List, Literal, Optional, Tuple, Union from warnings import warn @@ -13,6 +12,7 @@ from numcodecs.gzip import GZip from zstandard import ZstdCompressor, ZstdDecompressor +from zarr.v3.abc.codec import Codec, ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.v3.common import BytesLike, to_thread from zarr.v3.metadata import ( BloscCodecConfigurationMetadata, @@ -38,68 +38,6 @@ numcodecs.blosc.use_threads = False -class Codec(ABC): - supports_partial_decode: bool - supports_partial_encode: bool - is_fixed_size: bool - array_metadata: CoreArrayMetadata - - @abstractmethod - def compute_encoded_size(self, input_byte_length: int) -> int: - pass - - def resolve_metadata(self) -> CoreArrayMetadata: - return self.array_metadata - - -class ArrayArrayCodec(Codec): - @abstractmethod - async def decode( - self, - chunk_array: np.ndarray, - ) -> np.ndarray: - pass - - @abstractmethod - async def encode( - self, - chunk_array: np.ndarray, - ) -> Optional[np.ndarray]: - pass - - -class ArrayBytesCodec(Codec): - @abstractmethod - async def decode( - self, - chunk_array: BytesLike, - ) -> np.ndarray: - pass - - @abstractmethod - async def encode( - self, - chunk_array: np.ndarray, - ) -> Optional[BytesLike]: - pass - - -class BytesBytesCodec(Codec): - @abstractmethod - async def decode( - self, - chunk_array: BytesLike, - ) -> BytesLike: - pass - - 
@abstractmethod - async def encode( - self, - chunk_array: BytesLike, - ) -> Optional[BytesLike]: - pass - - @frozen class CodecPipeline: codecs: List[Codec] diff --git a/zarr/v3/store.py b/zarr/v3/store.py index 67e54340d0..f7472c68d2 100644 --- a/zarr/v3/store.py +++ b/zarr/v3/store.py @@ -1,3 +1,10 @@ +# TODO: +# 1. Stores should inherit from zarr.v3.abc.store classes +# 2. remove "_async" suffix from all methods? + +# Changes I've made here: +# 1. Make delay import of fsspec + from __future__ import annotations import asyncio @@ -5,13 +12,11 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union -import fsspec -from fsspec.asyn import AsyncFileSystem - from zarr.v3.common import BytesLike, to_thread if TYPE_CHECKING: from upath import UPath + from fsspec.asyn import AsyncFileSystem def _dereference_path(root: str, path: str) -> str: @@ -205,6 +210,7 @@ class RemoteStore(Store): def __init__(self, url: Union[UPath, str], **storage_options: Dict[str, Any]): from upath import UPath + import fsspec if isinstance(url, str): self.root = UPath(url, **storage_options) @@ -219,6 +225,8 @@ def __init__(self, url: Union[UPath, str], **storage_options: Dict[str, Any]): assert fs.__class__.async_impl, "FileSystem needs to support async operations." def make_fs(self) -> Tuple[AsyncFileSystem, str]: + import fsspec + storage_options = self.root._kwargs.copy() storage_options.pop("_url", None) fs, root = fsspec.core.url_to_fs(str(self.root), asynchronous=True, **self.root._kwargs) From 78d0bc05c424ac3025d0b0958a72ef253fdd1a1e Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Tue, 5 Dec 2023 14:24:03 +0100 Subject: [PATCH 06/12] working on making codecs extensible --- zarr/v3/abc/codec.py | 36 ++- zarr/v3/array.py | 6 +- zarr/v3/array_v2.py | 2 +- zarr/v3/codecs.py | 514 ------------------------------- zarr/v3/codecs/__init__.py | 218 +++++++++++++ zarr/v3/codecs/blosc.py | 92 ++++++ zarr/v3/codecs/bytes.py | 98 ++++++ zarr/v3/codecs/crc32c_.py | 57 ++++ zarr/v3/codecs/gzip.py | 62 ++++ zarr/v3/codecs/registry.py | 26 ++ zarr/v3/{ => codecs}/sharding.py | 37 ++- zarr/v3/codecs/transpose.py | 106 +++++++ zarr/v3/codecs/zstd.py | 73 +++++ zarr/v3/common.py | 28 +- zarr/v3/metadata.py | 102 +----- 15 files changed, 804 insertions(+), 653 deletions(-) delete mode 100644 zarr/v3/codecs.py create mode 100644 zarr/v3/codecs/__init__.py create mode 100644 zarr/v3/codecs/blosc.py create mode 100644 zarr/v3/codecs/bytes.py create mode 100644 zarr/v3/codecs/crc32c_.py create mode 100644 zarr/v3/codecs/gzip.py create mode 100644 zarr/v3/codecs/registry.py rename zarr/v3/{ => codecs}/sharding.py (95%) create mode 100644 zarr/v3/codecs/transpose.py create mode 100644 zarr/v3/codecs/zstd.py diff --git a/zarr/v3/abc/codec.py b/zarr/v3/abc/codec.py index f84fc74af9..c856a772e0 100644 --- a/zarr/v3/abc/codec.py +++ b/zarr/v3/abc/codec.py @@ -11,20 +11,19 @@ from __future__ import annotations from abc import abstractmethod, ABC -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Optional, Type import numpy as np -from zarr.v3.common import BytesLike +from zarr.v3.common import BytesLike, SliceSelection +from zarr.v3.store import StorePath if TYPE_CHECKING: - from zarr.v3.metadata import CoreArrayMetadata + from zarr.v3.metadata import CoreArrayMetadata, CodecMetadata class Codec(ABC): - supports_partial_decode: bool - supports_partial_encode: bool is_fixed_size: bool array_metadata: CoreArrayMetadata @@ -35,6 +34,12 @@ def 
compute_encoded_size(self, input_byte_length: int) -> int: def resolve_metadata(self) -> CoreArrayMetadata: return self.array_metadata + @classmethod + def from_metadata( + cls, codec_metadata: "CodecMetadata", array_metadata: CoreArrayMetadata + ) -> "Type[Codec]": + pass + class ArrayArrayCodec(Codec): @abstractmethod @@ -68,6 +73,27 @@ async def encode( pass +class ArrayBytesCodecPartialDecodeMixin: + @abstractmethod + async def decode_partial( + self, + store_path: StorePath, + selection: SliceSelection, + ) -> Optional[np.ndarray]: + pass + + +class ArrayBytesCodecPartialEncodeMixin: + @abstractmethod + async def encode_partial( + self, + store_path: StorePath, + chunk_array: np.ndarray, + selection: SliceSelection, + ) -> None: + pass + + class BytesBytesCodec(Codec): @abstractmethod async def decode( diff --git a/zarr/v3/array.py b/zarr/v3/array.py index 3c0d7eba5c..2dc0063a50 100644 --- a/zarr/v3/array.py +++ b/zarr/v3/array.py @@ -18,6 +18,7 @@ from attr import evolve, frozen from zarr.v3.abc.array import SynchronousArray, AsynchronousArray +from zarr.v3.abc.codec import ArrayBytesCodecPartialDecodeMixin # from zarr.v3.array_v2 import ArrayV2 from zarr.v3.codecs import CodecMetadata, CodecPipeline, bytes_codec @@ -41,7 +42,7 @@ V2ChunkKeyEncodingMetadata, dtype_to_data_type, ) -from zarr.v3.sharding import ShardingCodec +from zarr.v3.codecs.sharding import ShardingCodec from zarr.v3.store import StoreLike, StorePath, make_store_path from zarr.v3.sync import sync @@ -253,7 +254,7 @@ async def _read_chunk( store_path = self.store_path / chunk_key if len(self.codec_pipeline.codecs) == 1 and isinstance( - self.codec_pipeline.codecs[0], ShardingCodec + self.codec_pipeline.codecs[0], ArrayBytesCodecPartialDecodeMixin ): chunk_array = await self.codec_pipeline.codecs[0].decode_partial( store_path, chunk_selection @@ -472,7 +473,6 @@ def open( store: StoreLike, runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), ) -> Array: - async_array = sync( AsyncArray.open(store, runtime_configuration=runtime_configuration), runtime_configuration.asyncio_loop, diff --git a/zarr/v3/array_v2.py b/zarr/v3/array_v2.py index a2f26f01b0..665e49470f 100644 --- a/zarr/v3/array_v2.py +++ b/zarr/v3/array_v2.py @@ -20,7 +20,7 @@ to_thread, ) from zarr.v3.indexing import BasicIndexer, all_chunk_coords, is_total_slice -from zarr.v3.metadata import ArrayV2Metadata, RuntimeConfiguration +from zarr.v3.metadata import ArrayV2Metadata, CodecMetadata, RuntimeConfiguration from zarr.v3.store import StoreLike, StorePath, make_store_path from zarr.v3.sync import sync diff --git a/zarr/v3/codecs.py b/zarr/v3/codecs.py deleted file mode 100644 index ff913c42b2..0000000000 --- a/zarr/v3/codecs.py +++ /dev/null @@ -1,514 +0,0 @@ -from __future__ import annotations - -from functools import reduce -from typing import TYPE_CHECKING, Iterable, List, Literal, Optional, Tuple, Union -from warnings import warn - -import numcodecs -import numpy as np -from attr import asdict, evolve, frozen -from crc32c import crc32c -from numcodecs.blosc import Blosc -from numcodecs.gzip import GZip -from zstandard import ZstdCompressor, ZstdDecompressor - -from zarr.v3.abc.codec import Codec, ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec -from zarr.v3.common import BytesLike, to_thread -from zarr.v3.metadata import ( - BloscCodecConfigurationMetadata, - BloscCodecMetadata, - BytesCodecConfigurationMetadata, - BytesCodecMetadata, - CodecMetadata, - Crc32cCodecMetadata, - GzipCodecConfigurationMetadata, - GzipCodecMetadata, - 
ShardingCodecConfigurationMetadata, - ShardingCodecMetadata, - TransposeCodecConfigurationMetadata, - TransposeCodecMetadata, - ZstdCodecConfigurationMetadata, - ZstdCodecMetadata, -) - -if TYPE_CHECKING: - from zarr.v3.metadata import CoreArrayMetadata - -# See https://zarr.readthedocs.io/en/stable/tutorial.html#configuring-blosc -numcodecs.blosc.use_threads = False - - -@frozen -class CodecPipeline: - codecs: List[Codec] - - @classmethod - def from_metadata( - cls, - codecs_metadata: Iterable[CodecMetadata], - array_metadata: CoreArrayMetadata, - ) -> CodecPipeline: - out: List[Codec] = [] - for codec_metadata in codecs_metadata or []: - if codec_metadata.name == "endian": - codec_metadata = evolve(codec_metadata, name="bytes") # type: ignore - - codec: Codec - if codec_metadata.name == "blosc": - codec = BloscCodec.from_metadata(codec_metadata, array_metadata) - elif codec_metadata.name == "gzip": - codec = GzipCodec.from_metadata(codec_metadata, array_metadata) - elif codec_metadata.name == "zstd": - codec = ZstdCodec.from_metadata(codec_metadata, array_metadata) - elif codec_metadata.name == "transpose": - codec = TransposeCodec.from_metadata(codec_metadata, array_metadata) - elif codec_metadata.name == "bytes": - codec = BytesCodec.from_metadata(codec_metadata, array_metadata) - elif codec_metadata.name == "crc32c": - codec = Crc32cCodec.from_metadata(codec_metadata, array_metadata) - elif codec_metadata.name == "sharding_indexed": - from zarr.v3.sharding import ShardingCodec - - codec = ShardingCodec.from_metadata(codec_metadata, array_metadata) - else: - raise RuntimeError(f"Unsupported codec: {codec_metadata}") - - out.append(codec) - array_metadata = codec.resolve_metadata() - CodecPipeline._validate_codecs(out, array_metadata) - return cls(out) - - @staticmethod - def _validate_codecs(codecs: List[Codec], array_metadata: CoreArrayMetadata) -> None: - from zarr.v3.sharding import ShardingCodec - - assert any( - isinstance(codec, ArrayBytesCodec) for codec in codecs - ), "Exactly one array-to-bytes codec is required." - - prev_codec: Optional[Codec] = None - for codec in codecs: - if prev_codec is not None: - assert not isinstance(codec, ArrayBytesCodec) or not isinstance( - prev_codec, ArrayBytesCodec - ), ( - f"ArrayBytesCodec '{type(codec)}' cannot follow after " - + f"ArrayBytesCodec '{type(prev_codec)}' because exactly " - + "1 ArrayBytesCodec is allowed." - ) - assert not isinstance(codec, ArrayBytesCodec) or not isinstance( - prev_codec, BytesBytesCodec - ), ( - f"ArrayBytesCodec '{type(codec)}' cannot follow after " - + f"BytesBytesCodec '{type(prev_codec)}'." - ) - assert not isinstance(codec, ArrayArrayCodec) or not isinstance( - prev_codec, ArrayBytesCodec - ), ( - f"ArrayArrayCodec '{type(codec)}' cannot follow after " - + f"ArrayBytesCodec '{type(prev_codec)}'." - ) - assert not isinstance(codec, ArrayArrayCodec) or not isinstance( - prev_codec, BytesBytesCodec - ), ( - f"ArrayArrayCodec '{type(codec)}' cannot follow after " - + f"BytesBytesCodec '{type(prev_codec)}'." - ) - - if isinstance(codec, ShardingCodec): - assert len(codec.configuration.chunk_shape) == len(array_metadata.shape), ( - "The shard's `chunk_shape` and array's `shape` need to have the " - + "same number of dimensions." - ) - assert all( - s % c == 0 - for s, c in zip( - array_metadata.chunk_shape, - codec.configuration.chunk_shape, - ) - ), ( - "The array's `chunk_shape` needs to be divisible by the " - + "shard's inner `chunk_shape`." 
- ) - prev_codec = codec - - if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(codecs) > 1: - warn( - "Combining a `sharding_indexed` codec disables partial reads and " - + "writes, which may lead to inefficient performance." - ) - - def _array_array_codecs(self) -> List[ArrayArrayCodec]: - return [codec for codec in self.codecs if isinstance(codec, ArrayArrayCodec)] - - def _array_bytes_codec(self) -> ArrayBytesCodec: - return next(codec for codec in self.codecs if isinstance(codec, ArrayBytesCodec)) - - def _bytes_bytes_codecs(self) -> List[BytesBytesCodec]: - return [codec for codec in self.codecs if isinstance(codec, BytesBytesCodec)] - - async def decode(self, chunk_bytes: BytesLike) -> np.ndarray: - for bb_codec in self._bytes_bytes_codecs()[::-1]: - chunk_bytes = await bb_codec.decode(chunk_bytes) - - chunk_array = await self._array_bytes_codec().decode(chunk_bytes) - - for aa_codec in self._array_array_codecs()[::-1]: - chunk_array = await aa_codec.decode(chunk_array) - - return chunk_array - - async def encode(self, chunk_array: np.ndarray) -> Optional[BytesLike]: - for aa_codec in self._array_array_codecs(): - chunk_array_maybe = await aa_codec.encode(chunk_array) - if chunk_array_maybe is None: - return None - chunk_array = chunk_array_maybe - - chunk_bytes_maybe = await self._array_bytes_codec().encode(chunk_array) - if chunk_bytes_maybe is None: - return None - chunk_bytes = chunk_bytes_maybe - - for bb_codec in self._bytes_bytes_codecs(): - chunk_bytes_maybe = await bb_codec.encode(chunk_bytes) - if chunk_bytes_maybe is None: - return None - chunk_bytes = chunk_bytes_maybe - - return chunk_bytes - - def compute_encoded_size(self, byte_length: int) -> int: - return reduce(lambda acc, codec: codec.compute_encoded_size(acc), self.codecs, byte_length) - - -@frozen -class BloscCodec(BytesBytesCodec): - array_metadata: CoreArrayMetadata - configuration: BloscCodecConfigurationMetadata - blosc_codec: Blosc - is_fixed_size = False - - @classmethod - def from_metadata( - cls, codec_metadata: BloscCodecMetadata, array_metadata: CoreArrayMetadata - ) -> BloscCodec: - configuration = codec_metadata.configuration - if configuration.typesize == 0: - configuration = evolve(configuration, typesize=array_metadata.data_type.byte_count) - config_dict = asdict(codec_metadata.configuration) - config_dict.pop("typesize", None) - map_shuffle_str_to_int = {"noshuffle": 0, "shuffle": 1, "bitshuffle": 2} - config_dict["shuffle"] = map_shuffle_str_to_int[config_dict["shuffle"]] - return cls( - array_metadata=array_metadata, - configuration=configuration, - blosc_codec=Blosc.from_config(config_dict), - ) - - async def decode( - self, - chunk_bytes: bytes, - ) -> BytesLike: - return await to_thread(self.blosc_codec.decode, chunk_bytes) - - async def encode( - self, - chunk_bytes: bytes, - ) -> Optional[BytesLike]: - chunk_array = np.frombuffer(chunk_bytes, dtype=self.array_metadata.dtype) - return await to_thread(self.blosc_codec.encode, chunk_array) - - def compute_encoded_size(self, _input_byte_length: int) -> int: - raise NotImplementedError - - -@frozen -class BytesCodec(ArrayBytesCodec): - array_metadata: CoreArrayMetadata - configuration: BytesCodecConfigurationMetadata - is_fixed_size = True - - @classmethod - def from_metadata( - cls, codec_metadata: BytesCodecMetadata, array_metadata: CoreArrayMetadata - ) -> BytesCodec: - assert ( - array_metadata.dtype.itemsize == 1 or codec_metadata.configuration.endian is not None - ), "The `endian` configuration needs to be specified for 
multi-byte data types." - return cls( - array_metadata=array_metadata, - configuration=codec_metadata.configuration, - ) - - def _get_byteorder(self, array: np.ndarray) -> Literal["big", "little"]: - if array.dtype.byteorder == "<": - return "little" - elif array.dtype.byteorder == ">": - return "big" - else: - import sys - - return sys.byteorder - - async def decode( - self, - chunk_bytes: BytesLike, - ) -> np.ndarray: - if self.array_metadata.dtype.itemsize > 0: - if self.configuration.endian == "little": - prefix = "<" - else: - prefix = ">" - dtype = np.dtype(f"{prefix}{self.array_metadata.data_type.to_numpy_shortname()}") - else: - dtype = np.dtype(f"|{self.array_metadata.data_type.to_numpy_shortname()}") - chunk_array = np.frombuffer(chunk_bytes, dtype) - - # ensure correct chunk shape - if chunk_array.shape != self.array_metadata.chunk_shape: - chunk_array = chunk_array.reshape( - self.array_metadata.chunk_shape, - ) - return chunk_array - - async def encode( - self, - chunk_array: np.ndarray, - ) -> Optional[BytesLike]: - if chunk_array.dtype.itemsize > 1: - byteorder = self._get_byteorder(chunk_array) - if self.configuration.endian != byteorder: - new_dtype = chunk_array.dtype.newbyteorder(self.configuration.endian) - chunk_array = chunk_array.astype(new_dtype) - return chunk_array.tobytes() - - def compute_encoded_size(self, input_byte_length: int) -> int: - return input_byte_length - - -@frozen -class TransposeCodec(ArrayArrayCodec): - array_metadata: CoreArrayMetadata - order: Tuple[int, ...] - is_fixed_size = True - - @classmethod - def from_metadata( - cls, codec_metadata: TransposeCodecMetadata, array_metadata: CoreArrayMetadata - ) -> TransposeCodec: - configuration = codec_metadata.configuration - if configuration.order == "F": - order = tuple(array_metadata.ndim - x - 1 for x in range(array_metadata.ndim)) - - elif configuration.order == "C": - order = tuple(range(array_metadata.ndim)) - - else: - assert len(configuration.order) == array_metadata.ndim, ( - "The `order` tuple needs have as many entries as " - + f"there are dimensions in the array. Got: {configuration.order}" - ) - assert len(configuration.order) == len(set(configuration.order)), ( - "There must not be duplicates in the `order` tuple. " - + f"Got: {configuration.order}" - ) - assert all(0 <= x < array_metadata.ndim for x in configuration.order), ( - "All entries in the `order` tuple must be between 0 and " - + f"the number of dimensions in the array. 
Got: {configuration.order}" - ) - order = tuple(configuration.order) - - return cls( - array_metadata=array_metadata, - order=order, - ) - - def resolve_metadata(self) -> CoreArrayMetadata: - from zarr.v3.metadata import CoreArrayMetadata - - return CoreArrayMetadata( - shape=tuple( - self.array_metadata.shape[self.order[i]] for i in range(self.array_metadata.ndim) - ), - chunk_shape=tuple( - self.array_metadata.chunk_shape[self.order[i]] - for i in range(self.array_metadata.ndim) - ), - data_type=self.array_metadata.data_type, - fill_value=self.array_metadata.fill_value, - runtime_configuration=self.array_metadata.runtime_configuration, - ) - - async def decode( - self, - chunk_array: np.ndarray, - ) -> np.ndarray: - inverse_order = [0 for _ in range(self.array_metadata.ndim)] - for x, i in enumerate(self.order): - inverse_order[x] = i - chunk_array = chunk_array.transpose(inverse_order) - return chunk_array - - async def encode( - self, - chunk_array: np.ndarray, - ) -> Optional[np.ndarray]: - chunk_array = chunk_array.transpose(self.order) - return chunk_array - - def compute_encoded_size(self, input_byte_length: int) -> int: - return input_byte_length - - -@frozen -class GzipCodec(BytesBytesCodec): - array_metadata: CoreArrayMetadata - configuration: GzipCodecConfigurationMetadata - is_fixed_size = True - - @classmethod - def from_metadata( - cls, codec_metadata: GzipCodecMetadata, array_metadata: CoreArrayMetadata - ) -> GzipCodec: - return cls( - array_metadata=array_metadata, - configuration=codec_metadata.configuration, - ) - - async def decode( - self, - chunk_bytes: bytes, - ) -> BytesLike: - return await to_thread(GZip(self.configuration.level).decode, chunk_bytes) - - async def encode( - self, - chunk_bytes: bytes, - ) -> Optional[BytesLike]: - return await to_thread(GZip(self.configuration.level).encode, chunk_bytes) - - def compute_encoded_size(self, _input_byte_length: int) -> int: - raise NotImplementedError - - -@frozen -class ZstdCodec(BytesBytesCodec): - array_metadata: CoreArrayMetadata - configuration: ZstdCodecConfigurationMetadata - is_fixed_size = True - - @classmethod - def from_metadata( - cls, codec_metadata: ZstdCodecMetadata, array_metadata: CoreArrayMetadata - ) -> ZstdCodec: - return cls( - array_metadata=array_metadata, - configuration=codec_metadata.configuration, - ) - - def _compress(self, data: bytes) -> bytes: - ctx = ZstdCompressor( - level=self.configuration.level, write_checksum=self.configuration.checksum - ) - return ctx.compress(data) - - def _decompress(self, data: bytes) -> bytes: - ctx = ZstdDecompressor() - return ctx.decompress(data) - - async def decode( - self, - chunk_bytes: bytes, - ) -> BytesLike: - return await to_thread(self._decompress, chunk_bytes) - - async def encode( - self, - chunk_bytes: bytes, - ) -> Optional[BytesLike]: - return await to_thread(self._compress, chunk_bytes) - - def compute_encoded_size(self, _input_byte_length: int) -> int: - raise NotImplementedError - - -@frozen -class Crc32cCodec(BytesBytesCodec): - array_metadata: CoreArrayMetadata - is_fixed_size = True - - @classmethod - def from_metadata( - cls, codec_metadata: Crc32cCodecMetadata, array_metadata: CoreArrayMetadata - ) -> Crc32cCodec: - return cls(array_metadata=array_metadata) - - async def decode( - self, - chunk_bytes: bytes, - ) -> BytesLike: - crc32_bytes = chunk_bytes[-4:] - inner_bytes = chunk_bytes[:-4] - - assert np.uint32(crc32c(inner_bytes)).tobytes() == bytes(crc32_bytes) - return inner_bytes - - async def encode( - self, - chunk_bytes: 
bytes, - ) -> Optional[BytesLike]: - return chunk_bytes + np.uint32(crc32c(chunk_bytes)).tobytes() - - def compute_encoded_size(self, input_byte_length: int) -> int: - return input_byte_length + 4 - - -def blosc_codec( - typesize: int, - cname: Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] = "zstd", - clevel: int = 5, - shuffle: Literal["noshuffle", "shuffle", "bitshuffle"] = "noshuffle", - blocksize: int = 0, -) -> BloscCodecMetadata: - return BloscCodecMetadata( - configuration=BloscCodecConfigurationMetadata( - cname=cname, - clevel=clevel, - shuffle=shuffle, - blocksize=blocksize, - typesize=typesize, - ) - ) - - -def bytes_codec(endian: Optional[Literal["big", "little"]] = "little") -> BytesCodecMetadata: - return BytesCodecMetadata(configuration=BytesCodecConfigurationMetadata(endian)) - - -def transpose_codec(order: Union[Tuple[int, ...], Literal["C", "F"]]) -> TransposeCodecMetadata: - return TransposeCodecMetadata(configuration=TransposeCodecConfigurationMetadata(order)) - - -def gzip_codec(level: int = 5) -> GzipCodecMetadata: - return GzipCodecMetadata(configuration=GzipCodecConfigurationMetadata(level)) - - -def zstd_codec(level: int = 0, checksum: bool = False) -> ZstdCodecMetadata: - return ZstdCodecMetadata(configuration=ZstdCodecConfigurationMetadata(level, checksum)) - - -def crc32c_codec() -> Crc32cCodecMetadata: - return Crc32cCodecMetadata() - - -def sharding_codec( - chunk_shape: Tuple[int, ...], - codecs: Optional[List[CodecMetadata]] = None, - index_codecs: Optional[List[CodecMetadata]] = None, -) -> ShardingCodecMetadata: - codecs = codecs or [bytes_codec()] - index_codecs = index_codecs or [bytes_codec(), crc32c_codec()] - return ShardingCodecMetadata( - configuration=ShardingCodecConfigurationMetadata(chunk_shape, codecs, index_codecs) - ) diff --git a/zarr/v3/codecs/__init__.py b/zarr/v3/codecs/__init__.py new file mode 100644 index 0000000000..fcfef4e233 --- /dev/null +++ b/zarr/v3/codecs/__init__.py @@ -0,0 +1,218 @@ +from __future__ import annotations + +from functools import reduce +from typing import ( + TYPE_CHECKING, + Iterable, + List, + Literal, + Optional, + Tuple, + Union, +) +from warnings import warn + +import numpy as np +from attr import frozen + +from zarr.v3.abc.codec import Codec, ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec +from zarr.v3.common import BytesLike +from zarr.v3.metadata import CodecMetadata +from zarr.v3.codecs.registry import get_codec_class + +if TYPE_CHECKING: + from zarr.v3.metadata import CoreArrayMetadata + from zarr.v3.codecs.sharding import ShardingCodecMetadata + from zarr.v3.codecs.blosc import BloscCodecMetadata + from zarr.v3.codecs.bytes import BytesCodecMetadata + from zarr.v3.codecs.transpose import TransposeCodecMetadata + from zarr.v3.codecs.gzip import GzipCodecMetadata + from zarr.v3.codecs.zstd import ZstdCodecMetadata + from zarr.v3.codecs.crc32c_ import Crc32cCodecMetadata + + +@frozen +class CodecPipeline: + codecs: List[Codec] + + @classmethod + def from_metadata( + cls, + codecs_metadata: Iterable[CodecMetadata], + array_metadata: CoreArrayMetadata, + ) -> CodecPipeline: + out: List[Codec] = [] + for codec_metadata in codecs_metadata or []: + codec_cls = get_codec_class(codec_metadata.name) + codec = codec_cls.from_metadata(codec_metadata, array_metadata) + out.append(codec) + array_metadata = codec.resolve_metadata() + CodecPipeline._validate_codecs(out, array_metadata) + return cls(out) + + @staticmethod + def _validate_codecs(codecs: List[Codec], array_metadata: CoreArrayMetadata) -> 
None: + from zarr.v3.codecs.sharding import ShardingCodec + + assert any( + isinstance(codec, ArrayBytesCodec) for codec in codecs + ), "Exactly one array-to-bytes codec is required." + + prev_codec: Optional[Codec] = None + for codec in codecs: + if prev_codec is not None: + assert not isinstance(codec, ArrayBytesCodec) or not isinstance( + prev_codec, ArrayBytesCodec + ), ( + f"ArrayBytesCodec '{type(codec)}' cannot follow after " + + f"ArrayBytesCodec '{type(prev_codec)}' because exactly " + + "1 ArrayBytesCodec is allowed." + ) + assert not isinstance(codec, ArrayBytesCodec) or not isinstance( + prev_codec, BytesBytesCodec + ), ( + f"ArrayBytesCodec '{type(codec)}' cannot follow after " + + f"BytesBytesCodec '{type(prev_codec)}'." + ) + assert not isinstance(codec, ArrayArrayCodec) or not isinstance( + prev_codec, ArrayBytesCodec + ), ( + f"ArrayArrayCodec '{type(codec)}' cannot follow after " + + f"ArrayBytesCodec '{type(prev_codec)}'." + ) + assert not isinstance(codec, ArrayArrayCodec) or not isinstance( + prev_codec, BytesBytesCodec + ), ( + f"ArrayArrayCodec '{type(codec)}' cannot follow after " + + f"BytesBytesCodec '{type(prev_codec)}'." + ) + + if isinstance(codec, ShardingCodec): + assert len(codec.configuration.chunk_shape) == len(array_metadata.shape), ( + "The shard's `chunk_shape` and array's `shape` need to have the " + + "same number of dimensions." + ) + assert all( + s % c == 0 + for s, c in zip( + array_metadata.chunk_shape, + codec.configuration.chunk_shape, + ) + ), ( + "The array's `chunk_shape` needs to be divisible by the " + + "shard's inner `chunk_shape`." + ) + prev_codec = codec + + if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(codecs) > 1: + warn( + "Combining a `sharding_indexed` codec disables partial reads and " + + "writes, which may lead to inefficient performance." 
+ ) + + def _array_array_codecs(self) -> List[ArrayArrayCodec]: + return [codec for codec in self.codecs if isinstance(codec, ArrayArrayCodec)] + + def _array_bytes_codec(self) -> ArrayBytesCodec: + return next(codec for codec in self.codecs if isinstance(codec, ArrayBytesCodec)) + + def _bytes_bytes_codecs(self) -> List[BytesBytesCodec]: + return [codec for codec in self.codecs if isinstance(codec, BytesBytesCodec)] + + async def decode(self, chunk_bytes: BytesLike) -> np.ndarray: + for bb_codec in self._bytes_bytes_codecs()[::-1]: + chunk_bytes = await bb_codec.decode(chunk_bytes) + + chunk_array = await self._array_bytes_codec().decode(chunk_bytes) + + for aa_codec in self._array_array_codecs()[::-1]: + chunk_array = await aa_codec.decode(chunk_array) + + return chunk_array + + async def encode(self, chunk_array: np.ndarray) -> Optional[BytesLike]: + for aa_codec in self._array_array_codecs(): + chunk_array_maybe = await aa_codec.encode(chunk_array) + if chunk_array_maybe is None: + return None + chunk_array = chunk_array_maybe + + chunk_bytes_maybe = await self._array_bytes_codec().encode(chunk_array) + if chunk_bytes_maybe is None: + return None + chunk_bytes = chunk_bytes_maybe + + for bb_codec in self._bytes_bytes_codecs(): + chunk_bytes_maybe = await bb_codec.encode(chunk_bytes) + if chunk_bytes_maybe is None: + return None + chunk_bytes = chunk_bytes_maybe + + return chunk_bytes + + def compute_encoded_size(self, byte_length: int) -> int: + return reduce(lambda acc, codec: codec.compute_encoded_size(acc), self.codecs, byte_length) + + +def blosc_codec( + typesize: int, + cname: Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] = "zstd", + clevel: int = 5, + shuffle: Literal["noshuffle", "shuffle", "bitshuffle"] = "noshuffle", + blocksize: int = 0, +) -> "BloscCodecMetadata": + from zarr.v3.codecs.blosc import BloscCodecMetadata, BloscCodecConfigurationMetadata + + return BloscCodecMetadata( + configuration=BloscCodecConfigurationMetadata( + cname=cname, + clevel=clevel, + shuffle=shuffle, + blocksize=blocksize, + typesize=typesize, + ) + ) + + +def bytes_codec(endian: Optional[Literal["big", "little"]] = "little") -> "BytesCodecMetadata": + from zarr.v3.codecs.bytes import BytesCodecMetadata, BytesCodecConfigurationMetadata + + return BytesCodecMetadata(configuration=BytesCodecConfigurationMetadata(endian)) + + +def transpose_codec(order: Union[Tuple[int, ...], Literal["C", "F"]]) -> "TransposeCodecMetadata": + from zarr.v3.codecs.transpose import TransposeCodecMetadata, TransposeCodecConfigurationMetadata + + return TransposeCodecMetadata(configuration=TransposeCodecConfigurationMetadata(order)) + + +def gzip_codec(level: int = 5) -> "GzipCodecMetadata": + from zarr.v3.codecs.gzip import GzipCodecMetadata, GzipCodecConfigurationMetadata + + return GzipCodecMetadata(configuration=GzipCodecConfigurationMetadata(level)) + + +def zstd_codec(level: int = 0, checksum: bool = False) -> "ZstdCodecMetadata": + from zarr.v3.codecs.zstd import ZstdCodecMetadata, ZstdCodecConfigurationMetadata + + return ZstdCodecMetadata(configuration=ZstdCodecConfigurationMetadata(level, checksum)) + + +def crc32c_codec() -> "Crc32cCodecMetadata": + from zarr.v3.codecs.crc32c_ import Crc32cCodecMetadata + + return Crc32cCodecMetadata() + + +def sharding_codec( + chunk_shape: Tuple[int, ...], + codecs: Optional[List[CodecMetadata]] = None, + index_codecs: Optional[List[CodecMetadata]] = None, +) -> "ShardingCodecMetadata": + from zarr.v3.codecs.sharding import ShardingCodecMetadata, 
ShardingCodecConfigurationMetadata + + codecs = codecs or [bytes_codec()] + index_codecs = index_codecs or [bytes_codec(), crc32c_codec()] + return ShardingCodecMetadata( + configuration=ShardingCodecConfigurationMetadata(chunk_shape, codecs, index_codecs) + ) diff --git a/zarr/v3/codecs/blosc.py b/zarr/v3/codecs/blosc.py new file mode 100644 index 0000000000..4697512037 --- /dev/null +++ b/zarr/v3/codecs/blosc.py @@ -0,0 +1,92 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Dict, + Literal, + Optional, +) + +import numcodecs +import numpy as np +from attr import asdict, evolve, frozen, field +from numcodecs.blosc import Blosc + +from zarr.v3.abc.codec import BytesBytesCodec +from zarr.v3.codecs.registry import register_codec +from zarr.v3.common import BytesLike, to_thread + +if TYPE_CHECKING: + from zarr.v3.metadata import CoreArrayMetadata + + +BloscShuffle = Literal["noshuffle", "shuffle", "bitshuffle"] + +# See https://zarr.readthedocs.io/en/stable/tutorial.html#configuring-blosc +numcodecs.blosc.use_threads = False + + +@frozen +class BloscCodecConfigurationMetadata: + typesize: int + cname: Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] = "zstd" + clevel: int = 5 + shuffle: BloscShuffle = "noshuffle" + blocksize: int = 0 + + +blosc_shuffle_int_to_str: Dict[int, BloscShuffle] = { + 0: "noshuffle", + 1: "shuffle", + 2: "bitshuffle", +} + + +@frozen +class BloscCodecMetadata: + configuration: BloscCodecConfigurationMetadata + name: Literal["blosc"] = field(default="blosc", init=False) + + +@frozen +class BloscCodec(BytesBytesCodec): + array_metadata: CoreArrayMetadata + configuration: BloscCodecConfigurationMetadata + blosc_codec: Blosc + is_fixed_size = False + + @classmethod + def from_metadata( + cls, codec_metadata: BloscCodecMetadata, array_metadata: CoreArrayMetadata + ) -> BloscCodec: + configuration = codec_metadata.configuration + if configuration.typesize == 0: + configuration = evolve(configuration, typesize=array_metadata.data_type.byte_count) + config_dict = asdict(codec_metadata.configuration) + config_dict.pop("typesize", None) + map_shuffle_str_to_int = {"noshuffle": 0, "shuffle": 1, "bitshuffle": 2} + config_dict["shuffle"] = map_shuffle_str_to_int[config_dict["shuffle"]] + return cls( + array_metadata=array_metadata, + configuration=configuration, + blosc_codec=Blosc.from_config(config_dict), + ) + + async def decode( + self, + chunk_bytes: bytes, + ) -> BytesLike: + return await to_thread(self.blosc_codec.decode, chunk_bytes) + + async def encode( + self, + chunk_bytes: bytes, + ) -> Optional[BytesLike]: + chunk_array = np.frombuffer(chunk_bytes, dtype=self.array_metadata.dtype) + return await to_thread(self.blosc_codec.encode, chunk_array) + + def compute_encoded_size(self, _input_byte_length: int) -> int: + raise NotImplementedError + + +register_codec("blosc", BloscCodec, BloscCodecMetadata) diff --git a/zarr/v3/codecs/bytes.py b/zarr/v3/codecs/bytes.py new file mode 100644 index 0000000000..d180b5fd43 --- /dev/null +++ b/zarr/v3/codecs/bytes.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Literal, + Optional, +) + +import numpy as np +from attr import frozen, field + +from zarr.v3.abc.codec import ArrayBytesCodec +from zarr.v3.codecs.registry import register_codec +from zarr.v3.common import BytesLike + +if TYPE_CHECKING: + from zarr.v3.metadata import CoreArrayMetadata + + +@frozen +class BytesCodecConfigurationMetadata: + endian: Optional[Literal["big", "little"]] 
= "little" + + +@frozen +class BytesCodecMetadata: + configuration: BytesCodecConfigurationMetadata + name: Literal["bytes"] = field(default="bytes", init=False) + + +@frozen +class BytesCodec(ArrayBytesCodec): + array_metadata: CoreArrayMetadata + configuration: BytesCodecConfigurationMetadata + is_fixed_size = True + + @classmethod + def from_metadata( + cls, codec_metadata: BytesCodecMetadata, array_metadata: CoreArrayMetadata + ) -> BytesCodec: + assert ( + array_metadata.dtype.itemsize == 1 or codec_metadata.configuration.endian is not None + ), "The `endian` configuration needs to be specified for multi-byte data types." + return cls( + array_metadata=array_metadata, + configuration=codec_metadata.configuration, + ) + + def _get_byteorder(self, array: np.ndarray) -> Literal["big", "little"]: + if array.dtype.byteorder == "<": + return "little" + elif array.dtype.byteorder == ">": + return "big" + else: + import sys + + return sys.byteorder + + async def decode( + self, + chunk_bytes: BytesLike, + ) -> np.ndarray: + if self.array_metadata.dtype.itemsize > 0: + if self.configuration.endian == "little": + prefix = "<" + else: + prefix = ">" + dtype = np.dtype(f"{prefix}{self.array_metadata.data_type.to_numpy_shortname()}") + else: + dtype = np.dtype(f"|{self.array_metadata.data_type.to_numpy_shortname()}") + chunk_array = np.frombuffer(chunk_bytes, dtype) + + # ensure correct chunk shape + if chunk_array.shape != self.array_metadata.chunk_shape: + chunk_array = chunk_array.reshape( + self.array_metadata.chunk_shape, + ) + return chunk_array + + async def encode( + self, + chunk_array: np.ndarray, + ) -> Optional[BytesLike]: + if chunk_array.dtype.itemsize > 1: + byteorder = self._get_byteorder(chunk_array) + if self.configuration.endian != byteorder: + new_dtype = chunk_array.dtype.newbyteorder(self.configuration.endian) + chunk_array = chunk_array.astype(new_dtype) + return chunk_array.tobytes() + + def compute_encoded_size(self, input_byte_length: int) -> int: + return input_byte_length + + +register_codec("bytes", BytesCodec, BytesCodecMetadata) + +# compatibility with earlier versions of ZEP1 +register_codec("endian", BytesCodec, BytesCodecMetadata) diff --git a/zarr/v3/codecs/crc32c_.py b/zarr/v3/codecs/crc32c_.py new file mode 100644 index 0000000000..6272c6d2e4 --- /dev/null +++ b/zarr/v3/codecs/crc32c_.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Literal, + Optional, +) + +import numpy as np +from attr import frozen, field +from crc32c import crc32c + +from zarr.v3.abc.codec import BytesBytesCodec +from zarr.v3.codecs.registry import register_codec +from zarr.v3.common import BytesLike + +if TYPE_CHECKING: + from zarr.v3.metadata import CoreArrayMetadata + + +@frozen +class Crc32cCodecMetadata: + name: Literal["crc32c"] = field(default="crc32c", init=False) + + +@frozen +class Crc32cCodec(BytesBytesCodec): + array_metadata: CoreArrayMetadata + is_fixed_size = True + + @classmethod + def from_metadata( + cls, codec_metadata: Crc32cCodecMetadata, array_metadata: CoreArrayMetadata + ) -> Crc32cCodec: + return cls(array_metadata=array_metadata) + + async def decode( + self, + chunk_bytes: bytes, + ) -> BytesLike: + crc32_bytes = chunk_bytes[-4:] + inner_bytes = chunk_bytes[:-4] + + assert np.uint32(crc32c(inner_bytes)).tobytes() == bytes(crc32_bytes) + return inner_bytes + + async def encode( + self, + chunk_bytes: bytes, + ) -> Optional[BytesLike]: + return chunk_bytes + np.uint32(crc32c(chunk_bytes)).tobytes() + + def 
compute_encoded_size(self, input_byte_length: int) -> int: + return input_byte_length + 4 + + +register_codec("crc32c", Crc32cCodec, Crc32cCodecMetadata) diff --git a/zarr/v3/codecs/gzip.py b/zarr/v3/codecs/gzip.py new file mode 100644 index 0000000000..7275bc825d --- /dev/null +++ b/zarr/v3/codecs/gzip.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Literal, + Optional, +) + +from attr import frozen, field +from numcodecs.gzip import GZip + +from zarr.v3.abc.codec import BytesBytesCodec +from zarr.v3.codecs.registry import register_codec +from zarr.v3.common import BytesLike, to_thread + +if TYPE_CHECKING: + from zarr.v3.metadata import CoreArrayMetadata + + +@frozen +class GzipCodecConfigurationMetadata: + level: int = 5 + + +@frozen +class GzipCodecMetadata: + configuration: GzipCodecConfigurationMetadata + name: Literal["gzip"] = field(default="gzip", init=False) + + +@frozen +class GzipCodec(BytesBytesCodec): + array_metadata: CoreArrayMetadata + configuration: GzipCodecConfigurationMetadata + is_fixed_size = True + + @classmethod + def from_metadata( + cls, codec_metadata: GzipCodecMetadata, array_metadata: CoreArrayMetadata + ) -> GzipCodec: + return cls( + array_metadata=array_metadata, + configuration=codec_metadata.configuration, + ) + + async def decode( + self, + chunk_bytes: bytes, + ) -> BytesLike: + return await to_thread(GZip(self.configuration.level).decode, chunk_bytes) + + async def encode( + self, + chunk_bytes: bytes, + ) -> Optional[BytesLike]: + return await to_thread(GZip(self.configuration.level).encode, chunk_bytes) + + def compute_encoded_size(self, _input_byte_length: int) -> int: + raise NotImplementedError + + +register_codec("gzip", GzipCodec, GzipCodecMetadata) diff --git a/zarr/v3/codecs/registry.py b/zarr/v3/codecs/registry.py new file mode 100644 index 0000000000..276ae08f43 --- /dev/null +++ b/zarr/v3/codecs/registry.py @@ -0,0 +1,26 @@ +from typing import Dict, NamedTuple, Type + +from zarr.v3.abc.codec import Codec +from zarr.v3.metadata import CodecMetadata + + +class CodecRegistryItem(NamedTuple): + codec_cls: Type[Codec] + codec_metadata_cls: Type[CodecMetadata] + + +__codec_registry: Dict[str, CodecRegistryItem] = {} + + +def register_codec( + key: str, codec_cls: Type[Codec], codec_metadata_cls: Type[CodecMetadata] +) -> None: + __codec_registry[key] = CodecRegistryItem(codec_cls, codec_metadata_cls) + + +def get_codec_metadata_class(key: str) -> Type[CodecMetadata]: + return __codec_registry[key].codec_metadata_cls + + +def get_codec_class(key: str) -> Type[Codec]: + return __codec_registry[key].codec_cls diff --git a/zarr/v3/sharding.py b/zarr/v3/codecs/sharding.py similarity index 95% rename from zarr/v3/sharding.py rename to zarr/v3/codecs/sharding.py index 3c5b4bd12d..423d4081f9 100644 --- a/zarr/v3/sharding.py +++ b/zarr/v3/codecs/sharding.py @@ -1,11 +1,17 @@ from __future__ import annotations -from typing import Iterator, List, Mapping, NamedTuple, Optional, Set, Tuple +from typing import Iterator, List, Literal, Mapping, NamedTuple, Optional, Set, Tuple +from attr import field, frozen import numpy as np -from attrs import frozen +from zarr.v3.abc.codec import ( + ArrayBytesCodec, + ArrayBytesCodecPartialDecodeMixin, + ArrayBytesCodecPartialEncodeMixin, +) -from zarr.v3.codecs import ArrayBytesCodec, CodecPipeline +from zarr.v3.codecs import CodecPipeline +from zarr.v3.codecs.registry import register_codec from zarr.v3.common import ( BytesLike, ChunkCoords, @@ -22,14 +28,26 @@ from 
zarr.v3.metadata import ( CoreArrayMetadata, DataType, - ShardingCodecConfigurationMetadata, - ShardingCodecMetadata, + CodecMetadata, ) from zarr.v3.store import StorePath MAX_UINT_64 = 2**64 - 1 +@frozen +class ShardingCodecConfigurationMetadata: + chunk_shape: ChunkCoords + codecs: List["CodecMetadata"] + index_codecs: List["CodecMetadata"] + + +@frozen +class ShardingCodecMetadata: + configuration: ShardingCodecConfigurationMetadata + name: Literal["sharding_indexed"] = field(default="sharding_indexed", init=False) + + class _ShardIndex(NamedTuple): # dtype uint64, shape (chunks_per_shard_0, chunks_per_shard_1, ..., 2) offsets_and_lengths: np.ndarray @@ -162,7 +180,9 @@ def finalize(self, index_bytes: BytesLike) -> BytesLike: @frozen -class ShardingCodec(ArrayBytesCodec): +class ShardingCodec( + ArrayBytesCodec, ArrayBytesCodecPartialDecodeMixin, ArrayBytesCodecPartialEncodeMixin +): array_metadata: CoreArrayMetadata configuration: ShardingCodecConfigurationMetadata codec_pipeline: CodecPipeline @@ -260,7 +280,7 @@ async def decode_partial( store_path: StorePath, selection: SliceSelection, ) -> Optional[np.ndarray]: - # print("decode_partial") + print("decode_partial") shard_shape = self.array_metadata.chunk_shape chunk_shape = self.configuration.chunk_shape @@ -514,3 +534,6 @@ async def _load_full_shard_maybe(self, store_path: StorePath) -> Optional[_Shard def compute_encoded_size(self, input_byte_length: int) -> int: return input_byte_length + self._shard_index_size() + + +register_codec("sharding_indexed", ShardingCodec, ShardingCodecMetadata) diff --git a/zarr/v3/codecs/transpose.py b/zarr/v3/codecs/transpose.py new file mode 100644 index 0000000000..73c4bcbb96 --- /dev/null +++ b/zarr/v3/codecs/transpose.py @@ -0,0 +1,106 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Literal, + Optional, + Tuple, +) + +import numpy as np +from attr import frozen, field + +from zarr.v3.abc.codec import ArrayArrayCodec +from zarr.v3.codecs.registry import register_codec + +if TYPE_CHECKING: + from zarr.v3.metadata import CoreArrayMetadata + + +@frozen +class TransposeCodecConfigurationMetadata: + order: Tuple[int, ...] + + +@frozen +class TransposeCodecMetadata: + configuration: TransposeCodecConfigurationMetadata + name: Literal["transpose"] = field(default="transpose", init=False) + + +@frozen +class TransposeCodec(ArrayArrayCodec): + array_metadata: CoreArrayMetadata + order: Tuple[int, ...] + is_fixed_size = True + + @classmethod + def from_metadata( + cls, codec_metadata: TransposeCodecMetadata, array_metadata: CoreArrayMetadata + ) -> TransposeCodec: + configuration = codec_metadata.configuration + # Compatibility with older version of ZEP1 + if configuration.order == "F": # type: ignore + order = tuple(array_metadata.ndim - x - 1 for x in range(array_metadata.ndim)) + + elif configuration.order == "C": # type: ignore + order = tuple(range(array_metadata.ndim)) + + else: + assert len(configuration.order) == array_metadata.ndim, ( + "The `order` tuple needs have as many entries as " + + f"there are dimensions in the array. Got: {configuration.order}" + ) + assert len(configuration.order) == len(set(configuration.order)), ( + "There must not be duplicates in the `order` tuple. " + + f"Got: {configuration.order}" + ) + assert all(0 <= x < array_metadata.ndim for x in configuration.order), ( + "All entries in the `order` tuple must be between 0 and " + + f"the number of dimensions in the array. 
Got: {configuration.order}" + ) + order = tuple(configuration.order) + + return cls( + array_metadata=array_metadata, + order=order, + ) + + def resolve_metadata(self) -> CoreArrayMetadata: + from zarr.v3.metadata import CoreArrayMetadata + + return CoreArrayMetadata( + shape=tuple( + self.array_metadata.shape[self.order[i]] for i in range(self.array_metadata.ndim) + ), + chunk_shape=tuple( + self.array_metadata.chunk_shape[self.order[i]] + for i in range(self.array_metadata.ndim) + ), + data_type=self.array_metadata.data_type, + fill_value=self.array_metadata.fill_value, + runtime_configuration=self.array_metadata.runtime_configuration, + ) + + async def decode( + self, + chunk_array: np.ndarray, + ) -> np.ndarray: + inverse_order = [0 for _ in range(self.array_metadata.ndim)] + for x, i in enumerate(self.order): + inverse_order[x] = i + chunk_array = chunk_array.transpose(inverse_order) + return chunk_array + + async def encode( + self, + chunk_array: np.ndarray, + ) -> Optional[np.ndarray]: + chunk_array = chunk_array.transpose(self.order) + return chunk_array + + def compute_encoded_size(self, input_byte_length: int) -> int: + return input_byte_length + + +register_codec("transpose", TransposeCodec, TransposeCodecMetadata) diff --git a/zarr/v3/codecs/zstd.py b/zarr/v3/codecs/zstd.py new file mode 100644 index 0000000000..6bb7bbe1ef --- /dev/null +++ b/zarr/v3/codecs/zstd.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Literal, + Optional, +) + +from attr import frozen, field +from zstandard import ZstdCompressor, ZstdDecompressor + +from zarr.v3.abc.codec import BytesBytesCodec +from zarr.v3.codecs.registry import register_codec +from zarr.v3.common import BytesLike, to_thread + +if TYPE_CHECKING: + from zarr.v3.metadata import CoreArrayMetadata + + +@frozen +class ZstdCodecConfigurationMetadata: + level: int = 0 + checksum: bool = False + + +@frozen +class ZstdCodecMetadata: + configuration: ZstdCodecConfigurationMetadata + name: Literal["zstd"] = field(default="zstd", init=False) + + +@frozen +class ZstdCodec(BytesBytesCodec): + array_metadata: CoreArrayMetadata + configuration: ZstdCodecConfigurationMetadata + is_fixed_size = True + + @classmethod + def from_metadata( + cls, codec_metadata: ZstdCodecMetadata, array_metadata: CoreArrayMetadata + ) -> ZstdCodec: + return cls( + array_metadata=array_metadata, + configuration=codec_metadata.configuration, + ) + + def _compress(self, data: bytes) -> bytes: + ctx = ZstdCompressor( + level=self.configuration.level, write_checksum=self.configuration.checksum + ) + return ctx.compress(data) + + def _decompress(self, data: bytes) -> bytes: + ctx = ZstdDecompressor() + return ctx.decompress(data) + + async def decode( + self, + chunk_bytes: bytes, + ) -> BytesLike: + return await to_thread(self._decompress, chunk_bytes) + + async def encode( + self, + chunk_bytes: bytes, + ) -> Optional[BytesLike]: + return await to_thread(self._compress, chunk_bytes) + + def compute_encoded_size(self, _input_byte_length: int) -> int: + raise NotImplementedError + + +register_codec("zstd", ZstdCodec, ZstdCodecMetadata) diff --git a/zarr/v3/common.py b/zarr/v3/common.py index 0e55a7c1fd..e91356c4e2 100644 --- a/zarr/v3/common.py +++ b/zarr/v3/common.py @@ -32,18 +32,12 @@ def make_cattr(): from zarr.v3.metadata import ( - BloscCodecMetadata, - BytesCodecMetadata, ChunkKeyEncodingMetadata, CodecMetadata, - Crc32cCodecMetadata, DefaultChunkKeyEncodingMetadata, - GzipCodecMetadata, - ShardingCodecMetadata, - 
TransposeCodecMetadata, V2ChunkKeyEncodingMetadata, - ZstdCodecMetadata, ) + from zarr.v3.codecs.registry import get_codec_metadata_class converter = Converter() @@ -59,24 +53,8 @@ def _structure_chunk_key_encoding_metadata(d: Dict[str, Any], _t) -> ChunkKeyEnc ) def _structure_codec_metadata(d: Dict[str, Any], _t=None) -> CodecMetadata: - if d["name"] == "endian": - d["name"] = "bytes" - - if d["name"] == "blosc": - return converter.structure(d, BloscCodecMetadata) - if d["name"] == "bytes": - return converter.structure(d, BytesCodecMetadata) - if d["name"] == "transpose": - return converter.structure(d, TransposeCodecMetadata) - if d["name"] == "gzip": - return converter.structure(d, GzipCodecMetadata) - if d["name"] == "zstd": - return converter.structure(d, ZstdCodecMetadata) - if d["name"] == "sharding_indexed": - return converter.structure(d, ShardingCodecMetadata) - if d["name"] == "crc32c": - return converter.structure(d, Crc32cCodecMetadata) - raise KeyError + codec_metadata_cls = get_codec_metadata_class(d["name"]) + return converter.structure(d, codec_metadata_cls) converter.register_structure_hook(CodecMetadata, _structure_codec_metadata) diff --git a/zarr/v3/metadata.py b/zarr/v3/metadata.py index 1fc43b19f0..7f468d0a76 100644 --- a/zarr/v3/metadata.py +++ b/zarr/v3/metadata.py @@ -3,7 +3,7 @@ import json from asyncio import AbstractEventLoop from enum import Enum -from typing import Any, Dict, List, Literal, Optional, Tuple, Union +from typing import Any, Dict, List, Literal, Optional, Protocol, Tuple, Union import numpy as np from attr import asdict, field, frozen @@ -142,103 +142,9 @@ def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: ChunkKeyEncodingMetadata = Union[DefaultChunkKeyEncodingMetadata, V2ChunkKeyEncodingMetadata] -BloscShuffle = Literal["noshuffle", "shuffle", "bitshuffle"] - - -@frozen -class BloscCodecConfigurationMetadata: - typesize: int - cname: Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] = "zstd" - clevel: int = 5 - shuffle: BloscShuffle = "noshuffle" - blocksize: int = 0 - - -blosc_shuffle_int_to_str: Dict[int, BloscShuffle] = { - 0: "noshuffle", - 1: "shuffle", - 2: "bitshuffle", -} - - -@frozen -class BloscCodecMetadata: - configuration: BloscCodecConfigurationMetadata - name: Literal["blosc"] = "blosc" - - -@frozen -class BytesCodecConfigurationMetadata: - endian: Optional[Literal["big", "little"]] = "little" - - -@frozen -class BytesCodecMetadata: - configuration: BytesCodecConfigurationMetadata - name: Literal["bytes"] = "bytes" - - -@frozen -class TransposeCodecConfigurationMetadata: - order: Union[Literal["C", "F"], Tuple[int, ...]] = "C" - - -@frozen -class TransposeCodecMetadata: - configuration: TransposeCodecConfigurationMetadata - name: Literal["transpose"] = "transpose" - - -@frozen -class GzipCodecConfigurationMetadata: - level: int = 5 - - -@frozen -class GzipCodecMetadata: - configuration: GzipCodecConfigurationMetadata - name: Literal["gzip"] = "gzip" - - -@frozen -class ZstdCodecConfigurationMetadata: - level: int = 0 - checksum: bool = False - - -@frozen -class ZstdCodecMetadata: - configuration: ZstdCodecConfigurationMetadata - name: Literal["zstd"] = "zstd" - - -@frozen -class Crc32cCodecMetadata: - name: Literal["crc32c"] = "crc32c" - - -@frozen -class ShardingCodecConfigurationMetadata: - chunk_shape: ChunkCoords - codecs: List["CodecMetadata"] - index_codecs: List["CodecMetadata"] - - -@frozen -class ShardingCodecMetadata: - configuration: ShardingCodecConfigurationMetadata - name: 
Literal["sharding_indexed"] = "sharding_indexed" - - -CodecMetadata = Union[ - BloscCodecMetadata, - BytesCodecMetadata, - TransposeCodecMetadata, - GzipCodecMetadata, - ZstdCodecMetadata, - ShardingCodecMetadata, - Crc32cCodecMetadata, -] +class CodecMetadata(Protocol): + name: str + configuration: Optional[Any] @frozen From 4fac528d643db98a86c217c239f21a13318acc32 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Tue, 5 Dec 2023 15:14:48 +0100 Subject: [PATCH 07/12] adds index_location --- zarr/v3/codecs/__init__.py | 7 +++-- zarr/v3/codecs/sharding.py | 58 ++++++++++++++++++++++++++++++++------ zarr/v3/metadata.py | 7 ++++- 3 files changed, 60 insertions(+), 12 deletions(-) diff --git a/zarr/v3/codecs/__init__.py b/zarr/v3/codecs/__init__.py index fcfef4e233..e972151de1 100644 --- a/zarr/v3/codecs/__init__.py +++ b/zarr/v3/codecs/__init__.py @@ -17,7 +17,7 @@ from zarr.v3.abc.codec import Codec, ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.v3.common import BytesLike -from zarr.v3.metadata import CodecMetadata +from zarr.v3.metadata import CodecMetadata, ShardingCodecIndexLocation from zarr.v3.codecs.registry import get_codec_class if TYPE_CHECKING: @@ -208,11 +208,14 @@ def sharding_codec( chunk_shape: Tuple[int, ...], codecs: Optional[List[CodecMetadata]] = None, index_codecs: Optional[List[CodecMetadata]] = None, + index_location: ShardingCodecIndexLocation = ShardingCodecIndexLocation.end, ) -> "ShardingCodecMetadata": from zarr.v3.codecs.sharding import ShardingCodecMetadata, ShardingCodecConfigurationMetadata codecs = codecs or [bytes_codec()] index_codecs = index_codecs or [bytes_codec(), crc32c_codec()] return ShardingCodecMetadata( - configuration=ShardingCodecConfigurationMetadata(chunk_shape, codecs, index_codecs) + configuration=ShardingCodecConfigurationMetadata( + chunk_shape, codecs, index_codecs, index_location + ) ) diff --git a/zarr/v3/codecs/sharding.py b/zarr/v3/codecs/sharding.py index 423d4081f9..f182ef395e 100644 --- a/zarr/v3/codecs/sharding.py +++ b/zarr/v3/codecs/sharding.py @@ -1,6 +1,17 @@ from __future__ import annotations -from typing import Iterator, List, Literal, Mapping, NamedTuple, Optional, Set, Tuple +from typing import ( + Awaitable, + Callable, + Iterator, + List, + Literal, + Mapping, + NamedTuple, + Optional, + Set, + Tuple, +) from attr import field, frozen import numpy as np @@ -29,6 +40,7 @@ CoreArrayMetadata, DataType, CodecMetadata, + ShardingCodecIndexLocation, ) from zarr.v3.store import StorePath @@ -40,6 +52,7 @@ class ShardingCodecConfigurationMetadata: chunk_shape: ChunkCoords codecs: List["CodecMetadata"] index_codecs: List["CodecMetadata"] + index_location: ShardingCodecIndexLocation = ShardingCodecIndexLocation.end @frozen @@ -67,7 +80,7 @@ def get_chunk_slice(self, chunk_coords: ChunkCoords) -> Optional[Tuple[int, int] if (chunk_start, chunk_len) == (MAX_UINT_64, MAX_UINT_64): return None else: - return (int(chunk_start), int(chunk_start + chunk_len)) + return (int(chunk_start), int(chunk_start) + int(chunk_len)) def set_chunk_slice(self, chunk_coords: ChunkCoords, chunk_slice: Optional[slice]) -> None: localized_chunk = self._localize_chunk(chunk_coords) @@ -113,9 +126,15 @@ class _ShardProxy(Mapping): @classmethod async def from_bytes(cls, buf: BytesLike, codec: ShardingCodec) -> _ShardProxy: + shard_index_size = codec._shard_index_size() obj = cls() obj.buf = memoryview(buf) - obj.index = await codec._decode_shard_index(obj.buf[-codec._shard_index_size() :]) + if codec.configuration.index_location == 
ShardingCodecIndexLocation.start: + shard_index_bytes = obj.buf[:shard_index_size] + else: + shard_index_bytes = obj.buf[-shard_index_size:] + + obj.index = await codec._decode_shard_index(shard_index_bytes) return obj @classmethod @@ -174,9 +193,21 @@ def append(self, chunk_coords: ChunkCoords, value: BytesLike): self.buf.extend(value) self.index.set_chunk_slice(chunk_coords, slice(chunk_start, chunk_start + chunk_length)) - def finalize(self, index_bytes: BytesLike) -> BytesLike: - self.buf.extend(index_bytes) - return self.buf + async def finalize( + self, + index_location: ShardingCodecIndexLocation, + index_encoder: Callable[[_ShardIndex], Awaitable[BytesLike]], + ) -> BytesLike: + index_bytes = await index_encoder(self.index) + if index_location == ShardingCodecIndexLocation.start: + self.index.offsets_and_lengths[..., 0] += len(index_bytes) + index_bytes = await index_encoder(self.index) # encode again with corrected offsets + out_buf = bytearray(index_bytes) + out_buf.extend(self.buf) + else: + out_buf = self.buf + out_buf.extend(index_bytes) + return out_buf @frozen @@ -410,7 +441,9 @@ async def _write_chunk( if chunk_bytes is not None: shard_builder.append(chunk_coords, chunk_bytes) - return shard_builder.finalize(await self._encode_shard_index(shard_builder.index)) + return await shard_builder.finalize( + self.configuration.index_location, self._encode_shard_index + ) async def encode_partial( self, @@ -497,7 +530,10 @@ async def _write_chunk( await store_path.delete_async() else: await store_path.set_async( - shard_builder.finalize(await self._encode_shard_index(shard_builder.index)) + await shard_builder.finalize( + self.configuration.index_location, + self._encode_shard_index, + ) ) def _is_total_shard(self, all_chunk_coords: Set[ChunkCoords]) -> bool: @@ -517,7 +553,11 @@ def _shard_index_size(self) -> int: return self.index_codec_pipeline.compute_encoded_size(16 * product(self.chunks_per_shard)) async def _load_shard_index_maybe(self, store_path: StorePath) -> Optional[_ShardIndex]: - index_bytes = await store_path.get_async((-self._shard_index_size(), None)) + shard_index_size = self._shard_index_size() + if self.configuration.index_location == ShardingCodecIndexLocation.start: + index_bytes = await store_path.get_async((0, shard_index_size)) + else: + index_bytes = await store_path.get_async((-shard_index_size, None)) if index_bytes is not None: return await self._decode_shard_index(index_bytes) return None diff --git a/zarr/v3/metadata.py b/zarr/v3/metadata.py index 7f468d0a76..68b8c7bb44 100644 --- a/zarr/v3/metadata.py +++ b/zarr/v3/metadata.py @@ -147,6 +147,11 @@ class CodecMetadata(Protocol): configuration: Optional[Any] +class ShardingCodecIndexLocation(Enum): + start = "start" + end = "end" + + @frozen class CoreArrayMetadata: shape: ChunkCoords @@ -196,7 +201,7 @@ def get_core_metadata(self, runtime_configuration: RuntimeConfiguration) -> Core def to_bytes(self) -> bytes: def _json_convert(o): - if isinstance(o, DataType): + if isinstance(o, Enum): return o.name raise TypeError From fcf79b631b570857cbdff2d676569d3474bc4ea1 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Tue, 5 Dec 2023 17:07:38 +0100 Subject: [PATCH 08/12] adds support for codec entry points --- pyproject.toml | 3 +++ zarr/v3/abc/codec.py | 4 ++++ zarr/v3/codecs/blosc.py | 6 +++++- zarr/v3/codecs/bytes.py | 8 +++++-- zarr/v3/codecs/crc32c_.py | 6 +++++- zarr/v3/codecs/gzip.py | 6 +++++- zarr/v3/codecs/registry.py | 43 +++++++++++++++++++++++++++++++------ zarr/v3/codecs/sharding.py | 6 +++++- 
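(A distilled sketch of the index placement introduced by this commit, pulling the lookup out of _ShardProxy.from_bytes and _load_shard_index_maybe; the standalone function name is illustrative only. The shard index stores one (offset, length) uint64 pair per inner chunk, and finalize() shifts all offsets by the index size when the index is written at the start.)

    def read_shard_index(shard: bytes, index_size: int, index_location: str) -> bytes:
        # "start": the index occupies the first index_size bytes and chunk
        # offsets were already shifted past it by finalize().
        # "end": the index is the trailing index_size bytes of the shard.
        if index_location == "start":
            return shard[:index_size]
        return shard[-index_size:]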
zarr/v3/codecs/transpose.py | 6 +++++- zarr/v3/codecs/zstd.py | 6 +++++- 10 files changed, 80 insertions(+), 14 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 22ea19f28f..6a3730e47a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -137,3 +137,6 @@ filterwarnings = [ "ignore:PY_SSIZE_T_CLEAN will be required.*:DeprecationWarning", "ignore:The loop argument is deprecated since Python 3.8.*:DeprecationWarning", ] + +[project.entry-points."zarr.codecs"] +test = "zarr.v3.codecs.bytes:BytesCodec" \ No newline at end of file diff --git a/zarr/v3/abc/codec.py b/zarr/v3/abc/codec.py index c856a772e0..71993ad4ac 100644 --- a/zarr/v3/abc/codec.py +++ b/zarr/v3/abc/codec.py @@ -40,6 +40,10 @@ def from_metadata( ) -> "Type[Codec]": pass + @classmethod + def get_metadata_class(cls) -> "Type[CodecMetadata]": + pass + class ArrayArrayCodec(Codec): @abstractmethod diff --git a/zarr/v3/codecs/blosc.py b/zarr/v3/codecs/blosc.py index 4697512037..1bbedc229a 100644 --- a/zarr/v3/codecs/blosc.py +++ b/zarr/v3/codecs/blosc.py @@ -72,6 +72,10 @@ def from_metadata( blosc_codec=Blosc.from_config(config_dict), ) + @classmethod + def get_metadata_class(cls) -> BloscCodecMetadata: + return BloscCodecMetadata + async def decode( self, chunk_bytes: bytes, @@ -89,4 +93,4 @@ def compute_encoded_size(self, _input_byte_length: int) -> int: raise NotImplementedError -register_codec("blosc", BloscCodec, BloscCodecMetadata) +register_codec("blosc", BloscCodec) diff --git a/zarr/v3/codecs/bytes.py b/zarr/v3/codecs/bytes.py index d180b5fd43..955c9e2f53 100644 --- a/zarr/v3/codecs/bytes.py +++ b/zarr/v3/codecs/bytes.py @@ -46,6 +46,10 @@ def from_metadata( configuration=codec_metadata.configuration, ) + @classmethod + def get_metadata_class(cls) -> BytesCodecMetadata: + return BytesCodecMetadata + def _get_byteorder(self, array: np.ndarray) -> Literal["big", "little"]: if array.dtype.byteorder == "<": return "little" @@ -92,7 +96,7 @@ def compute_encoded_size(self, input_byte_length: int) -> int: return input_byte_length -register_codec("bytes", BytesCodec, BytesCodecMetadata) +register_codec("bytes", BytesCodec) # compatibility with earlier versions of ZEP1 -register_codec("endian", BytesCodec, BytesCodecMetadata) +register_codec("endian", BytesCodec) diff --git a/zarr/v3/codecs/crc32c_.py b/zarr/v3/codecs/crc32c_.py index 6272c6d2e4..7e0e5997bc 100644 --- a/zarr/v3/codecs/crc32c_.py +++ b/zarr/v3/codecs/crc32c_.py @@ -34,6 +34,10 @@ def from_metadata( ) -> Crc32cCodec: return cls(array_metadata=array_metadata) + @classmethod + def get_metadata_class(cls) -> Crc32cCodecMetadata: + return Crc32cCodecMetadata + async def decode( self, chunk_bytes: bytes, @@ -54,4 +58,4 @@ def compute_encoded_size(self, input_byte_length: int) -> int: return input_byte_length + 4 -register_codec("crc32c", Crc32cCodec, Crc32cCodecMetadata) +register_codec("crc32c", Crc32cCodec) diff --git a/zarr/v3/codecs/gzip.py b/zarr/v3/codecs/gzip.py index 7275bc825d..dbbe9395f5 100644 --- a/zarr/v3/codecs/gzip.py +++ b/zarr/v3/codecs/gzip.py @@ -43,6 +43,10 @@ def from_metadata( configuration=codec_metadata.configuration, ) + @classmethod + def get_metadata_class(cls) -> GzipCodecMetadata: + return GzipCodecMetadata + async def decode( self, chunk_bytes: bytes, @@ -59,4 +63,4 @@ def compute_encoded_size(self, _input_byte_length: int) -> int: raise NotImplementedError -register_codec("gzip", GzipCodec, GzipCodecMetadata) +register_codec("gzip", GzipCodec) diff --git a/zarr/v3/codecs/registry.py b/zarr/v3/codecs/registry.py index 
276ae08f43..7415c20ba4 100644 --- a/zarr/v3/codecs/registry.py +++ b/zarr/v3/codecs/registry.py @@ -1,4 +1,7 @@ +from __future__ import annotations + from typing import Dict, NamedTuple, Type +from importlib.metadata import EntryPoint, entry_points as get_entry_points from zarr.v3.abc.codec import Codec from zarr.v3.metadata import CodecMetadata @@ -10,17 +13,45 @@ class CodecRegistryItem(NamedTuple): __codec_registry: Dict[str, CodecRegistryItem] = {} +__lazy_load_codecs: Dict[str, EntryPoint] = {} + + +def _collect_entrypoints() -> None: + entry_points = get_entry_points() + print(entry_points.keys()) + if hasattr(entry_points, "select"): + # If entry_points() has a select method, use that. Python 3.10+ + for e in entry_points.select(group="zarr.codecs"): + __lazy_load_codecs[e.name] = e + else: + # Otherwise, fallback to using get + for e in entry_points.get("zarr.codecs", []): + __lazy_load_codecs[e.name] = e + +def register_codec(key: str, codec_cls: Type[Codec]) -> None: + __codec_registry[key] = CodecRegistryItem(codec_cls, codec_cls.get_metadata_class()) -def register_codec( - key: str, codec_cls: Type[Codec], codec_metadata_cls: Type[CodecMetadata] -) -> None: - __codec_registry[key] = CodecRegistryItem(codec_cls, codec_metadata_cls) + +def _get_codec_item(key: str) -> CodecRegistryItem: + item = __codec_registry.get(key) + if item is None: + if key in __lazy_load_codecs: + # logger.debug("Auto loading codec '%s' from entrypoint", codec_id) + cls = __lazy_load_codecs[key].load() + register_codec(key, cls) + item = __codec_registry.get(key) + if item: + return item + raise KeyError def get_codec_metadata_class(key: str) -> Type[CodecMetadata]: - return __codec_registry[key].codec_metadata_cls + return _get_codec_item(key).codec_metadata_cls def get_codec_class(key: str) -> Type[Codec]: - return __codec_registry[key].codec_cls + return _get_codec_item(key).codec_cls + + +_collect_entrypoints() diff --git a/zarr/v3/codecs/sharding.py b/zarr/v3/codecs/sharding.py index f182ef395e..7af891fcb5 100644 --- a/zarr/v3/codecs/sharding.py +++ b/zarr/v3/codecs/sharding.py @@ -262,6 +262,10 @@ def from_metadata( chunks_per_shard=chunks_per_shard, ) + @classmethod + def get_metadata_class(cls) -> ShardingCodecMetadata: + return ShardingCodecMetadata + async def decode( self, shard_bytes: BytesLike, @@ -576,4 +580,4 @@ def compute_encoded_size(self, input_byte_length: int) -> int: return input_byte_length + self._shard_index_size() -register_codec("sharding_indexed", ShardingCodec, ShardingCodecMetadata) +register_codec("sharding_indexed", ShardingCodec) diff --git a/zarr/v3/codecs/transpose.py b/zarr/v3/codecs/transpose.py index 73c4bcbb96..35544d2031 100644 --- a/zarr/v3/codecs/transpose.py +++ b/zarr/v3/codecs/transpose.py @@ -66,6 +66,10 @@ def from_metadata( order=order, ) + @classmethod + def get_metadata_class(cls) -> TransposeCodecMetadata: + return TransposeCodecMetadata + def resolve_metadata(self) -> CoreArrayMetadata: from zarr.v3.metadata import CoreArrayMetadata @@ -103,4 +107,4 @@ def compute_encoded_size(self, input_byte_length: int) -> int: return input_byte_length -register_codec("transpose", TransposeCodec, TransposeCodecMetadata) +register_codec("transpose", TransposeCodec) diff --git a/zarr/v3/codecs/zstd.py b/zarr/v3/codecs/zstd.py index 6bb7bbe1ef..9b536b9c70 100644 --- a/zarr/v3/codecs/zstd.py +++ b/zarr/v3/codecs/zstd.py @@ -44,6 +44,10 @@ def from_metadata( configuration=codec_metadata.configuration, ) + @classmethod + def get_metadata_class(cls) -> ZstdCodecMetadata: + 
return ZstdCodecMetadata + def _compress(self, data: bytes) -> bytes: ctx = ZstdCompressor( level=self.configuration.level, write_checksum=self.configuration.checksum @@ -70,4 +74,4 @@ def compute_encoded_size(self, _input_byte_length: int) -> int: raise NotImplementedError -register_codec("zstd", ZstdCodec, ZstdCodecMetadata) +register_codec("zstd", ZstdCodec) From 9f972825cf6a8328afef98f681dfa3ca7a4d4e36 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Tue, 5 Dec 2023 18:16:06 +0100 Subject: [PATCH 09/12] adds tests from zarrita --- zarr/tests/test_codecs_v3.py | 989 +++++++++++++++++++++++++++++++++++ zarr/v3/array.py | 4 + zarr/v3/codecs/__init__.py | 13 +- zarr/v3/codecs/registry.py | 1 - zarr/v3/store.py | 49 +- 5 files changed, 1053 insertions(+), 3 deletions(-) create mode 100644 zarr/tests/test_codecs_v3.py diff --git a/zarr/tests/test_codecs_v3.py b/zarr/tests/test_codecs_v3.py new file mode 100644 index 0000000000..90e4a8f87d --- /dev/null +++ b/zarr/tests/test_codecs_v3.py @@ -0,0 +1,989 @@ +from __future__ import annotations + +import json +from typing import Iterator, List, Literal, Optional +from attr import frozen + +import numpy as np +import pytest +import zarr +from zarr.v3 import codecs +from zarr.v3.array import Array, AsyncArray +from zarr.v3.common import Selection +from zarr.v3.indexing import morton_order_iter +from zarr.v3.metadata import CodecMetadata, ShardingCodecIndexLocation, runtime_configuration + +from zarr.v3.store import MemoryStore, Store + + +@frozen +class _AsyncArrayProxy: + array: AsyncArray + + def __getitem__(self, selection: Selection) -> _AsyncArraySelectionProxy: + return _AsyncArraySelectionProxy(self.array, selection) + + +@frozen +class _AsyncArraySelectionProxy: + array: AsyncArray + selection: Selection + + async def get(self) -> np.ndarray: + return await self.array.getitem(self.selection) + + async def set(self, value: np.ndarray): + return await self.array.setitem(self.selection, value) + + +@pytest.fixture +def store() -> Iterator[Store]: + yield MemoryStore() + + +@pytest.fixture +def sample_data() -> Iterator[np.ndarray]: + yield np.arange(0, 128 * 128 * 128, dtype="uint16").reshape((128, 128, 128), order="F") + + +@pytest.mark.parametrize( + "index_location", [ShardingCodecIndexLocation.start, ShardingCodecIndexLocation.end] +) +def test_sharding( + store: Store, sample_data: np.ndarray, index_location: ShardingCodecIndexLocation +): + a = Array.create( + store / "sample", + shape=sample_data.shape, + chunk_shape=(64, 64, 64), + dtype=sample_data.dtype, + fill_value=0, + codecs=[ + codecs.sharding_codec( + (32, 32, 32), + [ + codecs.transpose_codec("F", sample_data.ndim), + codecs.bytes_codec(), + codecs.blosc_codec(typesize=sample_data.dtype.itemsize, cname="lz4"), + ], + index_location=index_location, + ) + ], + ) + + a[:, :, :] = sample_data + + read_data = a[0 : sample_data.shape[0], 0 : sample_data.shape[1], 0 : sample_data.shape[2]] + assert sample_data.shape == read_data.shape + assert np.array_equal(sample_data, read_data) + + +@pytest.mark.parametrize( + "index_location", [ShardingCodecIndexLocation.start, ShardingCodecIndexLocation.end] +) +def test_sharding_partial( + store: Store, sample_data: np.ndarray, index_location: ShardingCodecIndexLocation +): + a = Array.create( + store / "sample", + shape=tuple(a + 10 for a in sample_data.shape), + chunk_shape=(64, 64, 64), + dtype=sample_data.dtype, + fill_value=0, + codecs=[ + codecs.sharding_codec( + (32, 32, 32), + [ + codecs.transpose_codec("F", sample_data.ndim), + 
codecs.bytes_codec(), + codecs.blosc_codec(typesize=sample_data.dtype.itemsize, cname="lz4"), + ], + index_location=index_location, + ) + ], + ) + + a[10:, 10:, 10:] = sample_data + + read_data = a[0:10, 0:10, 0:10] + assert np.all(read_data == 0) + + read_data = a[10:, 10:, 10:] + assert sample_data.shape == read_data.shape + assert np.array_equal(sample_data, read_data) + + +@pytest.mark.parametrize( + "index_location", [ShardingCodecIndexLocation.start, ShardingCodecIndexLocation.end] +) +def test_sharding_partial_read( + store: Store, sample_data: np.ndarray, index_location: ShardingCodecIndexLocation +): + a = Array.create( + store / "sample", + shape=tuple(a + 10 for a in sample_data.shape), + chunk_shape=(64, 64, 64), + dtype=sample_data.dtype, + fill_value=1, + codecs=[ + codecs.sharding_codec( + (32, 32, 32), + [ + codecs.transpose_codec("F", sample_data.ndim), + codecs.bytes_codec(), + codecs.blosc_codec(typesize=sample_data.dtype.itemsize, cname="lz4"), + ], + index_location=index_location, + ) + ], + ) + + read_data = a[0:10, 0:10, 0:10] + assert np.all(read_data == 1) + + +@pytest.mark.parametrize( + "index_location", [ShardingCodecIndexLocation.start, ShardingCodecIndexLocation.end] +) +def test_sharding_partial_overwrite( + store: Store, sample_data: np.ndarray, index_location: ShardingCodecIndexLocation +): + data = sample_data[:10, :10, :10] + + a = Array.create( + store / "sample", + shape=tuple(a + 10 for a in data.shape), + chunk_shape=(64, 64, 64), + dtype=data.dtype, + fill_value=1, + codecs=[ + codecs.sharding_codec( + (32, 32, 32), + [ + codecs.transpose_codec("F", data.ndim), + codecs.bytes_codec(), + codecs.blosc_codec(typesize=data.dtype.itemsize, cname="lz4"), + ], + index_location=index_location, + ) + ], + ) + + a[:10, :10, :10] = data + + read_data = a[0:10, 0:10, 0:10] + assert np.array_equal(data, read_data) + + data = data + 10 + a[:10, :10, :10] = data + read_data = a[0:10, 0:10, 0:10] + assert np.array_equal(data, read_data) + + +@pytest.mark.parametrize( + "outer_index_location", + [ShardingCodecIndexLocation.start, ShardingCodecIndexLocation.end], +) +@pytest.mark.parametrize( + "inner_index_location", + [ShardingCodecIndexLocation.start, ShardingCodecIndexLocation.end], +) +def test_nested_sharding( + store: Store, + sample_data: np.ndarray, + outer_index_location: ShardingCodecIndexLocation, + inner_index_location: ShardingCodecIndexLocation, +): + a = Array.create( + store / "l4_sample" / "color" / "1", + shape=sample_data.shape, + chunk_shape=(64, 64, 64), + dtype=sample_data.dtype, + fill_value=0, + codecs=[ + codecs.sharding_codec( + (32, 32, 32), + [codecs.sharding_codec((16, 16, 16), index_location=inner_index_location)], + index_location=outer_index_location, + ) + ], + ) + + a[:, :, :] = sample_data + + read_data = a[0 : sample_data.shape[0], 0 : sample_data.shape[1], 0 : sample_data.shape[2]] + assert sample_data.shape == read_data.shape + assert np.array_equal(sample_data, read_data) + + +@pytest.mark.parametrize("input_order", ["F", "C"]) +@pytest.mark.parametrize("store_order", ["F", "C"]) +@pytest.mark.parametrize("runtime_write_order", ["F", "C"]) +@pytest.mark.parametrize("runtime_read_order", ["F", "C"]) +@pytest.mark.parametrize("with_sharding", [True, False]) +@pytest.mark.asyncio +async def test_order( + store: Store, + input_order: Literal["F", "C"], + store_order: Literal["F", "C"], + runtime_write_order: Literal["F", "C"], + runtime_read_order: Literal["F", "C"], + with_sharding: bool, +): + data = np.arange(0, 256, 
dtype="uint16").reshape((32, 8), order=input_order) + + codecs_: List[CodecMetadata] = ( + [ + codecs.sharding_codec( + (16, 8), + codecs=[codecs.transpose_codec(store_order, data.ndim), codecs.bytes_codec()], + ) + ] + if with_sharding + else [codecs.transpose_codec(store_order, data.ndim), codecs.bytes_codec()] + ) + + a = await AsyncArray.create( + store / "order", + shape=data.shape, + chunk_shape=(32, 8), + dtype=data.dtype, + fill_value=0, + chunk_key_encoding=("v2", "."), + codecs=codecs_, + runtime_configuration=runtime_configuration(runtime_write_order), + ) + + await _AsyncArrayProxy(a)[:, :].set(data) + read_data = await _AsyncArrayProxy(a)[:, :].get() + assert np.array_equal(data, read_data) + + a = await AsyncArray.open( + store / "order", + runtime_configuration=runtime_configuration(order=runtime_read_order), + ) + read_data = await _AsyncArrayProxy(a)[:, :].get() + assert np.array_equal(data, read_data) + + if runtime_read_order == "F": + assert read_data.flags["F_CONTIGUOUS"] + assert not read_data.flags["C_CONTIGUOUS"] + else: + assert not read_data.flags["F_CONTIGUOUS"] + assert read_data.flags["C_CONTIGUOUS"] + + if not with_sharding: + # Compare with zarr-python + z = zarr.create( + shape=data.shape, + chunks=(32, 8), + dtype="u2", "u2", " np.dtype: def attrs(self) -> dict: return self._async_array.attrs + @property + def metadata(self) -> ArrayMetadata: + return self._async_array.metadata + @property def store_path(self) -> str: return self._async_array.store_path diff --git a/zarr/v3/codecs/__init__.py b/zarr/v3/codecs/__init__.py index e972151de1..30a42c8ad5 100644 --- a/zarr/v3/codecs/__init__.py +++ b/zarr/v3/codecs/__init__.py @@ -180,9 +180,20 @@ def bytes_codec(endian: Optional[Literal["big", "little"]] = "little") -> "Bytes return BytesCodecMetadata(configuration=BytesCodecConfigurationMetadata(endian)) -def transpose_codec(order: Union[Tuple[int, ...], Literal["C", "F"]]) -> "TransposeCodecMetadata": +def transpose_codec( + order: Union[Tuple[int, ...], Literal["C", "F"]], ndim: Optional[int] = None +) -> "TransposeCodecMetadata": from zarr.v3.codecs.transpose import TransposeCodecMetadata, TransposeCodecConfigurationMetadata + if order == "C" or order == "F": + assert ( + isinstance(ndim, int) and ndim > 0 + ), 'When using "C" or "F" the `ndim` argument needs to be provided.' + if order == "C": + order = tuple(range(ndim)) + if order == "F": + order = tuple(ndim - i - 1 for i in range(ndim)) + return TransposeCodecMetadata(configuration=TransposeCodecConfigurationMetadata(order)) diff --git a/zarr/v3/codecs/registry.py b/zarr/v3/codecs/registry.py index 7415c20ba4..388103975d 100644 --- a/zarr/v3/codecs/registry.py +++ b/zarr/v3/codecs/registry.py @@ -18,7 +18,6 @@ class CodecRegistryItem(NamedTuple): def _collect_entrypoints() -> None: entry_points = get_entry_points() - print(entry_points.keys()) if hasattr(entry_points, "select"): # If entry_points() has a select method, use that. 
From f8bab5b36b0e012da3fdea574bcf71e28f88095c Mon Sep 17 00:00:00 2001
From: Norman Rzepka
Date: Tue, 5 Dec 2023 21:25:49 +0100
Subject: [PATCH 10/12] fixes types

---
 .pre-commit-config.yaml     |  3 ++-
 zarr/v3/abc/codec.py        |  4 +++-
 zarr/v3/array.py            |  2 +-
 zarr/v3/codecs/blosc.py     |  7 +++++--
 zarr/v3/codecs/bytes.py     |  7 +++++--
 zarr/v3/codecs/crc32c_.py   |  7 +++++--
 zarr/v3/codecs/gzip.py      |  8 ++++++--
 zarr/v3/codecs/sharding.py  |  7 +++++--
 zarr/v3/codecs/transpose.py |  8 ++++++--
 zarr/v3/codecs/zstd.py      |  7 +++++--
 zarr/v3/metadata.py         |  5 +++--
 11 files changed, 46 insertions(+), 19 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f22dc39832..a8ee599137 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -27,7 +27,7 @@ repos:
     hooks:
       - id: check-yaml
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.3.0
+    rev: v1.7.1
    hooks:
      - id: mypy
        files: zarr
@@ -35,3 +35,4 @@
        additional_dependencies:
          - types-redis
          - types-setuptools
+         - attrs
diff --git a/zarr/v3/abc/codec.py b/zarr/v3/abc/codec.py
index 71993ad4ac..c81f2c976f 100644
--- a/zarr/v3/abc/codec.py
+++ b/zarr/v3/abc/codec.py
@@ -35,12 +35,14 @@ def resolve_metadata(self) -> CoreArrayMetadata:
         return self.array_metadata
 
     @classmethod
+    @abstractmethod
     def from_metadata(
         cls, codec_metadata: "CodecMetadata", array_metadata: CoreArrayMetadata
-    ) -> "Type[Codec]":
+    ) -> Codec:
         pass
 
     @classmethod
+    @abstractmethod
     def get_metadata_class(cls) -> "Type[CodecMetadata]":
         pass
 
diff --git a/zarr/v3/array.py b/zarr/v3/array.py
index 888bd5e272..8c54cfd91c 100644
--- a/zarr/v3/array.py
+++ b/zarr/v3/array.py
@@ -374,7 +374,7 @@ async def _write_chunk_to_store(self, store_path: StorePath,
chunk_array: np.nda else: await store_path.set_async(chunk_bytes) - async def resize(self, new_shape: ChunkCoords) -> Array: + async def resize(self, new_shape: ChunkCoords) -> AsyncArray: assert len(new_shape) == len(self.metadata.shape) new_metadata = evolve(self.metadata, shape=new_shape) diff --git a/zarr/v3/codecs/blosc.py b/zarr/v3/codecs/blosc.py index 1bbedc229a..8fb32faaa7 100644 --- a/zarr/v3/codecs/blosc.py +++ b/zarr/v3/codecs/blosc.py @@ -5,6 +5,7 @@ Dict, Literal, Optional, + Type, ) import numcodecs @@ -15,6 +16,7 @@ from zarr.v3.abc.codec import BytesBytesCodec from zarr.v3.codecs.registry import register_codec from zarr.v3.common import BytesLike, to_thread +from zarr.v3.metadata import CodecMetadata if TYPE_CHECKING: from zarr.v3.metadata import CoreArrayMetadata @@ -57,8 +59,9 @@ class BloscCodec(BytesBytesCodec): @classmethod def from_metadata( - cls, codec_metadata: BloscCodecMetadata, array_metadata: CoreArrayMetadata + cls, codec_metadata: CodecMetadata, array_metadata: CoreArrayMetadata ) -> BloscCodec: + assert isinstance(codec_metadata, BloscCodecMetadata) configuration = codec_metadata.configuration if configuration.typesize == 0: configuration = evolve(configuration, typesize=array_metadata.data_type.byte_count) @@ -73,7 +76,7 @@ def from_metadata( ) @classmethod - def get_metadata_class(cls) -> BloscCodecMetadata: + def get_metadata_class(cls) -> Type[BloscCodecMetadata]: return BloscCodecMetadata async def decode( diff --git a/zarr/v3/codecs/bytes.py b/zarr/v3/codecs/bytes.py index 955c9e2f53..80a3f155d0 100644 --- a/zarr/v3/codecs/bytes.py +++ b/zarr/v3/codecs/bytes.py @@ -4,6 +4,7 @@ TYPE_CHECKING, Literal, Optional, + Type, ) import numpy as np @@ -12,6 +13,7 @@ from zarr.v3.abc.codec import ArrayBytesCodec from zarr.v3.codecs.registry import register_codec from zarr.v3.common import BytesLike +from zarr.v3.metadata import CodecMetadata if TYPE_CHECKING: from zarr.v3.metadata import CoreArrayMetadata @@ -36,8 +38,9 @@ class BytesCodec(ArrayBytesCodec): @classmethod def from_metadata( - cls, codec_metadata: BytesCodecMetadata, array_metadata: CoreArrayMetadata + cls, codec_metadata: CodecMetadata, array_metadata: CoreArrayMetadata ) -> BytesCodec: + assert isinstance(codec_metadata, BytesCodecMetadata) assert ( array_metadata.dtype.itemsize == 1 or codec_metadata.configuration.endian is not None ), "The `endian` configuration needs to be specified for multi-byte data types." 
@@ -47,7 +50,7 @@ def from_metadata( ) @classmethod - def get_metadata_class(cls) -> BytesCodecMetadata: + def get_metadata_class(cls) -> Type[BytesCodecMetadata]: return BytesCodecMetadata def _get_byteorder(self, array: np.ndarray) -> Literal["big", "little"]: diff --git a/zarr/v3/codecs/crc32c_.py b/zarr/v3/codecs/crc32c_.py index 7e0e5997bc..c4fab3c9b9 100644 --- a/zarr/v3/codecs/crc32c_.py +++ b/zarr/v3/codecs/crc32c_.py @@ -4,6 +4,7 @@ TYPE_CHECKING, Literal, Optional, + Type, ) import numpy as np @@ -13,6 +14,7 @@ from zarr.v3.abc.codec import BytesBytesCodec from zarr.v3.codecs.registry import register_codec from zarr.v3.common import BytesLike +from zarr.v3.metadata import CodecMetadata if TYPE_CHECKING: from zarr.v3.metadata import CoreArrayMetadata @@ -30,12 +32,13 @@ class Crc32cCodec(BytesBytesCodec): @classmethod def from_metadata( - cls, codec_metadata: Crc32cCodecMetadata, array_metadata: CoreArrayMetadata + cls, codec_metadata: CodecMetadata, array_metadata: CoreArrayMetadata ) -> Crc32cCodec: + assert isinstance(codec_metadata, Crc32cCodecMetadata) return cls(array_metadata=array_metadata) @classmethod - def get_metadata_class(cls) -> Crc32cCodecMetadata: + def get_metadata_class(cls) -> Type[Crc32cCodecMetadata]: return Crc32cCodecMetadata async def decode( diff --git a/zarr/v3/codecs/gzip.py b/zarr/v3/codecs/gzip.py index dbbe9395f5..be1ebcdc9f 100644 --- a/zarr/v3/codecs/gzip.py +++ b/zarr/v3/codecs/gzip.py @@ -4,6 +4,7 @@ TYPE_CHECKING, Literal, Optional, + Type, ) from attr import frozen, field @@ -12,6 +13,7 @@ from zarr.v3.abc.codec import BytesBytesCodec from zarr.v3.codecs.registry import register_codec from zarr.v3.common import BytesLike, to_thread +from zarr.v3.metadata import CodecMetadata if TYPE_CHECKING: from zarr.v3.metadata import CoreArrayMetadata @@ -36,15 +38,17 @@ class GzipCodec(BytesBytesCodec): @classmethod def from_metadata( - cls, codec_metadata: GzipCodecMetadata, array_metadata: CoreArrayMetadata + cls, codec_metadata: CodecMetadata, array_metadata: CoreArrayMetadata ) -> GzipCodec: + assert isinstance(codec_metadata, GzipCodecMetadata) + return cls( array_metadata=array_metadata, configuration=codec_metadata.configuration, ) @classmethod - def get_metadata_class(cls) -> GzipCodecMetadata: + def get_metadata_class(cls) -> Type[GzipCodecMetadata]: return GzipCodecMetadata async def decode( diff --git a/zarr/v3/codecs/sharding.py b/zarr/v3/codecs/sharding.py index 7af891fcb5..795be0c641 100644 --- a/zarr/v3/codecs/sharding.py +++ b/zarr/v3/codecs/sharding.py @@ -11,6 +11,7 @@ Optional, Set, Tuple, + Type, ) from attr import field, frozen @@ -223,9 +224,11 @@ class ShardingCodec( @classmethod def from_metadata( cls, - codec_metadata: ShardingCodecMetadata, + codec_metadata: CodecMetadata, array_metadata: CoreArrayMetadata, ) -> ShardingCodec: + assert isinstance(codec_metadata, ShardingCodecMetadata) + chunks_per_shard = tuple( s // c for s, c in zip( @@ -263,7 +266,7 @@ def from_metadata( ) @classmethod - def get_metadata_class(cls) -> ShardingCodecMetadata: + def get_metadata_class(cls) -> Type[ShardingCodecMetadata]: return ShardingCodecMetadata async def decode( diff --git a/zarr/v3/codecs/transpose.py b/zarr/v3/codecs/transpose.py index 35544d2031..d160f2a88d 100644 --- a/zarr/v3/codecs/transpose.py +++ b/zarr/v3/codecs/transpose.py @@ -5,6 +5,7 @@ Literal, Optional, Tuple, + Type, ) import numpy as np @@ -12,6 +13,7 @@ from zarr.v3.abc.codec import ArrayArrayCodec from zarr.v3.codecs.registry import register_codec +from zarr.v3.metadata 
import CodecMetadata if TYPE_CHECKING: from zarr.v3.metadata import CoreArrayMetadata @@ -36,8 +38,10 @@ class TransposeCodec(ArrayArrayCodec): @classmethod def from_metadata( - cls, codec_metadata: TransposeCodecMetadata, array_metadata: CoreArrayMetadata + cls, codec_metadata: CodecMetadata, array_metadata: CoreArrayMetadata ) -> TransposeCodec: + assert isinstance(codec_metadata, TransposeCodecMetadata) + configuration = codec_metadata.configuration # Compatibility with older version of ZEP1 if configuration.order == "F": # type: ignore @@ -67,7 +71,7 @@ def from_metadata( ) @classmethod - def get_metadata_class(cls) -> TransposeCodecMetadata: + def get_metadata_class(cls) -> Type[TransposeCodecMetadata]: return TransposeCodecMetadata def resolve_metadata(self) -> CoreArrayMetadata: diff --git a/zarr/v3/codecs/zstd.py b/zarr/v3/codecs/zstd.py index 9b536b9c70..e66d9e0700 100644 --- a/zarr/v3/codecs/zstd.py +++ b/zarr/v3/codecs/zstd.py @@ -4,6 +4,7 @@ TYPE_CHECKING, Literal, Optional, + Type, ) from attr import frozen, field @@ -12,6 +13,7 @@ from zarr.v3.abc.codec import BytesBytesCodec from zarr.v3.codecs.registry import register_codec from zarr.v3.common import BytesLike, to_thread +from zarr.v3.metadata import CodecMetadata if TYPE_CHECKING: from zarr.v3.metadata import CoreArrayMetadata @@ -37,15 +39,16 @@ class ZstdCodec(BytesBytesCodec): @classmethod def from_metadata( - cls, codec_metadata: ZstdCodecMetadata, array_metadata: CoreArrayMetadata + cls, codec_metadata: CodecMetadata, array_metadata: CoreArrayMetadata ) -> ZstdCodec: + assert isinstance(codec_metadata, ZstdCodecMetadata) return cls( array_metadata=array_metadata, configuration=codec_metadata.configuration, ) @classmethod - def get_metadata_class(cls) -> ZstdCodecMetadata: + def get_metadata_class(cls) -> Type[ZstdCodecMetadata]: return ZstdCodecMetadata def _compress(self, data: bytes) -> bytes: diff --git a/zarr/v3/metadata.py b/zarr/v3/metadata.py index 68b8c7bb44..53b300d3f8 100644 --- a/zarr/v3/metadata.py +++ b/zarr/v3/metadata.py @@ -143,8 +143,9 @@ def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: class CodecMetadata(Protocol): - name: str - configuration: Optional[Any] + @property + def name(self) -> str: + pass class ShardingCodecIndexLocation(Enum): From 6c3e40a24a10c257e2e6cd256a7114811ef27cbc Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Wed, 6 Dec 2023 11:34:14 +0100 Subject: [PATCH 11/12] Apply suggestions from code review Co-authored-by: Joe Hamman --- zarr/tests/test_codecs_v3.py | 4 ++-- zarr/v3/codecs/registry.py | 2 +- zarr/v3/codecs/sharding.py | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/zarr/tests/test_codecs_v3.py b/zarr/tests/test_codecs_v3.py index 90e4a8f87d..93acdb2ba1 100644 --- a/zarr/tests/test_codecs_v3.py +++ b/zarr/tests/test_codecs_v3.py @@ -42,8 +42,8 @@ def store() -> Iterator[Store]: @pytest.fixture -def sample_data() -> Iterator[np.ndarray]: - yield np.arange(0, 128 * 128 * 128, dtype="uint16").reshape((128, 128, 128), order="F") +def sample_data() -> np.ndarray: + return np.arange(0, 128 * 128 * 128, dtype="uint16").reshape((128, 128, 128), order="F") @pytest.mark.parametrize( diff --git a/zarr/v3/codecs/registry.py b/zarr/v3/codecs/registry.py index 388103975d..642c0feebb 100644 --- a/zarr/v3/codecs/registry.py +++ b/zarr/v3/codecs/registry.py @@ -42,7 +42,7 @@ def _get_codec_item(key: str) -> CodecRegistryItem: item = __codec_registry.get(key) if item: return item - raise KeyError + raise KeyError(key) def get_codec_metadata_class(key: 
str) -> Type[CodecMetadata]: diff --git a/zarr/v3/codecs/sharding.py b/zarr/v3/codecs/sharding.py index 795be0c641..edbe327a6b 100644 --- a/zarr/v3/codecs/sharding.py +++ b/zarr/v3/codecs/sharding.py @@ -318,7 +318,6 @@ async def decode_partial( store_path: StorePath, selection: SliceSelection, ) -> Optional[np.ndarray]: - print("decode_partial") shard_shape = self.array_metadata.chunk_shape chunk_shape = self.configuration.chunk_shape From 9749f25547a47c4b83b1a67bf097b35826586ec1 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Wed, 6 Dec 2023 11:50:34 +0100 Subject: [PATCH 12/12] remove test codec from pyproject.toml --- pyproject.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6a3730e47a..22ea19f28f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -137,6 +137,3 @@ filterwarnings = [ "ignore:PY_SSIZE_T_CLEAN will be required.*:DeprecationWarning", "ignore:The loop argument is deprecated since Python 3.8.*:DeprecationWarning", ] - -[project.entry-points."zarr.codecs"] -test = "zarr.v3.codecs.bytes:BytesCodec" \ No newline at end of file
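With the placeholder entry point removed again, the series ends with the "zarr.codecs" hook in place but unused, so registration is left entirely to third-party packages. Below is a minimal sketch of such a plugin, assuming the registry behaviour from patch 08 and the Codec/CodecMetadata interfaces as they stand after patch 10; the package my_zarr_plugin, the codec name "noop", and the pass-through transform are hypothetical, not part of these patches.

The plugin declares the entry point in its own pyproject.toml:

    [project.entry-points."zarr.codecs"]
    noop = "my_zarr_plugin.codec:NoopCodec"

and ships the codec class, e.g. in my_zarr_plugin/codec.py:

    from typing import Optional, Type

    from attr import frozen

    from zarr.v3.abc.codec import BytesBytesCodec
    from zarr.v3.common import BytesLike
    from zarr.v3.metadata import CodecMetadata, CoreArrayMetadata


    @frozen
    class NoopCodecMetadata:
        # Satisfies the CodecMetadata protocol, which only requires a `name`.
        name: str = "noop"


    @frozen
    class NoopCodec(BytesBytesCodec):
        array_metadata: CoreArrayMetadata

        @classmethod
        def from_metadata(
            cls, codec_metadata: CodecMetadata, array_metadata: CoreArrayMetadata
        ) -> "NoopCodec":
            assert isinstance(codec_metadata, NoopCodecMetadata)
            return cls(array_metadata=array_metadata)

        @classmethod
        def get_metadata_class(cls) -> Type[NoopCodecMetadata]:
            return NoopCodecMetadata

        async def decode(self, chunk_bytes: BytesLike) -> BytesLike:
            # Pass-through: a real codec would transform the bytes here.
            return chunk_bytes

        async def encode(self, chunk_bytes: BytesLike) -> Optional[BytesLike]:
            return chunk_bytes

        def compute_encoded_size(self, input_byte_length: int) -> int:
            return input_byte_length

Loading is lazy: _collect_entrypoints() only records the entry point at import time, and the first call to get_codec_class("noop") or get_codec_metadata_class("noop") reaches _get_codec_item, which load()s the entry point and calls register_codec("noop", NoopCodec), obtaining the metadata class via NoopCodec.get_metadata_class().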