Skip to content

Commit

Permalink
Add Storage dataclass
Browse files Browse the repository at this point in the history
  • Loading branch information
martaiborra committed Sep 19, 2024
1 parent a08ac92 commit 678b3ee
Show file tree
Hide file tree
Showing 7 changed files with 200 additions and 123 deletions.
18 changes: 13 additions & 5 deletions doc/reference/storage.rst
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
.. _Storage:
Dataclasses
===========

Storage
=======

This is a class for .......
Dataclasses for setting the compression, decompression
and storage parameters. All their parameters are optional.

.. currentmodule:: blosc2

Expand All @@ -24,3 +23,12 @@ DParams
:nosignatures:

DParams

Storage
-------

.. autosummary::
:toctree: autofiles/storage
:nosignatures:

Storage
16 changes: 15 additions & 1 deletion src/blosc2/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,18 @@ class Tuner(Enum):
unpack_tensor,
)

# This import must be before ndarray and schunk
from .storage import (
CParams,
cparams_dflts,
cpu_info,
DParams,
dparams_dflts,
ncores,
nthreads,
Storage,
storage_dflts,
)

from .ndarray import ( # noqa: I001
NDArray,
Expand All @@ -213,7 +225,6 @@ class Tuner(Enum):

from .schunk import SChunk, open

from .storage import cpu_info, CParams, cparams_dflts, DParams, dparams_dflts, ncores, nthreads, storage_dflts


# Registry for postfilters
Expand Down Expand Up @@ -294,7 +305,9 @@ class Tuner(Enum):
"__version__",
"compress",
"decompress",
"CParams",
"cparams_dflts",
"DParams",
"dparams_dflts",
"storage_dflts",
"set_compressor",
Expand Down Expand Up @@ -326,6 +339,7 @@ class Tuner(Enum):
"compress2",
"decompress2",
"SChunk",
"Storage",
"open",
"remove_urlpath",
"nthreads",
Expand Down
5 changes: 3 additions & 2 deletions src/blosc2/blosc2_ext.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -929,7 +929,7 @@ cdef class SChunk:
self._urlpath = urlpath.encode() if isinstance(urlpath, str) else urlpath
kwargs["urlpath"] = self._urlpath

self.mode = kwargs.get("mode", "a")
self.mode = blosc2.Storage().mode if kwargs.get("mode", None) is None else kwargs.get("mode")
self.mmap_mode = kwargs.get("mmap_mode")
self.initial_mapping_size = kwargs.get("initial_mapping_size")
if self.mmap_mode is not None:
Expand Down Expand Up @@ -1077,7 +1077,8 @@ cdef class SChunk:
"typesize": self.schunk.storage.cparams.typesize,
"nthreads": self.schunk.storage.cparams.nthreads,
"blocksize": self.schunk.storage.cparams.blocksize,
"splitmode": blosc2.SplitMode(self.schunk.storage.cparams.splitmode)
"splitmode": blosc2.SplitMode(self.schunk.storage.cparams.splitmode),
"tuner": blosc2.Tuner(self.schunk.storage.cparams.tuner_id),
}

filters = [0] * BLOSC2_MAX_FILTERS
Expand Down
4 changes: 2 additions & 2 deletions src/blosc2/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1394,7 +1394,7 @@ def compress2(src: object, **kwargs: dict) -> str | bytes:
Other Parameters
----------------
kwargs: dict, optional
Compression parameters. The default values are in :ref:`blosc2.CParams`.
Compression parameters. The default values are in :class:`blosc2.CParams`.
Keyword arguments supported:
cparams: :class:`blosc2.CParams`
Expand Down Expand Up @@ -1444,7 +1444,7 @@ def decompress2(src: object, dst: object | bytearray = None, **kwargs: dict) ->
Other Parameters
----------------
kwargs: dict, optional
Decompression parameters. The default values are in :ref:`blosc2.DParams`.
Decompression parameters. The default values are in :class:`blosc2.DParams`.
Keyword arguments supported:
dparams: :class:`blosc2.DParams`
Expand Down
114 changes: 24 additions & 90 deletions src/blosc2/schunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import pathlib
from collections import namedtuple
from collections.abc import Mapping, MutableMapping
from dataclasses import asdict
from typing import Any, Iterator, NamedTuple

import numpy as np
Expand Down Expand Up @@ -156,95 +157,14 @@ def __init__(self, chunksize: int = None, data: object = None, **kwargs: dict):
Other parameters
----------------
kwargs: dict, optional
Storage parameters. The default values are in :class:`blosc2.Storage`.
Keyword arguments supported:
contiguous: bool, optional
If the chunks are stored contiguously or not.
Default is True when :paramref:`urlpath` is not None;
False otherwise.
urlpath: str | pathlib.Path, optional
If the storage is persistent, the name of the file (when
`contiguous = True`) or the directory (if `contiguous = False`).
If the storage is in-memory, then this field is `None`.
mode: str, optional
Persistence mode: ‘r’ means read only (must exist);
‘a’ means read/write (create if it doesn’t exist);
‘w’ means create (overwrite if it exists).
mmap_mode: str, optional
If set, the file will be memory-mapped instead of using the default
I/O functions and the `mode` argument will be ignored. The memory-mapping
modes are similar as used by the
`numpy.memmap <https://numpy.org/doc/stable/reference/generated/numpy.memmap.html>`_
function, but it is possible to extend the file:
.. list-table::
:widths: 10 90
:header-rows: 1
* - mode
- description
* - 'r'
- Open an existing file for reading only.
* - 'r+'
- Open an existing file for reading and writing. Use this mode if you want
to append data to an existing schunk file.
* - 'w+'
- Create or overwrite an existing file for reading and writing. Use this
mode if you want to create a new schunk.
* - 'c'
- Open an existing file in copy-on-write mode: all changes affect the data
in memory but changes are not saved to disk. The file on disk is
read-only. On Windows, the size of the mapping cannot change.
Only contiguous storage can be memory-mapped. Hence, `urlpath` must point to a
file (and not a directory).
.. note::
Memory-mapped files are opened once and the file contents remain in (virtual)
memory for the lifetime of the schunk. Using memory-mapped I/O can be faster
than using the default I/O functions depending on the use case. Whereas
reading performance is generally better, writing performance may also be
slower in some cases on certain systems. In any case, memory-mapped files
can be especially beneficial when operating with network file systems
(like NFS).
This is currently a beta feature (especially write operations) and we
recommend trying it out and reporting any issues you may encounter.
initial_mapping_size: int, optional
The initial size of the mapping for the memory-mapped file when writes are
allowed (r+ w+, or c mode). Once a file is memory-mapped and extended beyond the
initial mapping size, the file must be remapped which may be expensive. This
parameter allows to decouple the mapping size from the actual file size to early
reserve memory for future writes and avoid remappings. The memory is only
reserved virtually and does not occupy physical memory unless actual writes
happen. Since the virtual address space is large enough, it is ok to be generous
with this parameter (with special consideration on Windows, see note below).
For best performance, set this to the maximum expected size of the compressed
data (see example in :obj:`SChunk.__init__ <blosc2.schunk.SChunk.__init__>`).
The size is in bytes.
Default: 1 GiB.
.. note::
On Windows, the size of the mapping is directly coupled to the file size.
When the schunk gets destroyed, the file size will be truncated to the
actual size of the schunk.
cparams: dict
A dictionary with the compression parameters, which are the same
as those can be used in the :func:`~blosc2.compress2` function.
dparams: dict
A dictionary with the decompression parameters, which are the same
as those that can be used in the :func:`~blosc2.decompress2`
function.
meta: dict or None
A dictionary with different metalayers. One entry per metalayer:
key: bytes or str
The name of the metalayer.
value: object
The metalayer object that will be serialized using msgpack.
storage: :class:`blosc2.Storage`
All the storage parameters that you want to use as
a :class:`blosc2.Storage` instance.
others: Any
If `storage` is not passed, all the parameters of a :class:`blosc2.Storage`
can be passed as keyword arguments.
Examples
--------
Expand Down Expand Up @@ -301,10 +221,24 @@ def __init__(self, chunksize: int = None, data: object = None, **kwargs: dict):
"mmap_mode",
"initial_mapping_size",
"_is_view",
"storage"
]
for kwarg in kwargs:
if kwarg not in allowed_kwargs:
raise ValueError(f"{kwarg} is not supported as keyword argument")
if kwargs.get("storage") is not None:
if any(key not in ["_is_view", "_schunk", "storage"] for key in kwargs.keys()):
raise AttributeError("Cannot pass both `storage` and other kwargs already included in Storage")
storage = kwargs.get("storage")
del kwargs["storage"]
kwargs = {**kwargs, **asdict(storage)}

if isinstance(kwargs.get("cparams"), blosc2.CParams):
kwargs["cparams"] = asdict(kwargs.get("cparams"))

if isinstance(kwargs.get("dparams"), blosc2.DParams):
kwargs["dparams"] = asdict(kwargs.get("dparams"))

urlpath = kwargs.get("urlpath")
if "contiguous" not in kwargs:
# Make contiguous true for disk, else sparse (for in-memory performance)
Expand Down Expand Up @@ -1395,8 +1329,8 @@ def __dealloc__(self):
super().__dealloc__()


@_inherit_doc_parameter(SChunk.__init__, "mmap_mode:", {r"\* - 'w\+'[^*]+": ""})
@_inherit_doc_parameter(SChunk.__init__, "initial_mapping_size:", {r"r\+ w\+, or c": "r+ or c"})
@_inherit_doc_parameter(blosc2.Storage, "mmap_mode:", {r"\* - 'w\+'[^*]+": ""})
@_inherit_doc_parameter(blosc2.Storage, "initial_mapping_size:", {r"r\+ w\+, or c": "r+ or c"})
def open(urlpath: str | pathlib.Path | blosc2.URLPath, mode: str = "a", offset: int = 0,
**kwargs: dict) -> blosc2.SChunk | blosc2.NDArray | blosc2.C2Array:
"""Open a persistent :ref:`SChunk` or :ref:`NDArray` or a remote :ref:`C2Array`
Expand Down
118 changes: 115 additions & 3 deletions src/blosc2/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
# LICENSE file in the root directory of this source tree)
#######################################################################

from dataclasses import dataclass, field, asdict
from dataclasses import dataclass, field, asdict, fields
import warnings

import blosc2


# Internal Blosc threading
# Get CPU info
cpu_info = blosc2.get_cpu_info()
Expand Down Expand Up @@ -102,6 +102,118 @@ class DParams:
"""
nthreads: int = field(default_factory=default_nthreads)


@dataclass
class Storage:
    """Dataclass for hosting the different storage parameters.

    Parameters
    ----------
    contiguous: bool
        If the chunks are stored contiguously or not.
        Default is True when :paramref:`urlpath` is not None;
        False otherwise.
    urlpath: str or pathlib.Path, optional
        If the storage is persistent, the name of the file (when
        `contiguous = True`) or the directory (if `contiguous = False`).
        If the storage is in-memory, then this field is `None`.
    cparams: :class:`CParams` or dict
        The compression parameters as a :class:`CParams` instance or a dictionary.
    dparams: :class:`DParams` or dict
        The decompression parameters as a :class:`DParams` instance or a dictionary.
    mode: str, optional
        Persistence mode: ‘r’ means read only (must exist);
        ‘a’ means read/write (create if it doesn’t exist);
        ‘w’ means create (overwrite if it exists). Default is 'a'.
    mmap_mode: str, optional
        If set, the file will be memory-mapped instead of using the default
        I/O functions and the `mode` argument will be ignored. The memory-mapping
        modes are similar as used by the
        `numpy.memmap <https://numpy.org/doc/stable/reference/generated/numpy.memmap.html>`_
        function, but it is possible to extend the file:

        .. list-table::
            :widths: 10 90
            :header-rows: 1

            * - mode
              - description
            * - 'r'
              - Open an existing file for reading only.
            * - 'r+'
              - Open an existing file for reading and writing. Use this mode if you want
                to append data to an existing schunk file.
            * - 'w+'
              - Create or overwrite an existing file for reading and writing. Use this
                mode if you want to create a new schunk.
            * - 'c'
              - Open an existing file in copy-on-write mode: all changes affect the data
                in memory but changes are not saved to disk. The file on disk is
                read-only. On Windows, the size of the mapping cannot change.

        Only contiguous storage can be memory-mapped. Hence, `urlpath` must point to a
        file (and not a directory).

        .. note::
            Memory-mapped files are opened once and the file contents remain in (virtual)
            memory for the lifetime of the schunk. Using memory-mapped I/O can be faster
            than using the default I/O functions depending on the use case. Whereas
            reading performance is generally better, writing performance may also be
            slower in some cases on certain systems. In any case, memory-mapped files
            can be especially beneficial when operating with network file systems
            (like NFS).

            This is currently a beta feature (especially write operations) and we
            recommend trying it out and reporting any issues you may encounter.

    initial_mapping_size: int, optional
        The initial size of the mapping for the memory-mapped file when writes are
        allowed (r+ w+, or c mode). Once a file is memory-mapped and extended beyond the
        initial mapping size, the file must be remapped which may be expensive. This
        parameter allows to decouple the mapping size from the actual file size to early
        reserve memory for future writes and avoid remappings. The memory is only
        reserved virtually and does not occupy physical memory unless actual writes
        happen. Since the virtual address space is large enough, it is ok to be generous
        with this parameter (with special consideration on Windows, see note below).
        For best performance, set this to the maximum expected size of the compressed
        data (see example in :obj:`SChunk.__init__ <blosc2.schunk.SChunk.__init__>`).
        The size is in bytes.

        Default: 1 GiB.

        .. note::
            On Windows, the size of the mapping is directly coupled to the file size.
            When the schunk gets destroyed, the file size will be truncated to the
            actual size of the schunk.

    meta: dict or None
        A dictionary with different metalayers.  One entry per metalayer:

            key: bytes or str
                The name of the metalayer.
            value: object
                The metalayer object that will be serialized using msgpack.
    """
    # NOTE: the wording of the `mmap_mode` and `initial_mapping_size` entries
    # above is pattern-matched by `_inherit_doc_parameter` when building the
    # docstring of `blosc2.open`; keep that text stable.
    contiguous: bool | None = None
    urlpath: str | None = None
    cparams: CParams | dict = field(default_factory=CParams)
    dparams: DParams | dict = field(default_factory=DParams)
    mode: str = 'a'
    mmap_mode: str | None = None
    initial_mapping_size: int | None = None
    meta: dict | None = None

    def __post_init__(self):
        """Resolve `contiguous` and replace explicit `None` values by defaults.

        `contiguous` left as `None` becomes True when `urlpath` is set and
        False otherwise.  Any other field that received `None`, and for which
        `None` is not a meaningful value, is reset to its declared default and
        a warning reporting the substitution is emitted.
        """
        # Local import: MISSING is only needed here and is not part of the
        # module-level dataclasses import.
        from dataclasses import MISSING

        if self.contiguous is None:
            self.contiguous = self.urlpath is not None

        # Fields for which `None` is a legitimate value and must be kept.
        nullable = ('urlpath', 'mmap_mode', 'initial_mapping_size', 'meta')
        for spec in fields(self):
            if spec.name in nullable or getattr(self, spec.name) is not None:
                continue
            # Take the default straight from the field declaration rather
            # than building a whole throw-away Storage() just to read it.
            if spec.default is not MISSING:
                replacement = spec.default
            elif spec.default_factory is not MISSING:
                replacement = spec.default_factory()
            else:
                # No declared default to fall back to; leave the value alone.
                continue
            setattr(self, spec.name, replacement)
            # stacklevel=3 skips this method and the dataclass-generated
            # __init__ so the warning is attributed to the caller's code.
            warnings.warn(
                f"`{spec.name}` field value changed from `None` to `{replacement}`",
                stacklevel=3,
            )


# Defaults for compression params
cparams_dflts = asdict(CParams())
"""
Expand All @@ -114,7 +226,7 @@ class DParams:
Decompression params defaults.
"""
# Default for storage
storage_dflts = {"contiguous": False, "urlpath": None, "cparams": blosc2.CParams(), "dparams": blosc2.DParams}
storage_dflts = asdict(Storage())
"""
Storage params defaults. This is meant only for :ref:`SChunk <SChunk>` or :ref:`NDArray <NDArray>`.
"""
Loading

0 comments on commit 678b3ee

Please sign in to comment.