Skip to content

Commit

Permalink
Merge pull request #90 from blowekamp/rechunk_inmemory
Browse files Browse the repository at this point in the history
Add option to rechunk to load array into memory
  • Loading branch information
blowekamp authored Nov 16, 2023
2 parents bb2fb48 + 5e59e58 commit 8c6fa54
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 6 deletions.
19 changes: 15 additions & 4 deletions pytools/HedwigZarrImage.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def shape(self) -> Tuple[int]:
"""
return self._ome_ngff_multiscale_get_array(0).shape

def rechunk(self, chunk_size: int, compressor=None) -> None:
def rechunk(self, chunk_size: int, compressor=None, *, in_memory=False) -> None:
"""
Change the chunk size of each ZARR array inplace in the pyramid.
Expand All @@ -78,6 +78,9 @@ def rechunk(self, chunk_size: int, compressor=None) -> None:
:param chunk_size: The size as an integer to resize the chunk sizes.
:param compressor: The output arrays will be written with the provided compressor, if None then the compressor
of the input arrays will be used.
:param in_memory: If true, each array is loaded entirely into memory, uncompressed, before being written at the
rechunked size; otherwise the arrays are written directly to the rechunked size. The former is faster but
requires enough memory to hold the arrays.
"""

logger.info(f'Processing group: "{self.zarr_group.name}"...')
Expand All @@ -100,11 +103,19 @@ def rechunk(self, chunk_size: int, compressor=None) -> None:
logger.info("Chunks already requested size")
continue

if compressor is None:
compressor = arr.compressor
temp_arr = arr
if in_memory:
logger.info(f'Loading array: "{arr.name}" into memory...')
# optionally load the entire array uncompressed into memory
memory_group = zarr.group(store=zarr.MemoryStore(), overwrite=True)
zarr.copy(temp_arr, memory_group, name="temp", compressor=None)
temp_arr = memory_group["temp"]

logger.info(f'Rechunking array: "{arr.name} to disk"...')

# copy array to a temp zarr array on file
zarr.copy(
arr,
temp_arr,
self.zarr_group,
name=arr_name + ".temp",
chunks=chunks,
Expand Down
7 changes: 5 additions & 2 deletions pytools/zarr_rechunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,18 @@
default=False,
help="Use the preferred compressor when recompressing.",
)
@click.option(
"--in-memory", is_flag=True, show_default=True, default=False, help="Use in-memory zarr store when recompressing."
)
@click.version_option(__version__)
def main(input_zarr, log_level, chunk_size, recompress):
def main(input_zarr, log_level, chunk_size, recompress, in_memory):
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.getLevelName(log_level))

compressor = Blosc(cname="zstd", clevel=5, shuffle=Blosc.SHUFFLE)
z = HedwigZarrImages(input_zarr, read_only=False)

for k in z.get_series_keys():
z[k].rechunk(chunk_size, compressor=compressor if recompress else None)
z[k].rechunk(chunk_size, compressor=compressor if recompress else None, in_memory=in_memory)


if __name__ == "__main__":
Expand Down

0 comments on commit 8c6fa54

Please sign in to comment.