Merge pull request #90 from blowekamp/rechunk_inmemory

Add option to rechunk to load array into memory
niaid · Nov 16, 2023 · 8c6fa54 · 8c6fa54
2 parents bb2fb48 + 5e59e58
commit 8c6fa54
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 6 deletions.
diff --git a/pytools/HedwigZarrImage.py b/pytools/HedwigZarrImage.py
@@ -67,7 +67,7 @@ def shape(self) -> Tuple[int]:
         """
         return self._ome_ngff_multiscale_get_array(0).shape
 
-    def rechunk(self, chunk_size: int, compressor=None) -> None:
+    def rechunk(self, chunk_size: int, compressor=None, *, in_memory=False) -> None:
         """
         Change the chunk size of each ZARR array inplace in the pyramid.
 
@@ -78,6 +78,9 @@ def rechunk(self, chunk_size: int, compressor=None) -> None:
         :param chunk_size: The size as an integer to resize the chunk sizes.
         :param compressor: The output arrays will be written with the provided compressor, if None then the compressor
          of the input arrays will be used.
+        :param in_memory: If true, each entire array is loaded into memory uncompressed before being written at the
+        rechunked size; otherwise the arrays are written directly at the rechunked size. The former is faster but
+        requires enough memory to hold the uncompressed arrays.
         """
 
         logger.info(f'Processing group: "{self.zarr_group.name}"...')
@@ -100,11 +103,19 @@ def rechunk(self, chunk_size: int, compressor=None) -> None:
                 logger.info("Chunks already requested size")
                 continue
 
-            if compressor is None:
-                compressor = arr.compressor
+            temp_arr = arr
+            if in_memory:
+                logger.info(f'Loading array: "{arr.name}" into memory...')
+                # optionally load the entire array uncompressed into memory
+                memory_group = zarr.group(store=zarr.MemoryStore(), overwrite=True)
+                zarr.copy(temp_arr, memory_group, name="temp", compressor=None)
+                temp_arr = memory_group["temp"]
+
+                logger.info(f'Rechunking array: "{arr.name} to disk"...')
+
             # copy array to a temp zarr array on file
             zarr.copy(
-                arr,
+                temp_arr,
                 self.zarr_group,
                 name=arr_name + ".temp",
                 chunks=chunks,

diff --git a/pytools/zarr_rechunk.py b/pytools/zarr_rechunk.py
@@ -25,15 +25,18 @@
     default=False,
     help="Use the preferred compressor when recompressing.",
 )
+@click.option(
+    "--in-memory", is_flag=True, show_default=True, default=False, help="Use in-memory zarr store when recompressing."
+)
 @click.version_option(__version__)
-def main(input_zarr, log_level, chunk_size, recompress):
+def main(input_zarr, log_level, chunk_size, recompress, in_memory):
     logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.getLevelName(log_level))
 
     compressor = Blosc(cname="zstd", clevel=5, shuffle=Blosc.SHUFFLE)
     z = HedwigZarrImages(input_zarr, read_only=False)
 
     for k in z.get_series_keys():
-        z[k].rechunk(chunk_size, compressor=compressor if recompress else None)
+        z[k].rechunk(chunk_size, compressor=compressor if recompress else None, in_memory=in_memory)
 
 
 if __name__ == "__main__":