Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Microscopy: NGFF format support #1104

Merged
merged 10 commits into from
Jul 7, 2022
8 changes: 3 additions & 5 deletions src/04-modality-specific-files/10-microscopy.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,10 @@ Microscopy raw data MUST be stored in one of the following formats:
(`.ome.tif` for standard TIFF files or `.ome.btf` for
[BigTIFF](https://www.awaresystems.be/imaging/tiff/bigtiff.html) files)

If different from PNG, TIFF or OME-TIFF, the original unprocessed data in the native format MAY be
stored in the [`/sourcedata` directory](../02-common-principles.md#source-vs-raw-vs-derived-data).
- [NGFF/OME-ZARR](https://ngff.openmicroscopy.org/latest/) (`.ome.zarr` directories)
TheChymera marked this conversation as resolved.
Show resolved Hide resolved

Future versions may extend this list of supported file formats, for example with the
Next-Generation File Formats currently developed by OME ([OME-NGFF](https://ngff.openmicroscopy.org/latest/))
as a successor to OME-TIFF for better remote sharing of large datasets.
If different from PNG, TIFF, OME-TIFF, or NGFF, the original unprocessed data in the native format MAY be
stored in the [`/sourcedata` directory](../02-common-principles.md#source-vs-raw-vs-derived-data).

### Modality suffixes
Microscopy data currently support the following imaging modalities:
Expand Down
7 changes: 7 additions & 0 deletions src/schema/objects/extensions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,13 @@

Used by KIT, Yokogawa, and Ricoh MEG systems.
Successor to the `.sqd` extension for marker files.
.ome.zarr/:
name: OME Next Generation File Format
description: |
An OME-NGFF file.

OME-NGFF is a [Zarr](https://zarr.readthedocs.io)-based format, organizing data arrays in nested directories.
This format was developed by the Open Microscopy Environment to provide streaming access to very large datasets.
.nii:
name: NIfTI
description: |
Expand Down
1 change: 1 addition & 0 deletions src/schema/rules/datatypes/micr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ microscopy:
extensions:
- .ome.tif
- .ome.btf
- .ome.zarr/
- .png
- .tif
- .json
Expand Down
2 changes: 1 addition & 1 deletion tools/schemacode/schemacode/tests/test_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ def test_load_all():
os.path.abspath(os.path.dirname(__file__)),
"../data/schema",
)
schema_all = load_all(schema_path)
schema_all, _ = load_all(schema_path)

# Check if expected keys are present in all entries
for entry in schema_all:
Expand Down
64 changes: 50 additions & 14 deletions tools/schemacode/schemacode/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@
DIR_ENTITIES = ["subject", "session"]


def _get_paths(bids_paths):
def _get_paths(
bids_paths,
pseudofile_suffixes=[],
):
"""
Get all paths from a list of directories, excluding hidden subdirectories from distribution.

Expand All @@ -25,6 +28,9 @@ def _get_paths(bids_paths):
bids_paths : list or str
Directories from which to get paths, may also contain file paths, which will remain
unchanged.
pseudofile_suffixes : list of str
Directory suffixes prompting the validation of the directory name and limiting further
directory walk.

Notes
-----
Expand All @@ -47,23 +53,17 @@ def _get_paths(bids_paths):
".bidsignore",
"dandiset.yaml",
]
# Inelegant hard-coded solution.
# Could be replaced by a maximum depth limit if BIDS root auto-detection is implemented.
treat_as_file_suffix = [".ngff"]

path_list = []
for bids_path in bids_paths:
bids_path = os.path.abspath(os.path.expanduser(bids_path))
if os.path.isfile(bids_path):
path_list.append(bids_path)
continue
for root, dirs, file_names in os.walk(bids_path, topdown=False):
if any(root.endswith(i) for i in treat_as_file_suffix):
continue
if any(f"{i}/" in root for i in treat_as_file_suffix):
continue
if any(f"{i}\\" in root for i in treat_as_file_suffix):
continue
for root, dirs, file_names in os.walk(bids_path, topdown=True):
if any(root.endswith(i) for i in pseudofile_suffixes):
path_list.append(f"{root}/")
TheChymera marked this conversation as resolved.
Show resolved Hide resolved
dirs[:] = []
TheChymera marked this conversation as resolved.
Show resolved Hide resolved
# will break if BIDS ever puts meaningful data under `/.{dandi,datalad,git}*/`
if any(exclude_subdir in root for exclude_subdir in exclude_subdirs):
continue
Expand Down Expand Up @@ -335,6 +335,8 @@ def load_all(
-------
all_regex : list of dict
A list of dictionaries, with keys including 'regex' and 'mandatory'.
my_schema : dict
Nested dictionaries representing the full schema.
"""

my_schema = schema.load_schema(schema_dir)
Expand All @@ -346,13 +348,14 @@ def load_all(
)
all_regex.extend(top_level_regex)

return all_regex
return all_regex, my_schema


def validate_all(
bids_paths,
regex_schema,
debug=False,
pseudofile_suffixes=[],
):
"""
Validate `bids_paths` based on a `regex_schema` dictionary list, including regexes.
Expand All @@ -366,6 +369,9 @@ def validate_all(
debug : bool, optional
Whether to print itemwise notices for checks on the console, and include them in the
validation result.
pseudofile_suffixes : list of str
TheChymera marked this conversation as resolved.
Show resolved Hide resolved
Directory suffixes prompting the validation of the directory name and limiting further
directory walk.
TheChymera marked this conversation as resolved.
Show resolved Hide resolved

Returns
-------
Expand All @@ -384,7 +390,7 @@ def validate_all(
"""

tracking_schema = deepcopy(regex_schema)
paths_list = _get_paths(bids_paths)
paths_list = _get_paths(bids_paths, pseudofile_suffixes=pseudofile_suffixes)
tracking_paths = deepcopy(paths_list)
if debug:
itemwise_results = []
Expand Down Expand Up @@ -658,6 +664,34 @@ def log_errors(validation_result):
lgr.warning("The `%s` file was not matched by any regex schema entry.", i)


def _query_pseudofile_suffixes(my_schema):
TheChymera marked this conversation as resolved.
Show resolved Hide resolved
"""Query schema for suffixes which identify directory entities.

Parameters
----------
my_schema : dict
Nested directory as produced by `schemacode.schema.load_schema()`.

Returns
-------
list of str
Directory pseudofile suffixes excluding trailing slashes.

Notes
-----
* Yes this seems super-awkward to do explicitly, after all, the trailing slash is
already in so it should automagically work, but no:
- Subdirectory names need to be dynamically excluded from validation input.
- Backslash directory delimiters are still in use, which is regrettable.
"""
pseudofile_suffixes = []
for i in my_schema["objects"]["extensions"]:
if i.endswith("/"):
if i != "/":
pseudofile_suffixes.append(i[:-1])
return pseudofile_suffixes


def validate_bids(
bids_paths,
schema_reference_root="/usr/share/bids-schema/",
Expand Down Expand Up @@ -716,11 +750,13 @@ def validate_bids(
bids_paths = [bids_paths]

bids_schema_dir = select_schema_dir(bids_paths, schema_reference_root, schema_version)
regex_schema = load_all(bids_schema_dir)
regex_schema, my_schema = load_all(bids_schema_dir)
pseudofile_suffixes = _query_pseudofile_suffixes(my_schema)
validation_result = validate_all(
bids_paths,
regex_schema,
debug=debug,
pseudofile_suffixes=pseudofile_suffixes,
)

log_errors(validation_result)
Expand Down