From fecfb26ff5a6ad6f8fc36c79634c780939fe823d Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 6 Apr 2023 14:16:01 -0400 Subject: [PATCH] Count only the first hit for subject or tissue within filename Also added assertion so we do not count incorrectly. But maybe it should be just a warning? Closes #172 --- dandischema/metadata.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/dandischema/metadata.py b/dandischema/metadata.py index b1f4134..a2f042b 100644 --- a/dandischema/metadata.py +++ b/dandischema/metadata.py @@ -311,13 +311,17 @@ def _add_asset_to_stats(assetmeta: Dict[str, Any], stats: _stats_type) -> None: stats = _get_samples(value, stats, hierarchy) break + # which components already found, so we do not count more than + # once in some incorrectly named datasets + found = {} for part in Path(assetmeta["path"]).name.split(".")[0].split("_"): - if part.startswith("sub-"): - subject = part.replace("sub-", "") + if found.get("subject") and part.startswith("sub-"): + found["subject"] = subject = part.split("sub-", 1)[1] if subject not in stats["subjects"]: stats["subjects"].append(subject) - if part.startswith("sample-"): - sample = part.replace("sample-", "") + found.add("subject") + if not found.get("sample") and part.startswith("sample-"): + found["sample"] = sample = part.replace("sample-", "") if sample not in stats["tissuesample"]: stats["tissuesample"].append(sample) @@ -338,10 +342,12 @@ def aggregate_assets_summary(metadata: Iterable[Dict[str, Any]]) -> dict: stats: _stats_type = {} for meta in metadata: _add_asset_to_stats(meta, stats) - stats["numberOfBytes"] = stats.get("numberOfBytes", 0) stats["numberOfFiles"] = stats.get("numberOfFiles", 0) stats["numberOfSubjects"] = len(stats.pop("subjects", [])) or None + if stats["numberOfFiles"]: + # Must not happen. 
If does -- a bug in software + assert stats["numberOfSubjects"] <= stats["numberOfFiles"] stats["numberOfSamples"] = ( len(stats.pop("tissuesample", [])) + len(stats.pop("slice", [])) ) or None