
ENH: Split ICA into multiple steps (#865)
Co-authored-by: Richard Höchenberger <richard.hoechenberger@gmail.com>
larsoner and hoechenberger authored Mar 1, 2024
1 parent 324d4b4 commit 94256f7
Showing 12 changed files with 876 additions and 706 deletions.
5 changes: 3 additions & 2 deletions .circleci/run_dataset_and_copy_files.sh
@@ -32,15 +32,16 @@ echo "Runtime: ${SECONDS} seconds"

# rerun test (check caching)!
SECONDS=0
RERUN_LIMIT=30
if [[ "$RERUN_TEST" == "false" ]]; then
echo "Skipping rerun test"
RUN_TIME=0
else
pytest mne_bids_pipeline --cov-append -k $DS_RUN
RUN_TIME=$SECONDS
echo "Runtime: ${RUN_TIME} seconds (should be < 20)"
echo "Runtime: ${RUN_TIME} seconds (should be <= $RERUN_LIMIT)"
fi
test $RUN_TIME -lt 20
test $RUN_TIME -le $RERUN_LIMIT

if [[ "$COPY_FILES" == "false" ]]; then
echo "Not copying files"
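The CI script above replaces the hard-coded 20-second ceiling with a `RERUN_LIMIT` of 30 seconds for the cached-rerun check. The same check can be sketched in Python; this is only an illustration of the CI logic, with `"ds000117"` standing in for the dataset selected by `$DS_RUN`:

```python
import subprocess
import time

RERUN_LIMIT = 30  # seconds, mirroring the limit in the CI script


def timed_run(cmd: list[str]) -> float:
    """Run a command and return its wall-clock duration in seconds."""
    start = time.monotonic()
    subprocess.run(cmd, check=True)
    return time.monotonic() - start


# First run does the actual processing; the second should mostly hit the cache.
cmd = ["pytest", "mne_bids_pipeline", "--cov-append", "-k", "ds000117"]
first = timed_run(cmd)
cached = timed_run(cmd)
assert cached <= RERUN_LIMIT, f"cached rerun took {cached:.1f}s (> {RERUN_LIMIT}s)"
```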
2 changes: 1 addition & 1 deletion docs/mkdocs.yml
@@ -77,7 +77,7 @@ nav:
- Preparations for source-level analyses: getting_started/freesurfer.md
- Processing steps:
- Overview: features/overview.md
- Detailed list of processing steps: features/steps.md
- List of processing steps: features/steps.md
- Configuration options:
- General settings: settings/general.md
- Preprocessing:
59 changes: 41 additions & 18 deletions docs/source/features/gen_steps.py
@@ -13,30 +13,49 @@
steps_pre = f"""\
{autogen_header}
# Detailed list of processing steps
# List of processing steps
The following table provides a concise summary of each processing step. The
step names can be used to run individual steps or entire groups of steps by
passing their name(s) to `mne_bids_pipeline` via the `steps=...` argument.
passing their name(s) to `mne_bids_pipeline` via the `steps=...` argument. However,
we recommend using `mne_bids_pipeline config.py` to run the entire pipeline
instead, to ensure that all steps affected by a given change are re-run.
""" # noqa: E501

overview_pre = f"""\
{autogen_header}
# MNE-BIDS-Pipeline overview
MNE-BIDS-Pipeline processes your data in a sequential manner, i.e., one step
at a time. The next step is only run after the previous steps have been
successfully completed. There are, of course, exceptions; for example, if you
chose not to apply ICA, the respective steps will simply be omitted and we'll
directly move to the subsequent steps. The following flow chart aims to give
you a brief overview of which steps are included in the pipeline, in which
order they are run, and how we group them together.
chose not to apply ICA or SSP, the spatial filtering steps will simply be omitted and
we'll directly move to the subsequent steps. See [the flowchart below](#flowchart) for
a visualization of the steps, or check out the
[list of processing steps](steps.md) for more information.
All intermediate results are saved to disk for later inspection, and an
**extensive report** is generated. Analyses are conducted on individual (per-subject)
as well as group level.
## Caching
MNE-BIDS-Pipeline offers automated caching of intermediate results. This means that
running `mne_bids_pipeline config.py` once will generate all outputs, and running it
again will only re-run the steps that need rerunning based on:
1. Changes to files on disk (e.g., updates to `bids_root` files), and
2. Changes to `config.py`
!!! info
All intermediate results are saved to disk for later
inspection, and an **extensive report** is generated.
This is particularly useful when you are developing your pipeline, as you can quickly
iterate over configuration changes without having to re-run the entire pipeline
every time -- only the steps that need to be re-run will be executed.
!!! info
Analyses are conducted on individual (per-subject) as well as group level.
## Flowchart
For more detailed information on each step, please refer to the [detailed list
of processing steps](steps.md).
"""

icon_map = {
@@ -61,25 +80,26 @@
("02", "03"),
("03", "04"),
("04", "05"),
("05", "06a"),
("05", "06a1"),
("06a1", "06a2"),
("05", "06b"),
("05", "07"),
# technically we could have the raw data flow here, but it doesn't really help
# ("05", "08a"),
# ("05", "08b"),
("06a", "08a"),
("07", "08a"),
("06a2", "08a"),
# Force the artifact-fitting and epoching steps on the same level, in this order
"""\
subgraph Z[" "]
direction LR
B06a
B06a1
B07
B06b
end
style Z fill:#0000,stroke-width:0px
""",
("06b", "08b"),
("07", "08a"),
("07", "08b"),
("08a", "09"),
("08b", "09"),
Expand Down Expand Up @@ -120,7 +140,10 @@
# Overview
overview_lines.append(
f"""\
## {module_header}
### {module_header}
<details>
<summary>Click to expand</summary>
```mermaid
flowchart TD"""
Expand Down Expand Up @@ -161,7 +184,7 @@
assert isinstance(a_b, tuple), type(a_b)
a_b = list(a_b) # allow modification
for ii, idx in enumerate(a_b):
assert idx in title_map, (dir_header, sorted(title_map))
assert idx in title_map, (dir_header, idx, sorted(title_map))
if idx not in mapped:
mapped.add(idx)
a_b[ii] = f'{idx}["{title_map[idx]}"]'
@@ -173,7 +196,7 @@
)
)
assert mapped == all_steps, all_steps.symmetric_difference(mapped)
overview_lines.append("```\n")
overview_lines.append("```\n\n</details>\n")

(out_dir / "steps.md").write_text("\n".join(lines), encoding="utf8")
(out_dir / "overview.md").write_text("\n".join(overview_lines), encoding="utf8")
2 changes: 2 additions & 0 deletions docs/source/v1.6.md.inc
@@ -10,6 +10,7 @@
- Added saving of detected blink and cardiac events used to calculate SSP projectors (#840 by @larsoner)
- Added [`noise_cov_method`][mne_bids_pipeline._config.noise_cov_method] to allow for the use of methods other than `"shrunk"` for noise covariance estimation (#854 by @larsoner)
- Added option to pass `image_kwargs` to [`mne.Report.add_epochs`] to allow adjusting e.g. `"vmin"` and `"vmax"` of the epochs image in the report via [`report_add_epochs_image_kwargs`][mne_bids_pipeline._config.report_add_epochs_image_kwargs] (#848 by @SophieHerbst)
- Split ICA fitting and artifact detection into separate steps. This means that now, ICA is split into a total of three consecutive steps: fitting, artifact detection, and the actual data cleaning step ("applying ICA"). This makes it easier to experiment with different settings for artifact detection without needing to re-fit ICA. (#865 by @larsoner)

[//]: # (### :warning: Behavior changes)

@@ -31,6 +32,7 @@
- Changed the default for [`ica_n_components`][mne_bids_pipeline._config.ica_n_components] from `0.8` (too conservative) to `None` to match MNE-Python's default (#853 by @larsoner)
- Prevent events table for the average subject overflowing in reports (#854 by @larsoner)
- Fixed split file behavior for Epochs when using ICA (#855 by @larsoner)
- Fixed a bug where users could not set `_components.tsv` as it would be detected as a cache miss and overwritten on next pipeline run (#865 by @larsoner)

### :medical_symbol: Code health

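The changelog entry above is the heart of this commit: ICA is now fit, artifact-detected, and applied in three separate cached steps, so artifact-detection settings can be tuned without re-fitting ICA. A hedged sketch of such an iteration from Python follows; it assumes `mne_bids_pipeline` is installed and accepts a `--steps` selector, with `preprocessing` being one of the documented step groups:

```python
import subprocess

# After tweaking only the ICA artifact-detection options in config.py,
# re-run the preprocessing group; cached, unaffected steps (including the
# expensive ICA fit) are skipped automatically.
subprocess.run(
    ["mne_bids_pipeline", "config.py", "--steps=preprocessing"],
    check=True,
)
```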
43 changes: 24 additions & 19 deletions mne_bids_pipeline/_report.py
@@ -30,27 +30,32 @@ def _open_report(
session: Optional[str],
run: Optional[str] = None,
task: Optional[str] = None,
fname_report: Optional[BIDSPath] = None,
name: str = "report",
):
fname_report = BIDSPath(
subject=subject,
session=session,
# Report is across all runs, but for logging purposes it's helpful
# to pass the run and task for gen_log_kwargs
run=None,
task=cfg.task,
acquisition=cfg.acq,
recording=cfg.rec,
space=cfg.space,
extension=".h5",
datatype=cfg.datatype,
root=cfg.deriv_root,
suffix="report",
check=False,
).fpath
if fname_report is None:
fname_report = BIDSPath(
subject=subject,
session=session,
# Report is across all runs, but for logging purposes it's helpful
# to pass the run and task for gen_log_kwargs
run=None,
task=cfg.task,
acquisition=cfg.acq,
recording=cfg.rec,
space=cfg.space,
extension=".h5",
datatype=cfg.datatype,
root=cfg.deriv_root,
suffix="report",
check=False,
)
fname_report = fname_report.fpath
assert fname_report.suffix == ".h5", fname_report.suffix
# prevent parallel file access
with FileLock(f"{fname_report}.lock"), _agg_backend():
if not fname_report.is_file():
msg = "Initializing report HDF5 file"
msg = f"Initializing {name} HDF5 file"
logger.info(**gen_log_kwargs(message=msg))
report = _gen_empty_report(
cfg=cfg,
@@ -62,7 +67,7 @@ def _open_report(
report = mne.open_report(fname_report)
except Exception as exc:
raise exc.__class__(
f"Could not open report HDF5 file:\n{fname_report}\n"
f"Could not open {name} HDF5 file:\n{fname_report}\n"
f"Got error:\n{exc}\nPerhaps you need to delete it?"
) from None
try:
@@ -80,7 +85,7 @@ def _open_report(
except Exception as exc:
logger.warning(f"Failed: {exc}")
fname_report_html = fname_report.with_suffix(".html")
msg = f"Saving report: {_linkfile(fname_report_html)}"
msg = f"Saving {name}: {_linkfile(fname_report_html)}"
logger.info(**gen_log_kwargs(message=msg))
report.save(fname_report, overwrite=True)
report.save(fname_report_html, overwrite=True, open_browser=False)
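`_open_report` now accepts an explicit `fname_report` and a display `name`, while keeping the existing serialization around the report files. That pattern -- a `FileLock` guarding both the HDF5 and HTML outputs of an `mne.Report` -- is sketched below outside the pipeline, with illustrative filenames (saving to HDF5 requires `h5io`):

```python
import mne
from filelock import FileLock

fname_h5 = "sub-01_task-rest_report.h5"  # illustrative output paths
fname_html = "sub-01_task-rest_report.html"

# Serialize access so parallel jobs cannot clobber the same report file,
# mirroring the FileLock usage in _open_report().
with FileLock(f"{fname_h5}.lock"):
    report = mne.Report(title="sub-01 example report")
    report.add_code(code="print('hello')", title="Example entry", language="python")
    report.save(fname_h5, overwrite=True)  # HDF5 copy for later re-opening
    report.save(fname_html, overwrite=True, open_browser=False)
```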
36 changes: 25 additions & 11 deletions mne_bids_pipeline/_run.py
@@ -4,6 +4,7 @@
import functools
import hashlib
import inspect
import json
import pathlib
import pdb
import sys
@@ -277,23 +278,36 @@ def clear(self) -> None:
self.memory.clear()


def save_logs(*, config: SimpleNamespace, logs) -> None: # TODO add type
def save_logs(*, config: SimpleNamespace, logs: list[pd.Series]) -> None:
fname = config.deriv_root / f"task-{get_task(config)}_log.xlsx"

# Get the script from which the function is called for logging
sheet_name = _short_step_path(_get_step_path()).replace("/", "-")
sheet_name = sheet_name[-30:] # shorten due to limit of excel format

df = pd.DataFrame(logs)

columns = df.columns
if "cfg" in columns:
columns = list(columns)
idx = columns.index("cfg")
del columns[idx]
columns.insert(-3, "cfg") # put it before time, success & err cols

df = df[columns]
# We need to make the logs more compact to be able to write Excel format
# (32767 char limit per cell), in particular the "cfg" column has very large
# cells, so replace the "cfg" column with separated cfg.* columns (still truncated
# to the 32767 char limit)
compact_logs = list()
for log in logs:
log = log.copy()
# 1. Remove indentation (e.g., 220814 chars to 54416)
cfg = json.loads(log["cfg"])
del log["cfg"]
assert cfg["__instance_type__"] == ["types", "SimpleNamespace"], cfg[
"__instance_type__"
]
for key, val in cfg["attributes"].items():
if isinstance(val, dict) and list(val.keys()) == ["__pathlib__"]:
val = val["__pathlib__"]
val = json.dumps(val, separators=(",", ":"))
if len(val) > 32767:
val = val[:32765] + " …"
log[f"cfg.{key}"] = val
compact_logs.append(log)
df = pd.DataFrame(compact_logs)
del logs, compact_logs

with FileLock(fname.with_suffix(fname.suffix + ".lock")):
append = fname.exists()
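The rewritten `save_logs` works around the xlsx format's 32,767-character per-cell limit by exploding the JSON-encoded `cfg` blob into compact per-attribute `cfg.*` columns. Below is a self-contained sketch of that flattening, using a made-up log record (real records are produced by the pipeline's step wrappers); writing the Excel file requires `openpyxl`:

```python
import json

import pandas as pd

EXCEL_CELL_LIMIT = 32767  # hard per-cell character limit of the xlsx format

# Made-up log record purely for illustration.
log = pd.Series(
    {
        "cfg": json.dumps(
            {
                "__instance_type__": ["types", "SimpleNamespace"],
                "attributes": {"task": "rest", "ch_types": ["meg", "eeg"]},
            },
            indent=4,
        ),
        "time": 1.23,
        "success": True,
    }
)

flat = log.copy()
cfg = json.loads(flat.pop("cfg"))
assert cfg["__instance_type__"] == ["types", "SimpleNamespace"]
for key, val in cfg["attributes"].items():
    compact = json.dumps(val, separators=(",", ":"))  # drop whitespace/indentation
    if len(compact) > EXCEL_CELL_LIMIT:
        compact = compact[: EXCEL_CELL_LIMIT - 2] + " …"
    flat[f"cfg.{key}"] = compact

pd.DataFrame([flat]).to_excel("log.xlsx", sheet_name="example", index=False)
```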