
ENH: Split ICA into multiple steps (#865)
Co-authored-by: Richard Höchenberger <richard.hoechenberger@gmail.com>
larsoner and hoechenberger authored Mar 1, 2024
1 parent 324d4b4 commit 94256f7
Showing 12 changed files with 876 additions and 706 deletions.
5 changes: 3 additions & 2 deletions .circleci/run_dataset_and_copy_files.sh
@@ -32,15 +32,16 @@ echo "Runtime: ${SECONDS} seconds"

# rerun test (check caching)!
SECONDS=0
RERUN_LIMIT=30
if [[ "$RERUN_TEST" == "false" ]]; then
echo "Skipping rerun test"
RUN_TIME=0
else
pytest mne_bids_pipeline --cov-append -k $DS_RUN
RUN_TIME=$SECONDS
echo "Runtime: ${RUN_TIME} seconds (should be < 20)"
echo "Runtime: ${RUN_TIME} seconds (should be <= $RERUN_LIMIT)"
fi
test $RUN_TIME -lt 20
test $RUN_TIME -le $RERUN_LIMIT

if [[ "$COPY_FILES" == "false" ]]; then
echo "Not copying files"
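The CI script above replaces the hard-coded 20-second ceiling with a `RERUN_LIMIT` of 30 seconds for the cached-rerun check. The same check can be sketched in Python; this is only an illustration of the CI logic, with `"ds000117"` standing in for the dataset selected by `$DS_RUN`:

```python
import subprocess
import time

RERUN_LIMIT = 30  # seconds, mirroring the limit in the CI script


def timed_run(cmd: list[str]) -> float:
    """Run a command and return its wall-clock duration in seconds."""
    start = time.monotonic()
    subprocess.run(cmd, check=True)
    return time.monotonic() - start


# First run does the actual processing; the second should mostly hit the cache.
cmd = ["pytest", "mne_bids_pipeline", "--cov-append", "-k", "ds000117"]
first = timed_run(cmd)
cached = timed_run(cmd)
assert cached <= RERUN_LIMIT, f"cached rerun took {cached:.1f}s (> {RERUN_LIMIT}s)"
```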
2 changes: 1 addition & 1 deletion docs/mkdocs.yml
@@ -77,7 +77,7 @@ nav:
- Preparations for source-level analyses: getting_started/freesurfer.md
- Processing steps:
- Overview: features/overview.md
- Detailed list of processing steps: features/steps.md
- List of processing steps: features/steps.md
- Configuration options:
- General settings: settings/general.md
- Preprocessing:
59 changes: 41 additions & 18 deletions docs/source/features/gen_steps.py
@@ -13,30 +13,49 @@
steps_pre = f"""\
{autogen_header}
# Detailed list of processing steps
# List of processing steps
The following table provides a concise summary of each processing step. The
step names can be used to run individual steps or entire groups of steps by
passing their name(s) to `mne_bids_pipeline` via the `steps=...` argument.
passing their name(s) to `mne_bids_pipeline` via the `steps=...` argument. However,
we recommend using `mne_bids_pipeline config.py` to run the entire pipeline
instead, to ensure that all steps affected by a given change are re-run.
""" # noqa: E501

overview_pre = f"""\
{autogen_header}
# MNE-BIDS-Pipeline overview
MNE-BIDS-Pipeline processes your data in a sequential manner, i.e., one step
at a time. The next step is only run after the previous steps have been
successfully completed. There are, of course, exceptions; for example, if you
chose not to apply ICA, the respective steps will simply be omitted and we'll
directly move to the subsequent steps. The following flow chart aims to give
you a brief overview of which steps are included in the pipeline, in which
order they are run, and how we group them together.
chose not to apply ICA or SSP, the spatial filtering steps will simply be omitted and
we'll directly move to the subsequent steps. See [the flowchart below](#flowchart) for
a visualization of the steps, or check out the
[list of processing steps](steps.md) for more information.
All intermediate results are saved to disk for later inspection, and an
**extensive report** is generated. Analyses are conducted on individual (per-subject)
as well as group level.
## Caching
MNE-BIDS-Pipeline offers automated caching of intermediate results. This means that
running `mne_bids_pipeline config.py` once will generate all outputs, and running it
again will only re-run the steps that need rerunning based on:
1. Changes to files on disk (e.g., updates to `bids_root` files), and
2. Changes to `config.py`
!!! info
All intermediate results are saved to disk for later
inspection, and an **extensive report** is generated.
This is particularly useful when you are developing your pipeline, as you can quickly
iterate over configuration changes without having to re-run the entire pipeline
every time -- only the steps that need to be re-run will be executed.
!!! info
Analyses are conducted on individual (per-subject) as well as group level.
## Flowchart
For more detailed information on each step, please refer to the [detailed list
of processing steps](steps.md).
"""

icon_map = {
@@ -61,25 +80,26 @@
("02", "03"),
("03", "04"),
("04", "05"),
("05", "06a"),
("05", "06a1"),
("06a1", "06a2"),
("05", "06b"),
("05", "07"),
# technically we could have the raw data flow here, but it doesn't really help
# ("05", "08a"),
# ("05", "08b"),
("06a", "08a"),
("07", "08a"),
("06a2", "08a"),
# Force the artifact-fitting and epoching steps on the same level, in this order
"""\
subgraph Z[" "]
direction LR
B06a
B06a1
B07
B06b
end
style Z fill:#0000,stroke-width:0px
""",
("06b", "08b"),
("07", "08a"),
("07", "08b"),
("08a", "09"),
("08b", "09"),
Expand Down Expand Up @@ -120,7 +140,10 @@
# Overview
overview_lines.append(
f"""\
## {module_header}
### {module_header}
<details>
<summary>Click to expand</summary>
```mermaid
flowchart TD"""
Expand Down Expand Up @@ -161,7 +184,7 @@
assert isinstance(a_b, tuple), type(a_b)
a_b = list(a_b) # allow modification
for ii, idx in enumerate(a_b):
assert idx in title_map, (dir_header, sorted(title_map))
assert idx in title_map, (dir_header, idx, sorted(title_map))
if idx not in mapped:
mapped.add(idx)
a_b[ii] = f'{idx}["{title_map[idx]}"]'
@@ -173,7 +196,7 @@
)
)
assert mapped == all_steps, all_steps.symmetric_difference(mapped)
overview_lines.append("```\n")
overview_lines.append("```\n\n</details>\n")

(out_dir / "steps.md").write_text("\n".join(lines), encoding="utf8")
(out_dir / "overview.md").write_text("\n".join(overview_lines), encoding="utf8")
2 changes: 2 additions & 0 deletions docs/source/v1.6.md.inc
@@ -10,6 +10,7 @@
- Added saving of detected blink and cardiac events used to calculate SSP projectors (#840 by @larsoner)
- Added [`noise_cov_method`][mne_bids_pipeline._config.noise_cov_method] to allow for the use of methods other than `"shrunk"` for noise covariance estimation (#854 by @larsoner)
- Added option to pass `image_kwargs` to [`mne.Report.add_epochs`] to allow adjusting e.g. `"vmin"` and `"vmax"` of the epochs image in the report via [`report_add_epochs_image_kwargs`][mne_bids_pipeline._config.report_add_epochs_image_kwargs] (#848 by @SophieHerbst)
- Split ICA fitting and artifact detection into separate steps. This means that now, ICA is split into a total of three consecutive steps: fitting, artifact detection, and the actual data cleaning step ("applying ICA"). This makes it easier to experiment with different settings for artifact detection without needing to re-fit ICA. (#865 by @larsoner)

[//]: # (### :warning: Behavior changes)

@@ -31,6 +32,7 @@
- Changed the default for [`ica_n_components`][mne_bids_pipeline._config.ica_n_components] from `0.8` (too conservative) to `None` to match MNE-Python's default (#853 by @larsoner)
- Prevent events table for the average subject overflowing in reports (#854 by @larsoner)
- Fixed split file behavior for Epochs when using ICA (#855 by @larsoner)
- Fixed a bug where users could not set `_components.tsv` as it would be detected as a cache miss and overwritten on next pipeline run (#865 by @larsoner)

### :medical_symbol: Code health

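The changelog entry above is the heart of this commit: ICA is now fit, artifact-detected, and applied in three separate cached steps, so artifact-detection settings can be tuned without re-fitting ICA. A hedged sketch of such an iteration from Python follows; it assumes `mne_bids_pipeline` is installed and accepts a `--steps` selector, with `preprocessing` being one of the documented step groups:

```python
import subprocess

# After tweaking only the ICA artifact-detection options in config.py,
# re-run the preprocessing group; cached, unaffected steps (including the
# expensive ICA fit) are skipped automatically.
subprocess.run(
    ["mne_bids_pipeline", "config.py", "--steps=preprocessing"],
    check=True,
)
```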
43 changes: 24 additions & 19 deletions mne_bids_pipeline/_report.py
@@ -30,27 +30,32 @@ def _open_report(
session: Optional[str],
run: Optional[str] = None,
task: Optional[str] = None,
fname_report: Optional[BIDSPath] = None,
name: str = "report",
):
fname_report = BIDSPath(
subject=subject,
session=session,
# Report is across all runs, but for logging purposes it's helpful
# to pass the run and task for gen_log_kwargs
run=None,
task=cfg.task,
acquisition=cfg.acq,
recording=cfg.rec,
space=cfg.space,
extension=".h5",
datatype=cfg.datatype,
root=cfg.deriv_root,
suffix="report",
check=False,
).fpath
if fname_report is None:
fname_report = BIDSPath(
subject=subject,
session=session,
# Report is across all runs, but for logging purposes it's helpful
# to pass the run and task for gen_log_kwargs
run=None,
task=cfg.task,
acquisition=cfg.acq,
recording=cfg.rec,
space=cfg.space,
extension=".h5",
datatype=cfg.datatype,
root=cfg.deriv_root,
suffix="report",
check=False,
)
fname_report = fname_report.fpath
assert fname_report.suffix == ".h5", fname_report.suffix
# prevent parallel file access
with FileLock(f"{fname_report}.lock"), _agg_backend():
if not fname_report.is_file():
msg = "Initializing report HDF5 file"
msg = f"Initializing {name} HDF5 file"
logger.info(**gen_log_kwargs(message=msg))
report = _gen_empty_report(
cfg=cfg,
@@ -62,7 +67,7 @@ def _open_report(
report = mne.open_report(fname_report)
except Exception as exc:
raise exc.__class__(
f"Could not open report HDF5 file:\n{fname_report}\n"
f"Could not open {name} HDF5 file:\n{fname_report}\n"
f"Got error:\n{exc}\nPerhaps you need to delete it?"
) from None
try:
@@ -80,7 +85,7 @@ def _open_report(
except Exception as exc:
logger.warning(f"Failed: {exc}")
fname_report_html = fname_report.with_suffix(".html")
msg = f"Saving report: {_linkfile(fname_report_html)}"
msg = f"Saving {name}: {_linkfile(fname_report_html)}"
logger.info(**gen_log_kwargs(message=msg))
report.save(fname_report, overwrite=True)
report.save(fname_report_html, overwrite=True, open_browser=False)
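`_open_report` now accepts an explicit `fname_report` and a display `name`, while keeping the existing serialization around the report files. That pattern -- a `FileLock` guarding both the HDF5 and HTML outputs of an `mne.Report` -- is sketched below outside the pipeline, with illustrative filenames (saving to HDF5 requires `h5io`):

```python
import mne
from filelock import FileLock

fname_h5 = "sub-01_task-rest_report.h5"  # illustrative output paths
fname_html = "sub-01_task-rest_report.html"

# Serialize access so parallel jobs cannot clobber the same report file,
# mirroring the FileLock usage in _open_report().
with FileLock(f"{fname_h5}.lock"):
    report = mne.Report(title="sub-01 example report")
    report.add_code(code="print('hello')", title="Example entry", language="python")
    report.save(fname_h5, overwrite=True)  # HDF5 copy for later re-opening
    report.save(fname_html, overwrite=True, open_browser=False)
```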
36 changes: 25 additions & 11 deletions mne_bids_pipeline/_run.py
@@ -4,6 +4,7 @@
import functools
import hashlib
import inspect
import json
import pathlib
import pdb
import sys
@@ -277,23 +278,36 @@ def clear(self) -> None:
self.memory.clear()


def save_logs(*, config: SimpleNamespace, logs) -> None: # TODO add type
def save_logs(*, config: SimpleNamespace, logs: list[pd.Series]) -> None:
fname = config.deriv_root / f"task-{get_task(config)}_log.xlsx"

# Get the script from which the function is called for logging
sheet_name = _short_step_path(_get_step_path()).replace("/", "-")
sheet_name = sheet_name[-30:] # shorten due to limit of excel format

df = pd.DataFrame(logs)

columns = df.columns
if "cfg" in columns:
columns = list(columns)
idx = columns.index("cfg")
del columns[idx]
columns.insert(-3, "cfg") # put it before time, success & err cols

df = df[columns]
# We need to make the logs more compact to be able to write Excel format
# (32767 char limit per cell), in particular the "cfg" column has very large
# cells, so replace the "cfg" column with separated cfg.* columns (still truncated
# to the 32767 char limit)
compact_logs = list()
for log in logs:
log = log.copy()
# 1. Remove indentation (e.g., 220814 chars to 54416)
cfg = json.loads(log["cfg"])
del log["cfg"]
assert cfg["__instance_type__"] == ["types", "SimpleNamespace"], cfg[
"__instance_type__"
]
for key, val in cfg["attributes"].items():
if isinstance(val, dict) and list(val.keys()) == ["__pathlib__"]:
val = val["__pathlib__"]
val = json.dumps(val, separators=(",", ":"))
if len(val) > 32767:
val = val[:32765] + " …"
log[f"cfg.{key}"] = val
compact_logs.append(log)
df = pd.DataFrame(compact_logs)
del logs, compact_logs

with FileLock(fname.with_suffix(fname.suffix + ".lock")):
append = fname.exists()
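The rewritten `save_logs` works around the xlsx format's 32,767-character per-cell limit by exploding the JSON-encoded `cfg` blob into compact per-attribute `cfg.*` columns. Below is a self-contained sketch of that flattening, using a made-up log record (real records are produced by the pipeline's step wrappers); writing the Excel file requires `openpyxl`:

```python
import json

import pandas as pd

EXCEL_CELL_LIMIT = 32767  # hard per-cell character limit of the xlsx format

# Made-up log record purely for illustration.
log = pd.Series(
    {
        "cfg": json.dumps(
            {
                "__instance_type__": ["types", "SimpleNamespace"],
                "attributes": {"task": "rest", "ch_types": ["meg", "eeg"]},
            },
            indent=4,
        ),
        "time": 1.23,
        "success": True,
    }
)

flat = log.copy()
cfg = json.loads(flat.pop("cfg"))
assert cfg["__instance_type__"] == ["types", "SimpleNamespace"]
for key, val in cfg["attributes"].items():
    compact = json.dumps(val, separators=(",", ":"))  # drop whitespace/indentation
    if len(compact) > EXCEL_CELL_LIMIT:
        compact = compact[: EXCEL_CELL_LIMIT - 2] + " …"
    flat[f"cfg.{key}"] = compact

pd.DataFrame([flat]).to_excel("log.xlsx", sheet_name="example", index=False)
```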