Skip to content

Commit

Permalink
MAINT: Use pooch for downloading web data (#775)
Browse files Browse the repository at this point in the history
  • Loading branch information
larsoner authored Aug 7, 2023
1 parent 721e728 commit 5c19b94
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 150 deletions.
14 changes: 9 additions & 5 deletions docs/source/examples/gen_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def _gen_demonstrated_funcs(example_config_path: Path) -> dict:
logger.warning(f"Dataset {dataset_name} has no HTML report.")
continue

options = DATASET_OPTIONS[dataset_options_key]
options = DATASET_OPTIONS[dataset_options_key].copy() # we modify locally

report_str = "\n## Generated output\n\n"
example_target_dir = this_dir / dataset_name
Expand Down Expand Up @@ -198,20 +198,24 @@ def _gen_demonstrated_funcs(example_config_path: Path) -> dict:
f"{fname.name} :fontawesome-solid-square-poll-vertical:</a>\n\n"
)

if options["openneuro"]:
assert sum(key in options for key in ("openneuro", "git", "web", "datalad")) == 1
if "openneuro" in options:
url = f'https://openneuro.org/datasets/{options["openneuro"]}'
elif options["git"]:
elif "git" in options:
url = options["git"]
elif options["web"]:
elif "web" in options:
url = options["web"]
else:
assert "datalad" in options # guaranteed above
url = ""

source_str = (
f"## Dataset source\n\nThis dataset was acquired from " f"[{url}]({url})\n"
)

if options["openneuro"]:
if "openneuro" in options:
for key in ("include", "exclude"):
options[key] = options.get(key, [])
download_str = (
f'\n??? example "How to download this dataset"\n'
f" Run in your terminal:\n"
Expand Down
1 change: 1 addition & 0 deletions docs/source/v1.5.md.inc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
- Fixed doc build errors and dependency specifications (#755 by @larsoner)
- Ensure `memory_file_method = "hash"` is tested (#768 by @larsoner)
- Enable [pre-commit.ci](https://pre-commit.ci) (#774 by @larsoner)
- Use `pooch` for web downloads (#775 by @larsoner)

### :bug: Bug fixes

Expand Down
105 changes: 36 additions & 69 deletions mne_bids_pipeline/_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ def _download_via_datalad(*, ds_name: str, ds_path: Path):
import datalad.api as dl

print('datalad installing "{}"'.format(ds_name))
git_url = DATASET_OPTIONS[ds_name]["git"]
options = DATASET_OPTIONS[ds_name]
git_url = options["git"]
assert "exclude" not in options
assert "hash" not in options
dataset = dl.install(path=ds_path, source=git_url)

# XXX: git-annex bug:
Expand All @@ -24,31 +27,35 @@ def _download_via_datalad(*, ds_name: str, ds_path: Path):
else:
n_jobs = 1

for to_get in DATASET_OPTIONS[ds_name]["include"]:
for to_get in DATASET_OPTIONS[ds_name].get("include", []):
print('datalad get data "{}" for "{}"'.format(to_get, ds_name))
dataset.get(to_get, jobs=n_jobs)


def _download_via_openneuro(*, ds_name: str, ds_path: Path):
    """Download the ``ds_name`` test dataset from OpenNeuro into ``ds_path``.

    The OpenNeuro dataset identifier and the optional ``include`` /
    ``exclude`` path lists are read from ``DATASET_OPTIONS[ds_name]``.
    Entries may omit ``include`` and ``exclude``; both then default to an
    empty list.
    """
    import openneuro

    options = DATASET_OPTIONS[ds_name]
    # Only the web downloader consumes a "hash"; catch misconfigured entries.
    assert "hash" not in options

    openneuro.download(
        dataset=options["openneuro"],
        target_dir=ds_path,
        include=options.get("include", []),
        exclude=options.get("exclude", []),
        verify_size=False,
    )


def _download_from_web(*, ds_name: str, ds_path: Path):
"""Retrieve Zip archives from a web URL."""
import cgi
import zipfile
import httpx
from tqdm import tqdm
import pooch

url = DATASET_OPTIONS[ds_name]["web"]
options = DATASET_OPTIONS[ds_name]
url = options["web"]
known_hash = options["hash"]
assert "exclude" not in options
assert "include" not in options
if ds_path.exists():
print(
"Dataset directory already exists; remove it if you wish to "
Expand All @@ -57,76 +64,36 @@ def _download_from_web(*, ds_name: str, ds_path: Path):
return

ds_path.mkdir(parents=True, exist_ok=True)

with httpx.Client(follow_redirects=True) as client:
with client.stream("GET", url=url) as response:
if not response.is_error:
pass # All good!
else:
raise RuntimeError(
f"Error {response.status_code} when trying " f"to download {url}"
)

header = response.headers["content-disposition"]
_, params = cgi.parse_header(header)
# where to store the archive
outfile = ds_path / params["filename"]
remote_file_size = int(response.headers["content-length"])

with open(outfile, mode="wb") as f:
with tqdm(
desc=params["filename"],
initial=0,
total=remote_file_size,
unit="B",
unit_scale=True,
unit_divisor=1024,
leave=False,
) as progress:
num_bytes_downloaded = response.num_bytes_downloaded

for chunk in response.iter_bytes():
f.write(chunk)
progress.update(
response.num_bytes_downloaded - num_bytes_downloaded
)
num_bytes_downloaded = response.num_bytes_downloaded

assert outfile.suffix == ".zip"

with zipfile.ZipFile(outfile) as zip:
for zip_info in zip.infolist():
path_in_zip = Path(zip_info.filename)
# omit top-level directory from Zip archive
target_path = str(Path(*path_in_zip.parts[1:]))
if str(target_path) in (".", ".."):
continue
if zip_info.filename.endswith("/"):
(ds_path / target_path).mkdir(parents=True, exist_ok=True)
continue
zip_info.filename = target_path
print(f"Extracting: {target_path}")
zip.extract(zip_info, ds_path)

outfile.unlink()
path = ds_path.parent.resolve(strict=True)
fname = f"{ds_name}.zip"
pooch.retrieve(
url=url,
path=path,
fname=fname,
processor=pooch.Unzip(extract_dir="."), # relative to path
progressbar=True,
known_hash=known_hash,
)
(path / f"{ds_name}.zip").unlink()


def _download(*, ds_name: str, ds_path: Path):
    """Dispatch the download of ``ds_name`` to the matching backend.

    Exactly one of the ``openneuro``, ``git``, ``osf`` or ``web`` keys must be
    set (non-empty) in ``DATASET_OPTIONS[ds_name]``; missing keys are treated
    as empty strings.

    Raises
    ------
    RuntimeError
        If the dataset is configured with an ``osf`` source.
    ValueError
        If no download location is configured (only reachable when the
        sanity-check assertion below is stripped, e.g. under ``python -O``).
    """
    options = DATASET_OPTIONS[ds_name]
    # Keys are optional in the options dict, so never index them directly.
    openneuro_name = options.get("openneuro", "")
    git_url = options.get("git", "")
    osf_node = options.get("osf", "")
    web_url = options.get("web", "")
    # A dataset must come from exactly one source.
    assert sum(bool(x) for x in (openneuro_name, git_url, osf_node, web_url)) == 1

    if openneuro_name:
        download_func = _download_via_openneuro
    elif git_url:
        download_func = _download_via_datalad
    elif osf_node:
        raise RuntimeError("OSF downloads are currently not supported.")
    elif web_url:
        download_func = _download_from_web
    else:
        raise ValueError("No download location was specified.")

    download_func(ds_name=ds_name, ds_path=ds_path)

Expand Down
72 changes: 13 additions & 59 deletions mne_bids_pipeline/tests/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,22 @@
from typing import Dict, List, TypedDict


# If not supplied below, the effective defaults are listed in comments
class DATASET_OPTIONS_T(TypedDict, total=False):
    """Download options for a single test dataset.

    All keys are optional (``total=False``). The trailing comment on each
    field shows the effective default applied when the key is omitted.
    """

    # Source locations; the download dispatcher expects exactly one to be set.
    git: str  # ""
    openneuro: str  # ""
    osf: str  # ""
    web: str  # ""
    # Paths to fetch / skip within the dataset.
    include: List[str]  # []
    exclude: List[str]  # []
    # Checksum handed to the web downloader as the known hash of the archive.
    hash: str  # ""


DATASET_OPTIONS: Dict[str, DATASET_OPTIONS_T] = {
"ERP_CORE": {
"git": "",
"openneuro": "",
"osf": "", # original dataset: '9f5w7'
# original dataset: "osf": "9f5w7"
"web": "https://osf.io/3zk6n/download?version=2",
"include": [],
"exclude": [],
"hash": "sha256:ddc94a7c9ba1922637f2770592dd51c019d341bf6bc8558e663e1979a4cb002f", # noqa: E501
},
"eeg_matchingpennies": {
# This dataset started out on osf.io as dataset https://osf.io/cj2dr
Expand All @@ -31,18 +30,12 @@ class DATASET_OPTIONS_T(TypedDict):
# "include": ["sub-05"],
#
# So now we mirror this datalad-fetched git repo back on osf.io!
"git": "",
"openneuro": "",
"osf": "", # original dataset: 'cj2dr'
# original dataset: "osf": "cj2dr"
"web": "https://osf.io/download/8rbfk?version=1",
"include": [],
"exclude": [],
"hash": "sha256:06bfbe52c50b9343b6b8d2a5de3dd33e66ad9303f7f6bfbe6868c3c7c375fafd", # noqa: E501
},
"ds003104": { # Anonymized "somato" dataset.
"git": "",
"openneuro": "ds003104",
"osf": "",
"web": "",
"include": ["sub-01", "derivatives/freesurfer/subjects"],
"exclude": [
"derivatives/freesurfer/subjects/01/mri/aparc+aseg.mgz",
Expand All @@ -51,30 +44,19 @@ class DATASET_OPTIONS_T(TypedDict):
],
},
"ds000246": {
"git": "",
"openneuro": "ds000246",
"osf": "",
"web": "",
"include": [
"sub-0001/meg/sub-0001_task-AEF_run-01_meg.ds",
"sub-0001/meg/sub-0001_task-AEF_run-01_meg.json",
"sub-0001/meg/sub-0001_task-AEF_run-01_channels.tsv",
],
"exclude": [],
},
"ds000247": {
"git": "",
"openneuro": "ds000247",
"osf": "",
"web": "",
"include": ["sub-0002/ses-01/meg"],
"exclude": [],
},
"ds000248": {
"git": "",
"openneuro": "ds000248",
"osf": "",
"web": "",
"include": ["sub-01", "sub-emptyroom", "derivatives/freesurfer/subjects"],
"exclude": [
"derivatives/freesurfer/subjects/fsaverage/mri/aparc.a2005s+aseg.mgz", # noqa: E501
Expand All @@ -88,10 +70,7 @@ class DATASET_OPTIONS_T(TypedDict):
],
},
"ds000117": {
"git": "",
"openneuro": "ds000117",
"osf": "",
"web": "",
"include": [
"sub-01/ses-meg/meg/sub-01_ses-meg_task-facerecognition_run-01_*", # noqa: E501
"sub-01/ses-meg/meg/sub-01_ses-meg_task-facerecognition_run-02_*", # noqa: E501
Expand All @@ -102,29 +81,17 @@ class DATASET_OPTIONS_T(TypedDict):
"derivatives/meg_derivatives/ct_sparse.fif",
"derivatives/meg_derivatives/sss_cal.dat",
],
"exclude": [],
},
"ds003775": {
"git": "",
"openneuro": "ds003775",
"osf": "",
"web": "",
"include": ["sub-010"],
"exclude": [],
},
"ds001810": {
"git": "",
"openneuro": "ds001810",
"osf": "",
"web": "",
"include": ["sub-01"],
"exclude": [],
},
"ds001971": {
"git": "",
"openneuro": "ds001971",
"osf": "",
"web": "",
"include": [
"sub-001/eeg/sub-001_task-AudioCueWalkingStudy_run-01_events.tsv",
"sub-001/eeg/sub-001_task-AudioCueWalkingStudy_run-01_eeg.set",
Expand All @@ -134,38 +101,25 @@ class DATASET_OPTIONS_T(TypedDict):
"sub-001/eeg/sub-001_task-AudioCueWalkingStudy_run-01_coordsystem.json", # noqa: E501
"sub-001/eeg/sub-001_task-AudioCueWalkingStudy_run-01_channels.tsv", # noqa: E501
],
"exclude": [],
},
"ds003392": {
"git": "",
"openneuro": "ds003392",
"osf": "",
"web": "",
"include": ["sub-01", "sub-emptyroom/ses-19111211"],
"exclude": [],
},
"ds004107": {
"git": "",
"openneuro": "ds004107",
"osf": "",
"web": "",
"include": [
"sub-mind002/ses-01/meg/*coordsystem*",
"sub-mind002/ses-01/meg/*auditory*",
],
"exclude": [],
},
"ds004229": {
"git": "",
"openneuro": "ds004229",
"osf": "",
"web": "",
"include": [
"sub-102",
"sub-emptyroom/ses-20000101",
"derivatives/meg_derivatives/ct_sparse.fif",
"derivatives/meg_derivatives/sss_cal.dat",
],
"exclude": [],
},
}
Loading

0 comments on commit 5c19b94

Please sign in to comment.