Skip to content

Commit

Permalink
Merge branch 'stage_ont' of https://github.com/ssjunnebo/TACA into st…
Browse files Browse the repository at this point in the history
…age_ont
  • Loading branch information
ssjunnebo committed Dec 4, 2024
2 parents 853655a + 7f3af94 commit cb9e918
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 8 deletions.
5 changes: 5 additions & 0 deletions VERSIONLOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
# TACA Version Log


## 20241204.1

Add support for staging ONT data on Miarka

## 20241128.1

Add automated cleanup to ONT transfer script.

## 20241127.1

Add support for organising ONT data on Miarka
Expand Down
70 changes: 62 additions & 8 deletions taca/nanopore/instrument_transfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@
from datetime import datetime as dt
from glob import glob

RUN_PATTERN = re.compile(
# Matches ONT run folder names of the form
#   yyyymmdd_HHMM_<position>_<flowCellId>_<randomHash>
# where <position> is either 1A-2H or the literal "MN19414".
# Flow cell IDs starting with "CTC" are configuration test cells; the negative
# lookahead (?!CTC) keeps them from matching.
# As of December 2023, the third column (3A-3H) is excluded, because it will be used by Clinical Genomics.
r"^\d{8}_\d{4}_(([1-2][A-H])|(MN19414))_(?!CTC)[A-Za-z0-9]+_[A-Za-z0-9]+$"
)


def main(args):
"""Find ONT runs and transfer them to storage.
Expand All @@ -24,25 +31,26 @@ def main(args):

logging.info("Starting script...")

run_pattern = re.compile(
# Run folder name expected as yyyymmdd_HHMM_1A-3H/MN19414_flowCellId_randomHash
# Flow cell names starting with "CTC" are configuration test cells and should not be included
# As of december 2023, the third column (3A-3H) is excluded, because it will be used by Clinical Genomics
r"^\d{8}_\d{4}_(([1-2][A-H])|(MN19414))_(?!CTC)[A-Za-z0-9]+_[A-Za-z0-9]+$"
)
rsync_log = os.path.join(args.source_dir, "rsync_log.txt")

logging.info("Parsing instrument position logs...")
position_logs = parse_position_logs(args.minknow_logs_dir)
logging.info("Subsetting QC and MUX metrics...")
pore_counts = get_pore_counts(position_logs)

handle_runs(pore_counts, args, rsync_log)
delete_archived_runs(args)


def handle_runs(pore_counts, args, rsync_log):
logging.info("Finding runs...")
# Look for dirs matching run pattern 3 levels deep from source
# Look for dirs matching run pattern 3 levels deep from source, excluding certain dirs
exclude_dirs = ["nosync", "keep_data", "cg_data"]
run_paths = [
path
for path in glob(os.path.join(args.source_dir, "*", "*", "*"), recursive=True)
if re.match(run_pattern, os.path.basename(path))
if re.match(RUN_PATTERN, os.path.basename(path))
and path.split(os.sep)[-3] not in exclude_dirs
]
logging.info(f"Found {len(run_paths)} runs...")

Expand All @@ -69,6 +77,52 @@ def main(args):
final_sync_to_storage(run_path, rsync_dest, args.archive_dir, rsync_log)


def delete_archived_runs(args):
    """Delete locally archived runs that also exist in the preproc archive.

    Directories three levels below ``args.archive_dir`` whose basename matches
    ``RUN_PATTERN`` are treated as locally archived runs. A run is removed
    when its name is listed in ``<args.dest_dir>/nosync`` or in
    ``<args.dest_dir>/nosync/archived``; all other runs are left in place.
    Afterwards, the parent ("sample") and grandparent ("experiment") dirs of
    the runs that were found are removed if they have become empty.

    Args:
        args: Object exposing the ``archive_dir`` and ``dest_dir`` attributes.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. when one of the two
            ``nosync`` dirs does not exist), ``shutil.rmtree`` or ``os.rmdir``.
    """
    logging.info("Finding locally archived runs...")
    # Runs live at <archive_dir>/<experiment>/<sample>/<run>.
    # NOTE: recursive=True has no effect here because the pattern has no "**".
    archive_glob = os.path.join(args.archive_dir, "*", "*", "*")
    run_paths = []
    for candidate in glob(archive_glob, recursive=True):
        if re.match(RUN_PATTERN, os.path.basename(candidate)):
            run_paths.append(candidate)
    logging.info(f"Found {len(run_paths)} locally archived runs...")

    # Names of everything stored in the two preproc archive locations
    nosync_dir = os.path.join(args.dest_dir, "nosync")
    preproc_archive_contents = set(os.listdir(nosync_dir))
    preproc_archive_contents.update(os.listdir(os.path.join(nosync_dir, "archived")))

    for run_path in run_paths:
        logging.info(f"Handling locally archived run {run_path}...")

        # Only delete runs that are confirmed to exist in the preproc archive
        if os.path.basename(run_path) not in preproc_archive_contents:
            logging.info(
                f"Locally archived run {run_path} was not found in the preproc archive. Skipping..."
            )
            continue

        logging.info(
            f"Locally archived run {run_path} was found in the preproc archive. Deleting..."
        )
        shutil.rmtree(run_path)

    # Sweep sample dirs first, so an experiment dir emptied by that sweep
    # is caught when the experiment dirs are checked.
    sample_dirs = {os.path.dirname(path) for path in run_paths}
    experiment_dirs = {os.path.dirname(path) for path in sample_dirs}

    for dir_level in (sample_dirs, experiment_dirs):
        for candidate_dir in dir_level:
            if not os.listdir(candidate_dir):
                logging.info(f"Removing empty dir '{candidate_dir}'.")
                os.rmdir(candidate_dir)


def sequencing_finished(run_path: str) -> bool:
sequencing_finished_indicator = "final_summary"
run_dir_content = os.listdir(run_path)
Expand Down
39 changes: 39 additions & 0 deletions tests/nanopore/test_instrument_transfer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import os
import re
import shutil
import tempfile
from unittest.mock import Mock, call, mock_open, patch

Expand Down Expand Up @@ -30,6 +31,8 @@ def setup_test_fixture():
for dir in [
args.source_dir,
args.dest_dir,
args.dest_dir + "/nosync",
args.dest_dir + "/nosync/archived",
args.dest_dir_qc,
args.archive_dir,
args.minknow_logs_dir,
Expand Down Expand Up @@ -76,6 +79,42 @@ def setup_test_fixture():
tmp.cleanup()


def test_main_delete(setup_test_fixture):
    """Check that remotely archived runs, and the dirs they leave empty,
    are deleted from the local archive while other runs are kept.
    """

    # Unpack fixture; only the args are needed here
    args, _tmp, _file_paths = setup_test_fixture

    # A run that exists both in the local archive and in the remote "nosync" dir
    experiment_dir = f"{args.archive_dir}/experiment"
    sample_dir = f"{experiment_dir}/sample"
    archived_run = f"{sample_dir}/{DUMMY_RUN_NAME}"
    os.makedirs(archived_run)
    os.makedirs(f"{args.dest_dir}/nosync/{DUMMY_RUN_NAME}")

    # A run that exists only in the local archive and must survive
    kept_run_name = DUMMY_RUN_NAME.replace("randomhash", "dontDeleteMe")
    kept_run = (
        f"{args.archive_dir}/innocent_experiment/innocent_sample/{kept_run_name}"
    )
    os.makedirs(kept_run)

    # Wrap the real functions so deletions still happen but calls are recorded
    with (
        patch("shutil.rmtree", side_effect=shutil.rmtree) as mock_rmtree,
        patch("os.rmdir", side_effect=os.rmdir) as mock_rmdir,
    ):
        instrument_transfer.main(args)

    # The archived run is deleted, followed by its emptied sample and experiment dirs
    mock_rmtree.assert_has_calls([call(archived_run)])
    mock_rmdir.assert_has_calls([call(sample_dir), call(experiment_dir)])
    assert os.path.exists(kept_run)


def test_main_ignore_CTC(setup_test_fixture):
"""Check so that runs on configuration test cells are not picked up."""

Expand Down

0 comments on commit cb9e918

Please sign in to comment.