diff --git a/VERSIONLOG.md b/VERSIONLOG.md index 6d9aca7a..22377af2 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -1,9 +1,18 @@ # TACA Version Log -## 20241204.2 +## 20241210.2 Add support for uploading ONT data to DDS +## 20241210.1 + +Tweaks and bugfixes for ToulligQC. + +## 20241204.2 + +Add automated QC reports with ToulligQC for ONT. + ## 20241204.1 + Add support for staging ONT data on Miarka ## 20241128.1 diff --git a/taca/analysis/analysis_nanopore.py b/taca/analysis/analysis_nanopore.py index 1c8a2929..740bafee 100644 --- a/taca/analysis/analysis_nanopore.py +++ b/taca/analysis/analysis_nanopore.py @@ -97,6 +97,12 @@ def process_user_run(ont_user_run: ONT_user_run): logger.info(f"{ont_user_run.run_name}: Putting HTML report on GenStat...") ont_user_run.copy_html_report() + # Generate and publish TouliggQC report + logger.info( + f"{ont_user_run.run_name}: Generating and publishing ToulligQC report..." + ) + ont_user_run.toulligqc_report() + # Copy metadata logger.info(f"{ont_user_run.run_name}: Copying metadata...") ont_user_run.copy_metadata() @@ -166,6 +172,10 @@ def process_qc_run(ont_qc_run: ONT_qc_run): logger.info(f"{ont_qc_run.run_name}: Putting HTML report on GenStat...") ont_qc_run.copy_html_report() + # Generate and publish TouliggQC report + logger.info(f"{ont_qc_run.run_name}: Generating and publishing ToulligQC report...") + ont_qc_run.toulligqc_report() + # Look at seq data if not ont_qc_run.has_raw_seq_output(): logger.info(f"{ont_qc_run.run_name}: Run has no sequencing output, continuing") diff --git a/taca/delivery/cli.py b/taca/delivery/cli.py index 39a2fb6f..f8144983 100644 --- a/taca/delivery/cli.py +++ b/taca/delivery/cli.py @@ -52,7 +52,7 @@ def stage(project, flowcells, samples): envvar="ORDER_PORTAL", required=True, type=click.File("r"), - help="Path to order portal credantials to retrive project information", + help="Path to order portal credentials to retrieve project information", ) @click.option( "--statusdb_config", diff --git a/taca/delivery/delivery_classes.py b/taca/delivery/delivery_classes.py index 883f82d8..a8f65433 100644 --- a/taca/delivery/delivery_classes.py +++ b/taca/delivery/delivery_classes.py @@ -218,16 +218,17 @@ def get_other_member_details( other_member_details = [] if not ignore_orderportal_members: logger.info("Fetching additional members from order portal.") - try: - owner_email = self.order_details.get("owner", {}).get("email") + if self.order_details.get("owner", {}): + owner_email = self.order_details.get("owner").get("email", {}) if ( owner_email and owner_email != self.pi_email and owner_email not in other_member_details ): other_member_details.append(owner_email) - bioinfo_email = self.order_details.get("fields", {}).get( - "project_bx_email" + if self.order_details.get("fields", {}): + bioinfo_email = self.order_details.get("fields").get( + "project_bx_email", {} ) if ( bioinfo_email @@ -235,8 +236,8 @@ def get_other_member_details( and bioinfo_email not in other_member_details ): other_member_details.append(bioinfo_email) - lab_email = self.order_details.get("fields", {}).get( - "project_lab_email" + lab_email = self.order_details.get("fields").get( + "project_lab_email", {} ) if ( lab_email @@ -244,12 +245,15 @@ def get_other_member_details( and lab_email not in other_member_details ): other_member_details.append(lab_email) - except (AssertionError, ValueError): - pass # nothing to worry, just move on + if other_member_details: + logger.info( + "The following additional contacts were found and will be added " + f"to the DDS delivery project: {', '.join(other_member_details)}" + ) if other_member_emails: logger.info( - "Other appropriate contacts were found, they will be added " - f"to DDS delivery project: {', '.join(other_member_emails)}" + "The following additional contacts were provided by the operator, " + f"they will be added to the DDS delivery project: {', '.join(other_member_emails)}" ) for email in other_member_emails: if email not in other_member_details: diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index fa55d013..789bcfd3 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -61,6 +61,10 @@ def __init__(self, run_abspath: str): # Get attributes from config self.minknow_reports_dir = CONFIG["nanopore_analysis"]["minknow_reports_dir"] + self.toulligqc_reports_dir = CONFIG["nanopore_analysis"][ + "toulligqc_reports_dir" + ] + self.toulligqc_executable = CONFIG["nanopore_analysis"]["toulligqc_executable"] self.analysis_server = CONFIG["nanopore_analysis"].get("analysis_server", None) self.rsync_options = CONFIG["nanopore_analysis"]["rsync_options"] for k, v in self.rsync_options.items(): @@ -106,6 +110,14 @@ def assert_contents(self): assert self.has_file("/.sync_finished") assert self.has_file("/final_summary*.txt") + # Raw seq files + assert any( + [ + dir in os.listdir(self.run_abspath) + for dir in ["pod5", "pod5_pass", "fast5", "fast5_pass"] + ] + ) + # NGI files from instrument assert self.has_file("/pore_count_history.csv") assert self.has_file("/run_path.txt") @@ -338,6 +350,116 @@ def copy_html_report(self): logger.error(msg) raise RsyncError(msg) + def toulligqc_report(self): + """Generate a QC report for the run using ToulligQC and publish it to GenStat.""" + + # Get sequencing summary file + glob_summary = glob.glob(f"{self.run_abspath}/sequencing_summary*.txt") + assert len(glob_summary) == 1, f"Found {len(glob_summary)} summary files" + summary = glob_summary[0] + + # Determine the format of the raw sequencing data, sorted by preference + raw_data_dir_options = [ + "pod5_pass", + "pod5", + "fast5_pass", + "fast5", + ] + raw_data_dir = None + for raw_data_dir_option in raw_data_dir_options: + if os.path.exists(f"{self.run_abspath}/{raw_data_dir_option}"): + raw_data_dir = raw_data_dir_option + break + if raw_data_dir is None: + raise AssertionError(f"No seq data found in {self.run_abspath}") + raw_data_format = "pod5" if "pod5" in raw_data_dir else "fast5" + + # Load samplesheet, if any + ss_glob = glob.glob(f"{self.run_abspath}/sample_sheet*.csv") + if len(ss_glob) == 0: + samplesheet = None + elif len(ss_glob) > 1: + # If multiple samplesheet, use latest one + samplesheet = ss_glob.sort()[-1] + else: + samplesheet = ss_glob[0] + + # Run has barcode subdirs + barcode_dirs_glob = glob.glob(f"{self.run_abspath}/{raw_data_dir}/barcode*") + if len(barcode_dirs_glob) > 0: + barcode_dirs = True + raw_data_path = barcode_dirs_glob[0] + else: + barcode_dirs = False + raw_data_path = f"{self.run_abspath}/{raw_data_dir}" + + # Determine barcodes + if samplesheet: + ss_df = pd.read_csv(samplesheet) + if "barcode" in ss_df.columns: + ss_barcodes = list(ss_df["barcode"].unique()) + ss_barcodes.sort() + barcode_nums = [int(bc[-2:]) for bc in ss_barcodes] + # If barcodes are numbered sequentially, write arg as range + if barcode_nums == list(range(1, len(barcode_nums) + 1)): + barcodes_arg = f"{ss_barcodes[0]}:{ss_barcodes[-1]}" + else: + barcodes_arg = ":".join(ss_barcodes) + else: + ss_barcodes = None + + command_args = { + "--sequencing-summary-source": summary, + f"--{raw_data_format}-source": raw_data_path, + "--output-directory": self.run_abspath, + "--report-name": "toulligqc_report", + } + if barcode_dirs: + command_args["--barcoding"] = "" + if samplesheet and ss_barcodes: + command_args["--samplesheet"] = samplesheet + command_args["--barcodes"] = barcodes_arg + + # Build command list + command_list = [self.toulligqc_executable] + for k, v in command_args.items(): + command_list.append(k) + if v: + command_list.append(v) + + # Run the command + # Small enough to wait for, should be done in 1-5 minutes + process = subprocess.run(command_list) + + # Check if the command was successful + if process.returncode == 0: + logging.info(f"{self.run_name}: ToulligQC report generated successfully.") + else: + raise subprocess.CalledProcessError(process.returncode, command_list) + + # Copy the report to GenStat + + logger.info( + f"{self.run_name}: Transferring ToulligQC report to ngi-internal..." + ) + # Transfer the ToulligQC .html report file to ngi-internal, renaming it to the full run ID. Requires password-free SSH access. + report_src_path = self.get_file("/toulligqc_report/report.html") + report_dest_path = os.path.join( + self.toulligqc_reports_dir, + f"report_{self.run_name}.html", + ) + transfer_object = RsyncAgent( + src_path=report_src_path, + dest_path=report_dest_path, + validate=False, + ) + try: + transfer_object.transfer() + except RsyncError: + msg = f"{self.run_name}: An error occurred while attempting to transfer the report {report_src_path} to {report_dest_path}." + logger.error(msg) + raise RsyncError(msg) + # Transfer run def transfer_run(self): @@ -499,7 +621,7 @@ def has_fastq_output(self) -> bool: def has_raw_seq_output(self) -> bool: """Check whether run has sequencing data output.""" - raw_seq_dirs = ["pod5", "fast5_pass"] + raw_seq_dirs = ["pod5", "pod5_pass", "fast5", "fast5_pass"] for dir in raw_seq_dirs: if os.path.exists(os.path.join(self.run_abspath, dir)): diff --git a/tests/analysis/test_analysis_nanopore.py b/tests/analysis/test_analysis_nanopore.py index 30a7105a..565c919d 100644 --- a/tests/analysis/test_analysis_nanopore.py +++ b/tests/analysis/test_analysis_nanopore.py @@ -1,5 +1,6 @@ import importlib import logging +import os import subprocess from io import StringIO from unittest.mock import patch @@ -21,28 +22,25 @@ def parametrize_testruns(): """ parameter_string_table = """ - desc instrument qc run_finished sync_finished raw_dirs fastq_dirs barcode_dirs anglerfish_samplesheets anglerfish_ongoing anglerfish_exit - prom_ongoing promethion False False False False False False False False NA - prom_done promethion False True False False False False False False NA - prom_synced promethion False True True False False False False False NA - prom_reads promethion False True True True False False False False NA - prom_fastq promethion False True True True True False False False NA - prom_bcs promethion False True True True True True False False NA - min_ongoing minion False False False False False False False False NA - min_done minion False True False False False False False False NA - min_synced minion False True True False False False False False NA - min_reads minion False True True True False False False False NA - min_fastq minion False True True True True False False False NA - min_bcs minion False True True True True True False False NA - min_qc_ongoing minion True False False False False False False False NA - min_qc_done minion True True False False False False False False NA - min_qc_synced minion True True True False False False False False NA - min_qc_reads minion True True True True False False False False NA - min_qc_fastq minion True True True True True False False False NA - min_qc_bcs minion True True True True True True False False NA - min_qc_ang_ss minion True True True True True True True False NA - min_qc_ang_run minion True True True True True True True True NA - min_qc_ang_done minion True True True True True True True False 0 + desc instrument qc run_finished sync_finished fastq_dirs barcode_dirs anglerfish_samplesheets anglerfish_ongoing anglerfish_exit + prom_ongoing promethion False False False False False False False NA + prom_done promethion False True False False False False False NA + prom_synced promethion False True True False False False False NA + prom_fastq promethion False True True True False False False NA + prom_bcs promethion False True True True True False False NA + min_ongoing minion False False False False False False False NA + min_done minion False True False False False False False NA + min_synced minion False True True False False False False NA + min_fastq minion False True True True False False False NA + min_bcs minion False True True True True False False NA + min_qc_ongoing minion True False False False False False False NA + min_qc_done minion True True False False False False False NA + min_qc_synced minion True True True False False False False NA + min_qc_fastq minion True True True True False False False NA + min_qc_bcs minion True True True True True False False NA + min_qc_ang_ss minion True True True True True True False NA + min_qc_ang_run minion True True True True True True True NA + min_qc_ang_done minion True True True True True True False 0 """ # Turn string table to datastream @@ -51,6 +49,9 @@ def parametrize_testruns(): # Read data, trimming whitespace df = pd.read_csv(data, sep=r"\s+") + # Fix data types + df.anglerfish_exit = df.anglerfish_exit[df.anglerfish_exit.notna()].astype("Int64") + # Replace nan(s) with None(s) df = df.replace(np.nan, None) @@ -100,17 +101,34 @@ def test_ont_transfer(create_dirs, run_properties, caplog): # Mock subprocess.Popen ONLY for Anglerfish original_popen = subprocess.Popen - def side_effect(*args, **kwargs): + def mock_Popen_side_effect(*args, **kwargs): if "anglerfish" in args[0]: return mock_Popen else: return original_popen(*args, **kwargs) mock_Popen = patch( - "taca.nanopore.ONT_run_classes.subprocess.Popen", side_effect=side_effect + "taca.nanopore.ONT_run_classes.subprocess.Popen", + side_effect=mock_Popen_side_effect, ).start() mock_Popen.pid = 1337 # Nice + # Mock subprocess.run ONLY for ToulligQC + original_run = subprocess.run + + def mock_run_side_effect(*args, **kwargs): + if "toulligqc" in args[0]: + os.mkdir(f"{args[0][6]}/toulligqc_report") + open(f"{args[0][6]}/toulligqc_report/report.html", "w").close() + return mock_run + else: + return original_run(*args, **kwargs) + + mock_run = patch( + "taca.nanopore.ONT_run_classes.subprocess.run", side_effect=mock_run_side_effect + ).start() + mock_run.returncode = 0 + # Reload module to implement mocks importlib.reload(analysis_nanopore) @@ -122,7 +140,6 @@ def side_effect(*args, **kwargs): script_files=True, run_finished=run_properties.pop("run_finished"), sync_finished=run_properties.pop("sync_finished"), - raw_dirs=run_properties.pop("raw_dirs"), fastq_dirs=run_properties.pop("fastq_dirs"), barcode_dirs=run_properties.pop("barcode_dirs"), anglerfish_samplesheets=run_properties.pop("anglerfish_samplesheets"), diff --git a/tests/conftest.py b/tests/conftest.py index 8b53b4e3..eeafd79b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -25,7 +25,10 @@ def create_dirs(): │ ├── minion │ │ └── qc │ └── promethion - ├── minknow_reports + ├── ngi-internal + │ ├── minknow_reports + │ └── other_reports + │ └── toulligqc_reports ├── ngi-nas-ns │ ├── Aviti_data │ ├── NextSeq_data @@ -91,8 +94,11 @@ def create_dirs(): os.makedirs(f"{tmp.name}/ngi-nas-ns/samplesheets/NovaSeqXPlus") os.makedirs(f"{tmp.name}/ngi-nas-ns/samplesheets/Aviti") + # GenStat + os.makedirs(f"{tmp.name}/ngi-internal/minknow_reports") + os.makedirs(f"{tmp.name}/ngi-internal/other_reports/toulligqc_reports") + # Misc. ONT dirs/files - os.makedirs(f"{tmp.name}/minknow_reports") os.makedirs(f"{tmp.name}/log") open(f"{tmp.name}/log/transfer_promethion.tsv", "w").close() open(f"{tmp.name}/log/transfer_minion.tsv", "w").close() diff --git a/tests/nanopore/test_ONT_run_classes.py b/tests/nanopore/test_ONT_run_classes.py index 99cd558f..6e9d1d84 100644 --- a/tests/nanopore/test_ONT_run_classes.py +++ b/tests/nanopore/test_ONT_run_classes.py @@ -49,7 +49,9 @@ def make_ONT_test_config(tmp: tempfile.TemporaryDirectory) -> dict: anglerfish: anglerfish_samplesheets_dir: {tmp.name}/ngi-nas-ns/samplesheets/anglerfish anglerfish_path: mock - minknow_reports_dir: {tmp.name}/minknow_reports/ + minknow_reports_dir: {tmp.name}/ngi-internal/minknow_reports/ + toulligqc_reports_dir: {tmp.name}/ngi-internal/other_reports/toulligqc_reports/ + toulligqc_executable: toulligqc rsync_options: '-Lav': None '-r': None @@ -90,7 +92,6 @@ def create_ONT_run_dir( script_files: bool = False, run_finished: bool = False, sync_finished: bool = False, - raw_dirs: bool = False, fastq_dirs: bool = False, barcode_dirs: bool = False, anglerfish_samplesheets: bool = False, @@ -132,9 +133,15 @@ def create_ONT_run_dir( write_pore_count_history(run_path, flowcell_id, instrument_position) if run_finished: + # Raw seq data + os.mkdir(f"{run_path}/pod5_pass") + # Run summary .txt open(f"{run_path}/final_summary_{run_name}.txt", "w").close() + # Sequencing summary .txt + open(f"{run_path}/sequencing_summary_{run_name}.txt", "w").close() + # Run report .html open(f"{run_path}/report_{run_name}.html", "w").close() @@ -188,9 +195,6 @@ def create_ONT_run_dir( with open(f"{run_path}/.anglerfish_done", "w") as f: f.write(str(anglerfish_exit)) - if raw_dirs: - os.mkdir(f"{run_path}/pod5_pass") - if fastq_dirs: os.mkdir(f"{run_path}/fastq_pass")