diff --git a/src/nplinker/arranger.py b/src/nplinker/arranger.py
index e963e586..cfe78aa6 100644
--- a/src/nplinker/arranger.py
+++ b/src/nplinker/arranger.py
@@ -157,10 +157,14 @@ def _download_and_extract_gnps(self) -> None:
         Get the GNPS task ID from the PODP project JSON file, then download and extract the
         GNPS data to the default GNPS directory.
         """
-        podp_file = globals.DOWNLOADS_DEFAULT_PATH / f"paired_datarecord_{config.podp_id}.json"
+        podp_file = (
+            globals.DOWNLOADS_DEFAULT_PATH / f"paired_datarecord_{config.podp_id}.json"
+        )
         with open(podp_file, "r") as f:
             podp_json_data = json.load(f)
-        gnps_task_id = podp_json_data["metabolomics"]["project"].get("molecular_network")
+        gnps_task_id = podp_json_data["metabolomics"]["project"].get(
+            "molecular_network"
+        )
 
         data_archive = (
             GNPSDownloader(gnps_task_id, globals.DOWNLOADS_DEFAULT_PATH)
@@ -214,7 +218,9 @@ def _download_and_extract_antismash(self) -> None:
         Get the antiSMASH data from the PODP project JSON file, then download and extract the
         antiSMASH data to the default antiSMASH directory.
         """
-        podp_file = globals.DOWNLOADS_DEFAULT_PATH / f"paired_datarecord_{config.podp_id}.json"
+        podp_file = (
+            globals.DOWNLOADS_DEFAULT_PATH / f"paired_datarecord_{config.podp_id}.json"
+        )
         with open(podp_file, "r") as f:
             podp_json_data = json.load(f)
         podp_download_and_extract_antismash_data(
@@ -228,14 +234,16 @@ def arrange_bigscape(self) -> None:
         If `config.mode` is "podp", run BiG-SCAPE to generate the clustering file if it doesn't
         exist or remove the existing BiG-SCAPE data and re-run BiG-SCAPE if it is invalid.
         The running output of BiG-SCAPE will be saved to the directory "bigscape_running_output"
-        in the default BiG-SCAPE directory, and the clustering file "mix_clustering_c{config.bigscape.cutoff}.tsv"
-        will be copied to the default BiG-SCAPE directory.
+        in the default BiG-SCAPE directory, and the clustering file
+        "mix_clustering_c{config.bigscape.cutoff}.tsv" will be copied to the default BiG-SCAPE
+        directory.
 
         The validation process includes:
 
         - Check if the default BiG-SCAPE data directory exists.
         - Check if the clustering file "mix_clustering_c{config.bigscape.cutoff}.tsv" exists in the
           BiG-SCAPE data directory.
+        - Check if the 'data_sqlite.db' file exists in the BiG-SCAPE data directory.
         """
         pass_validation = False
         if config.mode == "podp":
@@ -307,7 +315,9 @@ def _validate_strain_mappings(self) -> None:
         strain_mappings_file = config.root_dir / STRAIN_MAPPINGS_FILENAME
         if not strain_mappings_file.exists():
-            raise FileNotFoundError(f"Strain mappings file not found at {strain_mappings_file}")
+            raise FileNotFoundError(
+                f"Strain mappings file not found at {strain_mappings_file}"
+            )
 
         with open(strain_mappings_file, "r") as f:
             json_data = json.load(f)
@@ -316,9 +326,15 @@ def _generate_strain_mappings(self) -> None:
         """Generate the strain mappings file for the PODP mode."""
-        podp_json_file = globals.DOWNLOADS_DEFAULT_PATH / f"paired_datarecord_{config.podp_id}.json"
-        genome_status_json_file = globals.DOWNLOADS_DEFAULT_PATH / GENOME_STATUS_FILENAME
-        genome_bgc_mappings_file = globals.ANTISMASH_DEFAULT_PATH / GENOME_BGC_MAPPINGS_FILENAME
+        podp_json_file = (
+            globals.DOWNLOADS_DEFAULT_PATH / f"paired_datarecord_{config.podp_id}.json"
+        )
+        genome_status_json_file = (
+            globals.DOWNLOADS_DEFAULT_PATH / GENOME_STATUS_FILENAME
+        )
+        genome_bgc_mappings_file = (
+            globals.ANTISMASH_DEFAULT_PATH / GENOME_BGC_MAPPINGS_FILENAME
+        )
         gnps_file_mapping_file = self.gnps_file_mappings_file
         strain_mappings_file = config.root_dir / STRAIN_MAPPINGS_FILENAME
@@ -417,7 +433,9 @@ def validate_antismash(antismash_dir: Path) -> None:
         ValueError: If any sub-directory name contains a space.
     """
     if not antismash_dir.exists():
-        raise FileNotFoundError(f"antiSMASH data directory not found at {antismash_dir}")
+        raise FileNotFoundError(
+            f"antiSMASH data directory not found at {antismash_dir}"
+        )
 
     sub_dirs = list_dirs(antismash_dir)
     if not sub_dirs:
@@ -435,7 +453,9 @@ def validate_antismash(antismash_dir: Path) -> None:
         gbk_files = list_files(sub_dir, suffix=".gbk", keep_parent=False)
         bgc_files = fnmatch.filter(gbk_files, "*.region???.gbk")
         if not bgc_files:
-            raise FileNotFoundError(f"No BGC files found in antiSMASH sub-directory {sub_dir}")
+            raise FileNotFoundError(
+                f"No BGC files found in antiSMASH sub-directory {sub_dir}"
+            )
 
 
 def validate_bigscape(bigscape_dir: Path) -> None:
@@ -445,6 +465,10 @@ def validate_bigscape(bigscape_dir: Path) -> None:
     "mix_clustering_c{config.bigscape.cutoff}.tsv" where {config.bigscape.cutoff} is the
     bigscape cutoff value set in the config file.
 
+    Alternatively, the directory can contain the BiG-SCAPE database file generated by BiG-SCAPE v2.
+    At the moment, all the family assignments in the database will be used, so this database should
+    contain results from a single run with the desired cutoff.
+
     Args:
         bigscape_dir: Path to the BiG-SCAPE data directory.
@@ -455,5 +479,8 @@ def validate_bigscape(bigscape_dir: Path) -> None:
        raise FileNotFoundError(f"BiG-SCAPE data directory not found at {bigscape_dir}")
 
     clustering_file = bigscape_dir / f"mix_clustering_c{config.bigscape.cutoff}.tsv"
-    if not clustering_file.exists():
-        raise FileNotFoundError(f"BiG-SCAPE clustering file not found: {clustering_file}")
+    database_file = bigscape_dir / "data_sqlite.db"
+    if not clustering_file.exists() and not database_file.exists():
+        raise FileNotFoundError(
+            f"BiG-SCAPE data not found in {clustering_file} or {database_file}"
+        )
diff --git a/src/nplinker/genomics/bigscape/__init__.py b/src/nplinker/genomics/bigscape/__init__.py
index 2e92197c..82c39239 100644
--- a/src/nplinker/genomics/bigscape/__init__.py
+++ b/src/nplinker/genomics/bigscape/__init__.py
@@ -1,8 +1,9 @@
 import logging
 from .bigscape_loader import BigscapeGCFLoader
+from .bigscape_loader import BigscapeV2GCFLoader
 from .runbigscape import run_bigscape
 
 
 logging.getLogger(__name__).addHandler(logging.NullHandler())
 
-__all__ = ["BigscapeGCFLoader", "run_bigscape"]
+__all__ = ["BigscapeGCFLoader", "BigscapeV2GCFLoader", "run_bigscape"]
diff --git a/src/nplinker/genomics/bigscape/bigscape_loader.py b/src/nplinker/genomics/bigscape/bigscape_loader.py
index af468d6f..e1142c46 100644
--- a/src/nplinker/genomics/bigscape/bigscape_loader.py
+++ b/src/nplinker/genomics/bigscape/bigscape_loader.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 import csv
+import sqlite3
 from os import PathLike
 from nplinker.logconfig import LogConfig
 from ..abc import GCFLoaderBase
@@ -59,3 +60,78 @@ def _parse_gcf(cluster_file: str) -> list[GCF]:
 
 # register as virtual class to prevent metaclass conflicts
 GCFLoaderBase.register(BigscapeGCFLoader)
+
+
+class BigscapeV2GCFLoader:
+    def __init__(self, db_file: str | PathLike, /) -> None:
+        """Build a loader for BiG-SCAPE v2 database file.
+
+        Args:
+            db_file: Path to the BiG-SCAPE v2 database file
+
+        Attributes:
+            db_file: path to the BiG-SCAPE database file.
+        """
+        self.db_file = str(db_file)
+        self._gcf_list = self._parse_gcf(self.db_file)
+
+    def get_gcfs(self, keep_mibig_only: bool = False, keep_singleton: bool = False) -> list[GCF]:
+        """Get all GCF objects.
+
+        Args:
+            keep_mibig_only: True to keep GCFs that contain only MIBiG
+                BGCs.
+            keep_singleton: True to keep singleton GCFs. A singleton GCF
+                is a GCF that contains only one BGC.
+
+        Returns:
+            a list of GCF objects.
+        """
+        gcf_list = self._gcf_list
+        if not keep_mibig_only:
+            gcf_list = [gcf for gcf in gcf_list if not gcf.has_mibig_only()]
+        if not keep_singleton:
+            gcf_list = [gcf for gcf in gcf_list if not gcf.is_singleton()]
+        return gcf_list
+
+    @staticmethod
+    def _parse_gcf(db_file: str) -> list[GCF]:
+        """Get GCF objects from database.
+
+        Args:
+            db_file: Path to the sqlite3 database file.
+
+        Returns:
+            A list of GCF objects
+        """
+        gcf_dict: dict[str, GCF] = {}
+
+        with sqlite3.connect(db_file) as connection:
+            cursor = connection.cursor()
+
+            query = """
+            SELECT gbk.path, bgc_record_family.family_id FROM bgc_record_family
+            JOIN bgc_record ON bgc_record.id = bgc_record_family.record_id
+            JOIN gbk ON gbk.id = bgc_record.gbk_id
+            """
+
+            results = cursor.execute(query).fetchall()
+
+            for result in results:
+                gbk_path, family_id = result
+
+                # take the filename of the gbk path as the bgc_id
+                bgc_id: str = gbk_path.split("/")[-1]
+                # remove extension
+                bgc_id = bgc_id.rsplit(".", 1)[0]
+
+                if family_id not in gcf_dict:
+                    gcf_dict[family_id] = GCF(family_id)
+                gcf_dict[family_id].bgc_ids.add(bgc_id)
+
+        return list(gcf_dict.values())
+
+
+# register as virtual class to prevent metaclass conflicts
+GCFLoaderBase.register(BigscapeV2GCFLoader)
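The new loader is API-compatible with `BigscapeGCFLoader`, so downstream code only needs to swap the constructor. A brief usage sketch (the database path is hypothetical):

```python
from nplinker.genomics.bigscape import BigscapeV2GCFLoader

# point the loader at a BiG-SCAPE v2 output database (hypothetical path)
loader = BigscapeV2GCFLoader("output/bigscape/data_sqlite.db")

# defaults drop MIBiG-only and singleton GCFs, same as the v1 loader
gcfs = loader.get_gcfs()

# keep everything stored in the database
all_gcfs = loader.get_gcfs(keep_mibig_only=True, keep_singleton=True)
print(f"{len(all_gcfs)} GCFs total, {len(gcfs)} after filtering")
```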
diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py
index fb313217..b7ae98c1 100644
--- a/src/nplinker/loader.py
+++ b/src/nplinker/loader.py
@@ -4,6 +4,7 @@
 from nplinker.config import config
 from nplinker.genomics.antismash import AntismashBGCLoader
 from nplinker.genomics.bigscape import BigscapeGCFLoader
+from nplinker.genomics.bigscape import BigscapeV2GCFLoader
 from nplinker.genomics.mibig import MibigLoader
 from nplinker.genomics.utils import add_bgc_to_gcf
 from nplinker.genomics.utils import add_strain_to_bgc
@@ -159,7 +160,21 @@ def _load_genomics(self):
         bigscape_cluster_file = (
             globals.BIGSCAPE_DEFAULT_PATH / f"mix_clustering_c{config.bigscape.cutoff}.tsv"
         )
-        raw_gcfs = BigscapeGCFLoader(bigscape_cluster_file).get_gcfs()
+        bigscape_db_file = globals.BIGSCAPE_DEFAULT_PATH / "data_sqlite.db"
+
+        # switch depending on found file. prefer V1 if both are found
+        if bigscape_cluster_file.exists():
+            loader = BigscapeGCFLoader(bigscape_cluster_file)
+            logger.debug(f"Loading BigSCAPE cluster file {bigscape_cluster_file}")
+        elif bigscape_db_file.exists():
+            loader = BigscapeV2GCFLoader(bigscape_db_file)
+            logger.debug(f"Loading BigSCAPE database file {bigscape_db_file}")
+        else:
+            raise FileNotFoundError(
+                f"Neither BigSCAPE cluster file {bigscape_cluster_file} nor database file {bigscape_db_file} were found."
+            )
+
+        raw_gcfs = loader.get_gcfs()
 
         # Step 5: add BGC objects to GCF
         all_gcfs_with_bgc, _, _ = add_bgc_to_gcf(all_bgcs_with_strain, raw_gcfs)
diff --git a/tests/unit/data/bigscape/mix/data_sqlite.db b/tests/unit/data/bigscape/mix/data_sqlite.db
new file mode 100644
index 00000000..53a17b9f
Binary files /dev/null and b/tests/unit/data/bigscape/mix/data_sqlite.db differ
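Note the fallback order in `_load_genomics`: when both files are present the v1 clustering file wins, so forcing the v2 path requires that no `mix_clustering_c*.tsv` sits in the BiG-SCAPE directory. Since the v2 loader reads family assignments straight from SQLite, a database can also be sanity-checked by hand with the same join the loader uses (the path below is hypothetical):

```python
import sqlite3

# run the loader's query manually to inspect family assignments (hypothetical path)
with sqlite3.connect("output/bigscape/data_sqlite.db") as connection:
    rows = connection.execute(
        """
        SELECT gbk.path, bgc_record_family.family_id FROM bgc_record_family
        JOIN bgc_record ON bgc_record.id = bgc_record_family.record_id
        JOIN gbk ON gbk.id = bgc_record.gbk_id
        """
    ).fetchall()

for gbk_path, family_id in rows:
    print(family_id, gbk_path)
```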
diff --git a/tests/unit/genomics/test_bigscape_loader.py b/tests/unit/genomics/test_bigscape_loader.py
index e3d28a4a..49b4d2e7 100644
--- a/tests/unit/genomics/test_bigscape_loader.py
+++ b/tests/unit/genomics/test_bigscape_loader.py
@@ -2,6 +2,7 @@
 from nplinker.genomics import GCF
 from nplinker.genomics.abc import GCFLoaderBase
 from nplinker.genomics.bigscape import BigscapeGCFLoader
+from nplinker.genomics.bigscape.bigscape_loader import BigscapeV2GCFLoader
 from .. import DATA_DIR
@@ -37,3 +38,35 @@ def test_parse_gcf(self, loader):
         assert len(gcf_list) == 5
         for gcf in gcf_list:
             assert isinstance(gcf, GCF)
+
+
+class TestBigscapeV2GCFLoader:
+    @pytest.fixture
+    def loader(self):
+        db_file = DATA_DIR / "bigscape" / "mix" / "data_sqlite.db"
+        loader = BigscapeV2GCFLoader(db_file)
+        yield loader
+
+    def test_abc(self, loader):
+        assert issubclass(BigscapeV2GCFLoader, GCFLoaderBase)
+        assert isinstance(loader, GCFLoaderBase)
+
+    def test_init(self, loader):
+        assert loader.db_file == str(DATA_DIR / "bigscape" / "mix" / "data_sqlite.db")
+
+    @pytest.mark.parametrize(
+        "keep_mibig_only, keep_singleton, expected",
+        [(False, False, 1), (True, False, 2), (False, True, 2), (True, True, 4)],
+    )
+    def test_get_gcfs_v2(self, loader, keep_mibig_only, keep_singleton, expected):
+        gcfs = loader.get_gcfs(keep_mibig_only, keep_singleton)
+        assert isinstance(gcfs, list)
+        assert len(gcfs) == expected
+        assert isinstance(gcfs[0], GCF)
+
+    def test_parse_gcf_v2(self, loader):
+        gcf_list = BigscapeV2GCFLoader._parse_gcf(loader.db_file)
+        assert isinstance(gcf_list, list)
+        assert len(gcf_list) == 4
+        for gcf in gcf_list:
+            assert isinstance(gcf, GCF)
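The parametrized expectations imply the fixture database holds 4 GCFs in total, of which only 1 survives when both MIBiG-only and singleton GCFs are filtered out. The new test class can be run in isolation with standard pytest node-id syntax, e.g. programmatically:

```python
# equivalent to:
#   pytest tests/unit/genomics/test_bigscape_loader.py::TestBigscapeV2GCFLoader -v
import pytest

pytest.main(
    ["tests/unit/genomics/test_bigscape_loader.py::TestBigscapeV2GCFLoader", "-v"]
)
```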