Skip to content

Commit

Permalink
improving type hints and usage of abc (#259)
Browse files Browse the repository at this point in the history
* add PathLike for input files

* add PathLike for input directory

* add import of annotations

* fix bug in unit tests

* annotate defaults with type Final

* add type alias

* change type var to type alias

* remove literal type hints

* remove unused type var

* replace typing deprecated classes with those from collections.abc

* change registering to inheritance of abstract class

Registering as a virtual class can avoid metaclass conflicts. However, it's not an issue we need to worry about in this codebase, since all concrete classes, e.g. AntismashBGCLoader, inherit from only one base class rather than from multiple classes that have different metaclasses.

Using inheritance from an abstract class is more straightforward and explicit.

* simplify imports

* add type hints for **kwarg

* add overloaded type hints to fix mkdocs warning

* remove base class from package level

* rename logging.py to logger.py to avoid name conflict

* fix bug of logger name

* move NPLINKER_APP_DATA_DIR to defaults to avoid partial initialisation
  • Loading branch information
CunliangGeng authored Jun 20, 2024
1 parent bedebde commit 6c78c20
Show file tree
Hide file tree
Showing 23 changed files with 166 additions and 156 deletions.
49 changes: 3 additions & 46 deletions src/nplinker/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
from pathlib import Path
from .logger import setup_logging
from .nplinker import NPLinker


logging.getLogger(__name__).addHandler(logging.NullHandler())
Expand All @@ -9,48 +10,4 @@
__version__ = "2.0.0-alpha.1"


# The path to the NPLinker application database directory
NPLINKER_APP_DATA_DIR = Path(__file__).parent / "data"
del Path


def setup_logging(level: str = "INFO", file: str = "", use_console: bool = True) -> None:
    """Setup logging configuration for the ancestor logger "nplinker".

    The logger is looked up with ``logging.getLogger(__name__)``. Its level is always set;
    a handler is attached for the log file and/or the console only when requested.

    Args:
        level: The log level, use the logging module's log level constants. Valid levels are:
            "NOTSET", "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL".
        file: The file to write the log to. If the file does not exist, it will be created. The
            log will be written to the file in append mode, encoded as UTF-8. If the file is an
            empty string (by default), the log will not be written to a file.
        use_console: Whether to log to the console.
    """
    # Get the ancestor logger "nplinker"
    logger = logging.getLogger(__name__)
    logger.setLevel(level)

    # No handler requested: the level is set and there is nothing else to do, so the
    # rich imports below are not needed.
    if not file and not use_console:
        return

    from rich.console import Console
    from rich.logging import RichHandler

    # Options shared by the file handler and the console handler.
    handler_options = {
        "omit_repeated_times": False,
        "rich_tracebacks": True,
        "tracebacks_show_locals": True,
        "log_time_format": "[%Y-%m-%d %X]",
    }

    # File handler
    if file:
        # The stream is intentionally left open: the handler keeps writing to it after this
        # function returns. An explicit encoding keeps the log file independent of the
        # platform's locale encoding.
        log_stream = open(file, "a", encoding="utf-8")
        logger.addHandler(
            RichHandler(
                console=Console(file=log_stream, width=120),  # force the line width to 120
                **handler_options,
            )
        )

    # Console handler
    if use_console:
        logger.addHandler(RichHandler(**handler_options))
__all__ = ["NPLinker", "setup_logging"]
11 changes: 8 additions & 3 deletions src/nplinker/arranger.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from __future__ import annotations
import fnmatch
import json
import shutil
from glob import glob
from os import PathLike
from pathlib import Path
from dynaconf import Dynaconf
from jsonschema import validate
Expand Down Expand Up @@ -369,7 +371,7 @@ def arrange_strains_selected(self) -> None:
validate(instance=json_data, schema=USER_STRAINS_SCHEMA)


def validate_gnps(gnps_dir: Path) -> None:
def validate_gnps(gnps_dir: str | PathLike) -> None:
"""Validate the GNPS data directory and its contents.
The GNPS data directory must contain the following files:
Expand All @@ -387,6 +389,7 @@ def validate_gnps(gnps_dir: Path) -> None:
is not found.
ValueError: If both file_mappings.tsv and file_mapping.csv are found.
"""
gnps_dir = Path(gnps_dir)
if not gnps_dir.exists():
raise FileNotFoundError(f"GNPS data directory not found at {gnps_dir}")

Expand Down Expand Up @@ -415,7 +418,7 @@ def validate_gnps(gnps_dir: Path) -> None:
)


def validate_antismash(antismash_dir: Path) -> None:
def validate_antismash(antismash_dir: str | PathLike) -> None:
"""Validate the antiSMASH data directory and its contents.
The validation only checks the structure of the antiSMASH data directory and file names.
Expand All @@ -438,6 +441,7 @@ def validate_antismash(antismash_dir: Path) -> None:
sub-directory.
ValueError: If any sub-directory name contains a space.
"""
antismash_dir = Path(antismash_dir)
if not antismash_dir.exists():
raise FileNotFoundError(f"antiSMASH data directory not found at {antismash_dir}")

Expand All @@ -460,7 +464,7 @@ def validate_antismash(antismash_dir: Path) -> None:
raise FileNotFoundError(f"No BGC files found in antiSMASH sub-directory {sub_dir}")


def validate_bigscape(bigscape_dir: Path, cutoff: str) -> None:
def validate_bigscape(bigscape_dir: str | PathLike, cutoff: str) -> None:
"""Validate the BiG-SCAPE data directory and its contents.
The BiG-SCAPE data directory must exist and contain the clustering file
Expand All @@ -478,6 +482,7 @@ def validate_bigscape(bigscape_dir: Path, cutoff: str) -> None:
Raises:
FileNotFoundError: If the BiG-SCAPE data directory or the clustering file is not found.
"""
bigscape_dir = Path(bigscape_dir)
if not bigscape_dir.exists():
raise FileNotFoundError(f"BiG-SCAPE data directory not found at {bigscape_dir}")

Expand Down
40 changes: 24 additions & 16 deletions src/nplinker/defaults.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,26 @@
STRAIN_MAPPINGS_FILENAME = "strain_mappings.json"
GENOME_BGC_MAPPINGS_FILENAME = "genome_bgc_mappings.json"
GENOME_STATUS_FILENAME = "genome_status.json"
GNPS_SPECTRA_FILENAME = "spectra.mgf"
GNPS_MOLECULAR_FAMILY_FILENAME = "molecular_families.tsv"
GNPS_ANNOTATIONS_FILENAME = "annotations.tsv"
GNPS_FILE_MAPPINGS_TSV = "file_mappings.tsv"
GNPS_FILE_MAPPINGS_CSV = "file_mappings.csv"
STRAINS_SELECTED_FILENAME = "strains_selected.json"
from pathlib import Path
from typing import Final


DOWNLOADS_DIRNAME = "downloads"
MIBIG_DIRNAME = "mibig"
GNPS_DIRNAME = "gnps"
ANTISMASH_DIRNAME = "antismash"
BIGSCAPE_DIRNAME = "bigscape"
BIGSCAPE_RUNNING_OUTPUT_DIRNAME = "bigscape_running_output"
OUTPUT_DIRNAME = "output"
# The path to the NPLinker application database directory
NPLINKER_APP_DATA_DIR: Final = Path(__file__).parent / "data"


STRAIN_MAPPINGS_FILENAME: Final = "strain_mappings.json"
GENOME_BGC_MAPPINGS_FILENAME: Final = "genome_bgc_mappings.json"
GENOME_STATUS_FILENAME: Final = "genome_status.json"
GNPS_SPECTRA_FILENAME: Final = "spectra.mgf"
GNPS_MOLECULAR_FAMILY_FILENAME: Final = "molecular_families.tsv"
GNPS_ANNOTATIONS_FILENAME: Final = "annotations.tsv"
GNPS_FILE_MAPPINGS_TSV: Final = "file_mappings.tsv"
GNPS_FILE_MAPPINGS_CSV: Final = "file_mappings.csv"
STRAINS_SELECTED_FILENAME: Final = "strains_selected.json"


DOWNLOADS_DIRNAME: Final = "downloads"
MIBIG_DIRNAME: Final = "mibig"
GNPS_DIRNAME: Final = "gnps"
ANTISMASH_DIRNAME: Final = "antismash"
BIGSCAPE_DIRNAME: Final = "bigscape"
BIGSCAPE_RUNNING_OUTPUT_DIRNAME: Final = "bigscape_running_output"
OUTPUT_DIRNAME: Final = "output"
6 changes: 4 additions & 2 deletions src/nplinker/genomics/abc.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,22 @@
from __future__ import annotations
from abc import ABC
from abc import abstractmethod
from os import PathLike
from .bgc import BGC
from .gcf import GCF


class BGCLoaderBase(ABC):
"""Abstract base class for BGC loader."""

def __init__(self, data_dir: str) -> None:
def __init__(self, data_dir: str | PathLike) -> None:
"""Initialize the BGC loader.
Args:
data_dir: Path to directory that contains BGC metadata files
(.json) or full data genbank files (.gbk).
"""
self.data_dir = data_dir
self.data_dir = str(data_dir)

@abstractmethod
def get_files(self) -> dict[str, str]:
Expand Down
21 changes: 10 additions & 11 deletions src/nplinker/genomics/antismash/antismash_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
import fnmatch
import logging
import os
from typing import Mapping
from collections.abc import Mapping
from os import PathLike
from pathlib import Path
from Bio import SeqIO
from Bio import SeqRecord
from nplinker.genomics import BGC
Expand All @@ -15,7 +17,7 @@
logger = logging.getLogger(__name__)


class AntismashBGCLoader:
class AntismashBGCLoader(BGCLoaderBase):
"""Build a loader for AntiSMASH BGC genbank (.gbk) files.
Note:
Expand All @@ -32,14 +34,14 @@ class AntismashBGCLoader:
```
"""

def __init__(self, data_dir: str) -> None:
def __init__(self, data_dir: str | PathLike) -> None:
"""Initialize the AntiSMASH BGC loader.
Args:
data_dir: Path to AntiSMASH directory that contains a
collection of AntiSMASH outputs.
"""
self.data_dir = data_dir
self.data_dir = str(data_dir)
self._file_dict = self._parse_data_dir(self.data_dir)
self._bgcs = self._parse_bgcs(self._file_dict)

Expand Down Expand Up @@ -111,7 +113,7 @@ def _parse_bgcs(bgc_files: Mapping[str, str]) -> list[BGC]:
return [parse_bgc_genbank(file) for file in bgc_files.values()]


def parse_bgc_genbank(file: str) -> BGC:
def parse_bgc_genbank(file: str | PathLike) -> BGC:
"""Parse a single BGC gbk file to BGC object.
Args:
Expand All @@ -124,7 +126,8 @@ def parse_bgc_genbank(file: str) -> BGC:
>>> bgc = AntismashBGCLoader.parse_bgc(
... "/data/antismash/GCF_000016425.1/NC_009380.1.region001.gbk")
"""
fname = os.path.splitext(os.path.basename(file))[0]
file = Path(file)
fname = file.stem

record = SeqIO.read(file, format="genbank")
description = record.description # "DEFINITION" in gbk file
Expand All @@ -138,7 +141,7 @@ def parse_bgc_genbank(file: str) -> BGC:
bgc = BGC(fname, *product_prediction)
bgc.description = description
bgc.antismash_id = antismash_id
bgc.antismash_file = file
bgc.antismash_file = str(file)
bgc.antismash_region = features.get("region_number")
bgc.smiles = features.get("smiles")
bgc.strain = Strain(fname)
Expand All @@ -160,7 +163,3 @@ def _parse_antismash_genbank(record: SeqRecord.SeqRecord) -> dict:
smiles = tuple(i.replace(" ", "") for i in smiles)
features["smiles"] = smiles
return features


# register as virtual class to prevent metaclass conflicts
BGCLoaderBase.register(AntismashBGCLoader)
4 changes: 2 additions & 2 deletions src/nplinker/genomics/antismash/podp_antismash_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
import logging
import re
import time
from collections.abc import Mapping
from collections.abc import Sequence
from os import PathLike
from pathlib import Path
from typing import Mapping
from typing import Sequence
import httpx
from bs4 import BeautifulSoup
from bs4 import NavigableString
Expand Down
12 changes: 2 additions & 10 deletions src/nplinker/genomics/bigscape/bigscape_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
logger = logging.getLogger(__name__)


class BigscapeGCFLoader:
class BigscapeGCFLoader(GCFLoaderBase):
"""Build a loader for BiG-SCAPE GCF cluster file.
Attributes:
Expand Down Expand Up @@ -61,11 +61,7 @@ def _parse_gcf(cluster_file: str) -> list[GCF]:
return list(gcf_dict.values())


# register as virtual class to prevent metaclass conflicts
GCFLoaderBase.register(BigscapeGCFLoader)


class BigscapeV2GCFLoader:
class BigscapeV2GCFLoader(GCFLoaderBase):
"""Build a loader for BiG-SCAPE v2 database file.
Attributes:
Expand Down Expand Up @@ -137,7 +133,3 @@ def _parse_gcf(db_file: str) -> list[GCF]:
gcf_dict[family_id].bgc_ids.add(bgc_id)

return list(gcf_dict.values())


# register as virtual class to prevent metaclass conflicts
GCFLoaderBase.register(BigscapeV2GCFLoader)
22 changes: 10 additions & 12 deletions src/nplinker/genomics/mibig/mibig_loader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations
import logging
import os.path
from os import PathLike
from pathlib import Path
from nplinker.strain import Strain
from nplinker.utils import list_files
from ..abc import BGCLoaderBase
Expand All @@ -10,7 +12,7 @@
logger = logging.getLogger(__name__)


class MibigLoader:
class MibigLoader(BGCLoaderBase):
"""Parse MIBiG metadata files and return BGC objects.
MIBiG metadata file (json) contains annotations/metadata information
Expand All @@ -20,13 +22,13 @@ class MibigLoader:
objects have Strain object as their strain attribute (i.e. `BGC.strain`).
"""

def __init__(self, data_dir: str):
def __init__(self, data_dir: str | PathLike):
"""Initialize the MIBiG metadata loader.
Args:
data_dir: Path to the directory of MIBiG metadata json files
"""
self.data_dir = data_dir
self.data_dir = str(data_dir)
self._file_dict = self.parse_data_dir(self.data_dir)
self._metadata_dict = self._parse_metadata()
self._bgcs = self._parse_bgcs()
Expand All @@ -41,7 +43,7 @@ def get_files(self) -> dict[str, str]:
return self._file_dict

@staticmethod
def parse_data_dir(data_dir: str) -> dict[str, str]:
def parse_data_dir(data_dir: str | PathLike) -> dict[str, str]:
"""Parse metadata directory and return paths to all metadata json files.
Args:
Expand All @@ -54,7 +56,7 @@ def parse_data_dir(data_dir: str) -> dict[str, str]:
file_dict = {}
json_files = list_files(data_dir, prefix="BGC", suffix=".json")
for file in json_files:
fname = os.path.splitext(os.path.basename(file))[0]
fname = Path(file).stem
file_dict[fname] = file
return file_dict

Expand Down Expand Up @@ -99,7 +101,7 @@ def _parse_bgcs(self) -> list[BGC]:
return [parse_bgc_metadata_json(file) for file in self._file_dict.values()]


def parse_bgc_metadata_json(file: str) -> BGC:
def parse_bgc_metadata_json(file: str | PathLike) -> BGC:
"""Parse MIBiG metadata file and return BGC object.
Note that the MiBIG accession is used as the BGC id and strain name. The BGC
Expand All @@ -111,12 +113,8 @@ def parse_bgc_metadata_json(file: str) -> BGC:
Returns:
BGC object
"""
metadata = MibigMetadata(file)
metadata = MibigMetadata(str(file))
mibig_bgc = BGC(metadata.mibig_accession, *metadata.biosyn_class)
mibig_bgc.mibig_bgc_class = metadata.biosyn_class
mibig_bgc.strain = Strain(metadata.mibig_accession)
return mibig_bgc


# register as virtual class to prevent metaclass conflicts
BGCLoaderBase.register(MibigLoader)
Loading

0 comments on commit 6c78c20

Please sign in to comment.