Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix local test error #105

Merged
merged 32 commits into from
Feb 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
321bcf3
add docstring to is_mibig
CunliangGeng Nov 23, 2022
cbc66c3
remove MibigBGC class
CunliangGeng Nov 23, 2022
9268c56
remove parameter strain from BGC class
CunliangGeng Nov 23, 2022
9192ac7
refactor load gcf
CunliangGeng Dec 7, 2022
d0ca040
remove useless fields and methods from GCF class
CunliangGeng Dec 7, 2022
89768b3
remove useless is_hybrid method from BGC class
CunliangGeng Dec 7, 2022
38ebfd2
deprecate aa_prediction method in BGC class
CunliangGeng Dec 7, 2022
fcd3434
remove unused function parse_gbk_header
CunliangGeng Dec 9, 2022
f3837d6
remove unused method set_filename from BGC class
CunliangGeng Dec 9, 2022
c77ac08
remove unused methods and fields from BGC class
CunliangGeng Dec 9, 2022
ef59732
update antismash loader
CunliangGeng Dec 9, 2022
3772b11
move genomics tests to subfolder
CunliangGeng Jan 27, 2023
bafd4f6
update BGC class parameters
CunliangGeng Jan 27, 2023
4c99648
update GCF class parameters
CunliangGeng Jan 27, 2023
136f29f
rename region to antismash_region in BGC class
CunliangGeng Feb 8, 2023
411ad57
remove attribute antismash_accession from BGC class
CunliangGeng Feb 8, 2023
d92bce4
refactor MibigMetadata class
CunliangGeng Feb 8, 2023
ded5bb6
remove parameter description from BGC class
CunliangGeng Feb 8, 2023
590cfc1
add attribute`mibig_bgc_class` to BGC class
CunliangGeng Feb 8, 2023
4c3b97b
add docstring and type hints for BGC attributes
CunliangGeng Feb 8, 2023
383c0f9
update BGC init process in AntismashBGCLoader class
CunliangGeng Feb 10, 2023
aef4700
update docstring of MibigMetadata class
CunliangGeng Feb 10, 2023
a63e394
update antismash loader and add tests
CunliangGeng Feb 10, 2023
f9adbf1
Merge branch 'dev' into refactor_BGC_data_model
CunliangGeng Feb 13, 2023
e5ddc94
fix type hint errors
CunliangGeng Feb 13, 2023
2e54f2e
fix wrong implementations of __hash__ method
CunliangGeng Feb 14, 2023
01613b9
Update GCF class
CunliangGeng Feb 14, 2023
dd719c6
fix parameter error of using GCF
CunliangGeng Feb 14, 2023
5db1668
add unit tests for GCF class
CunliangGeng Feb 14, 2023
5735e47
add strain assignment in BGC loaders
CunliangGeng Feb 15, 2023
2fd2b2b
add attribute id to GCF class and use gcf.id in scoring
CunliangGeng Feb 15, 2023
11d4486
Merge branch 'dev' into fix_local_test_error
CunliangGeng Feb 22, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions notebooks/npclassscore_linking/prospecting/class_linking.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
'''
Initial code for NPClassScore
'''
from collections import Counter
from collections import defaultdict
import glob
import os
import sys
from collections import Counter
from collections import defaultdict
import pandas as pd


Expand Down
2 changes: 2 additions & 0 deletions src/nplinker/genomics/antismash/antismash_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from Bio import SeqRecord
from nplinker.genomics import BGC
from nplinker.logconfig import LogConfig
from nplinker.strains import Strain
from nplinker.utils import list_dirs
from nplinker.utils import list_files
from ..abc import BGCLoaderBase
Expand Down Expand Up @@ -135,6 +136,7 @@ def parse_bgc_genbank(file: str) -> BGC:
bgc.antismash_file = file
bgc.antismash_region = features.get("region_number")
bgc.smiles = features.get("smiles")
bgc.strain = Strain(fname)
return bgc


Expand Down
3 changes: 3 additions & 0 deletions src/nplinker/genomics/gcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ def __init__(self, gcf_id: str) -> None:
self._bgcs: set[BGC] = set()
self.strains: StrainCollection = StrainCollection()
self.bigscape_class: str | None = None
# CG TODO: remove attribute id, see issue 103
# https://github.com/NPLinker/nplinker/issues/103
self.id: int | None = None

def __str__(self):
return f"GCF(id={self.gcf_id}, #bgcs={len(self.bgcs)}, #strains={len(self.strains)})."
Expand Down
1 change: 1 addition & 0 deletions src/nplinker/genomics/genomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from pathlib import Path



logger = LogConfig.getLogger(__name__)

CLUSTER_REGION_REGEX = re.compile('(.+?)\\.(cluster|region)(\\d+).gbk$')
Expand Down
5 changes: 4 additions & 1 deletion src/nplinker/genomics/mibig/mibig_loader.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import os.path
from nplinker.logconfig import LogConfig
from nplinker.strains import Strain
from nplinker.utils import list_files
from ..abc import BGCLoaderBase
from ..bgc import BGC
from .mibig_metadata import MibigMetadata
from ..abc import BGCLoaderBase


logger = LogConfig.getLogger(__name__)

Expand Down Expand Up @@ -108,6 +110,7 @@ def parse_bgc_metadata_json(file: str) -> BGC:
metadata = MibigMetadata(file)
mibig_bgc = BGC(metadata.mibig_accession, metadata.biosyn_class)
mibig_bgc.mibig_bgc_class = metadata.biosyn_class
mibig_bgc.strain = Strain(metadata.mibig_accession)
hechth marked this conversation as resolved.
Show resolved Hide resolved
return mibig_bgc


Expand Down
24 changes: 18 additions & 6 deletions src/nplinker/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,15 @@
from nplinker.class_info.class_matches import ClassMatches
from nplinker.class_info.runcanopus import run_canopus
from nplinker.genomics import load_gcfs
from nplinker.genomics.antismash import AntismashBGCLoader
from nplinker.genomics.mibig import MibigBGCLoader
from nplinker.genomics.mibig import download_and_extract_mibig_metadata
from nplinker.logconfig import LogConfig
from nplinker.metabolomics.metabolomics import load_dataset
from nplinker.pairedomics.downloader import Downloader
from nplinker.pairedomics.runbigscape import run_bigscape
from nplinker.strain_collection import StrainCollection
from nplinker.genomics.antismash import AntismashBGCLoader


try:
from importlib.resources import files
Expand Down Expand Up @@ -181,12 +182,12 @@ def __init__(self, config_data):
self.datadir = files('nplinker').joinpath('data')
self.dataset_id = os.path.split(
self._root)[-1] if not self._remote_loading else self._platform_id

if self._remote_loading:
self._downloader = Downloader(self._platform_id)
else:
self._downloader = None

self.bgcs, self.gcfs, self.spectra, self.molfams = [], [], [], []
self.mibig_bgc_dict = {}
self.product_types = []
Expand Down Expand Up @@ -279,7 +280,7 @@ def _init_genomics_paths(self):
# 11. GEN: <root>/mibig_json / mibig_json_dir=<override>
self.mibig_json_dir = self._overrides.get(
self.OR_MIBIG_JSON) or os.path.join(self._root, 'mibig_json')

def _init_paths(self):
# 1. strain mapping are used for everything else so
self.strain_mappings_file = self._overrides.get(
Expand Down Expand Up @@ -331,7 +332,7 @@ def _validate_paths(self):
logger.warning(
'Optional file/directory "{}" does not exist or is not readable!'
.format(f))

def validate(self):
# check antismash format is recognised
if self._antismash_format not in self.ANTISMASH_FMTS:
Expand Down Expand Up @@ -610,8 +611,13 @@ def _load_genomics(self):
self.mibig_bgc_dict = mibig_bgc_loader.get_bgcs()

# add mibig bgc strains
# CG TODO: update the strain assignment logic,
# see issue 104 https://github.com/NPLinker/nplinker/issues/104
for bgc in self.mibig_bgc_dict.values():
self.strains.add(bgc.strain)
if bgc.strain is not None:
self.strains.add(bgc.strain)
else:
logger.warning("No strain specified for BGC %s", bgc.bgc_id)

logger.debug('mibig_bgc_dict has {} entries'.format(
len(self.mibig_bgc_dict)))
Expand All @@ -635,6 +641,12 @@ def _load_genomics(self):
antismash_bgc_loader.get_files(),
self._bigscape_cutoff)

# CG TODO: remove the gcf.id, see issue 103
# https://github.com/NPLinker/nplinker/issues/103
# This is the only place where the gcf.id value is set.
for i, gcf in enumerate(self.gcfs):
gcf.id = i

#----------------------------------------------------------------------
# CG: write unknown strains in genomics to file
#----------------------------------------------------------------------
Expand Down
4 changes: 2 additions & 2 deletions src/nplinker/scoring/linking/data_linking.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def _collect_mappings_from_spectra(self, spectra) -> np.ndarray[np.float64]:
mapping_spec[i, 2] = spectrum.family.family_id

return mapping_spec

def _collect_mappings_from_molecular_families(self, molfams: Sequence[MolecularFamily]) -> np.ndarray[np.float64]:
num_spectra = sum(len(x.spectra_ids) for x in molfams)
mapping_spec = np.zeros((num_spectra, 3))
Expand All @@ -143,7 +143,7 @@ def _collect_mappings_from_molecular_families(self, molfams: Sequence[MolecularF
for i, key in enumerate(inverted_mappings):
mapping_spec[i, 1] = key
mapping_spec[i, 2] = inverted_mappings[key]

return mapping_spec

def collect_mappings_gcf(self, gcf_list):
Expand Down
8 changes: 6 additions & 2 deletions src/nplinker/scoring/linking/link_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
from .data_linking_functions import pair_prob_approx
from .data_linking_functions import pair_prob_hg


# CG: TODO get_links function no longer works; its logic needs to be updated


# CG: TODO get_links function no longer works; its logic needs to be updated


Expand Down Expand Up @@ -442,8 +446,8 @@ def get_links(self,
link_levels = [0, 1]

# Get necessary ids
# CG: TODO update the logics here
# integer gcf.id has been removed, use string gcf.gcf_id instead.
# CG: TODO update the logic here:
# don't use integer gcf.id, use string gcf.gcf_id instead.
input_ids = np.array([gcf.id for gcf in input_object],
dtype=np.int32)
hechth marked this conversation as resolved.
Show resolved Hide resolved

Expand Down