Replace the `families` property of the molecular family loaders with
`get_mfs(keep_singleton)`, rename the backing list to `_mfs`, and extend the
loader test to cover both values of `keep_singleton`.

NOTE(review): the `index` blob ids below are carried over from the original
patch and do not reflect the review changes made to the added lines.

diff --git a/src/nplinker/metabolomics/abc.py b/src/nplinker/metabolomics/abc.py
index 97a83424..c1a6fd76 100644
--- a/src/nplinker/metabolomics/abc.py
+++ b/src/nplinker/metabolomics/abc.py
@@ -13,10 +13,18 @@ def spectra(self) -> Sequence[Spectrum]:
 
 
 class MolecularFamilyLoaderBase(ABC):
-    @property
     @abstractmethod
-    def families(self) -> Sequence[MolecularFamily]:
-        ...
+    def get_mfs(self, keep_singleton: bool = False) -> Sequence[MolecularFamily]:
+        """Get MolecularFamily objects.
+
+        Args:
+            keep_singleton(bool): True to keep singleton molecular families. A
+                singleton molecular family is a molecular family that contains
+                only one spectrum.
+
+        Returns:
+            Sequence[MolecularFamily]: a list of MolecularFamily objects.
+        """
 
 
 class FileMappingLoaderBase(ABC):
diff --git a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py
index 125300c2..e2083e68 100644
--- a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py
+++ b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py
@@ -32,21 +32,28 @@ def __init__(self, file: str | PathLike):
-            >>> print(loader.families[0].spectra_ids)
+            >>> print(loader.get_mfs()[0].spectra_ids)
             {'1', '3', '7', ...}
         """
-        self._families: list[MolecularFamily | SingletonFamily] = []
+        self._mfs: list[MolecularFamily | SingletonFamily] = []
         self._file = file
 
         self._validate()
         self._load()
 
-    @property
-    def families(self) -> list[MolecularFamily]:
-        """Get all molecular families.
+    def get_mfs(self, keep_singleton: bool = False) -> list[MolecularFamily]:
+        """Get MolecularFamily objects.
+
+        Args:
+            keep_singleton(bool): True to keep singleton molecular families. A
+                singleton molecular family is a molecular family that contains
+                only one spectrum.
 
         Returns:
-            list[MolecularFamily]: List of all molecular family objects with
-            their spectra ids.
+            list[MolecularFamily]: A list of MolecularFamily objects with their
+                spectra ids.
         """
-        return self._families
+        if keep_singleton:
+            # Copy so callers cannot mutate the loader's internal list.
+            return list(self._mfs)
+        return [mf for mf in self._mfs if not mf.is_singleton()]
 
     def _validate(self):
         """Validate the GNPS molecular family file."""
@@ -93,8 +100,8 @@ def _load(self) -> None:
                 for spectrum_id in spectra_ids:
                     family = SingletonFamily()  ## uuid as family id
                     family.spectra_ids = set([spectrum_id])
-                    self._families.append(family)
+                    self._mfs.append(family)
             else:
                 family = MolecularFamily(family_id)
                 family.spectra_ids = spectra_ids
-                self._families.append(family)
+                self._mfs.append(family)
diff --git a/tests/metabolomics/test_gnps_molecular_family_loader.py b/tests/metabolomics/test_gnps_molecular_family_loader.py
index 0285c70d..9f3ad4f3 100644
--- a/tests/metabolomics/test_gnps_molecular_family_loader.py
+++ b/tests/metabolomics/test_gnps_molecular_family_loader.py
@@ -4,12 +4,22 @@
 
 
 @pytest.mark.parametrize(
-    "workflow, num_families, num_spectra",
-    [(GNPSFormat.SNETS, 25769, 19), (GNPSFormat.SNETSV2, 6902, 10), (GNPSFormat.FBMN, 1105, 5)],
+    "workflow, num_families, num_spectra, keep_singleton",
+    [
+        (GNPSFormat.SNETS, 25769, 19, True),
+        (GNPSFormat.SNETSV2, 6902, 10, True),
+        (GNPSFormat.FBMN, 1105, 5, True),
+        (GNPSFormat.SNETS, 29, 19, False),
+        (GNPSFormat.SNETSV2, 72, 10, False),
+        (GNPSFormat.FBMN, 60, 5, False),
+    ],
 )
-def test_has_molecular_families(workflow, num_families, num_spectra, gnps_mf_files):
+def test_gnps_molecular_family_loader(
+    workflow, num_families, num_spectra, keep_singleton, gnps_mf_files
+):
+    """Test GNPSMolecularFamilyLoader class."""
     loader = GNPSMolecularFamilyLoader(gnps_mf_files[workflow])
-    actual = loader.families
+    actual = loader.get_mfs(keep_singleton=keep_singleton)
     assert len(actual) == num_families
     # test molecular family with id "1" has correct number of spectra ids
     mf = [mf for mf in actual if mf.family_id == "1"][0]