From 93b17eb8162d41843465054b51ee439766cacb30 Mon Sep 17 00:00:00 2001 From: kierandidi Date: Wed, 29 May 2024 14:55:48 +0100 Subject: [PATCH 1/7] [feat] add uniprot ecnumber and cath label options to pdb manager --- graphein/ml/datasets/pdb_data.py | 218 ++++++++++++++++++++++++++++++- graphein/protein/utils.py | 12 +- 2 files changed, 223 insertions(+), 7 deletions(-) diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py index 4ba8fbe7..e1482af4 100644 --- a/graphein/ml/datasets/pdb_data.py +++ b/graphein/ml/datasets/pdb_data.py @@ -36,13 +36,14 @@ def __init__( split_ratios: Optional[List[float]] = None, split_time_frames: Optional[List[np.datetime64]] = None, assign_leftover_rows_to_split_n: int = 0, + labels: Optional[List[str]] = None, ): """Instantiate a selection of experimental PDB structures. :param root_dir: The directory in which to store all PDB entries, defaults to ``"."``. :type root_dir: str, optional - :param structure_format: Whether to use ``.pdb`` or ``.mmtf`` file. + :param structure_format: Whether to use ``.pdb``, ``.mmtf`` or ``mmcif`` file. Defaults to ``"pdb"``. :type structure_format: str, optional :param splits: A list of names corresponding to each dataset split, @@ -58,6 +59,9 @@ def __init__( to assign any rows remaining after creation of new dataset splits, defaults to ``0``. :type assign_leftover_rows_to_split_n: int, optional + :param labels: A list of names corresponding to metadata labels that should be included in PDB manager dataframe, + defaults to ``None``. + :type labels: Optional[List[str]], optional """ # Arguments self.root_dir = Path(root_dir) @@ -83,6 +87,14 @@ def __init__( ) self.pdb_availability_url = "https://files.wwpdb.org/pub/pdb/compatible/pdb_bundle/pdb_bundle_index.txt" + self.pdb_chain_cath_uniprot_url = "https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_cath_uniprot.tsv.gz" + + self.cath_id_cath_code_url = "http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-all.gz" + + self.pdb_chain_ec_number_url = "https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz" + + + self.pdb_dir = self.root_dir / "pdb" if not os.path.exists(self.pdb_dir): os.makedirs(self.pdb_dir) @@ -99,12 +111,15 @@ def __init__( self.pdb_deposition_date_url ).name self.pdb_availability_filename = Path(self.pdb_availability_url).name + self.pdb_chain_cath_uniprot_filename = Path(self.pdb_chain_cath_uniprot_url).name + self.cath_id_cath_code_filename = Path(self.cath_id_cath_code_url).name + self.pdb_chain_ec_number_filename = Path(self.pdb_chain_ec_number_url).name self.list_columns = ["ligands"] # Data self.download_metadata() - self.df = self.parse() + self.df = self.parse(labels) self.source = self.df.copy() # Splits @@ -146,6 +161,9 @@ def download_metadata(self): self._download_entry_metadata() self._download_exp_type() self._download_pdb_availability() + self._download_pdb_chain_cath_uniprot_map() + self._download_cath_id_cath_code_map() + self._download_pdb_chain_ec_number_map() def get_unavailable_pdb_files( self, splits: Optional[List[str]] = None @@ -410,6 +428,33 @@ def _download_pdb_availability(self): log.info("Downloading PDB availability map...") wget.download(self.pdb_availability_url, out=str(self.root_dir)) log.debug("Downloaded PDB availability map") + + def _download_pdb_chain_cath_uniprot_map(self): + """Download mapping from PDB chain to uniprot accession and CATH ID from + 
+        https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_cath_uniprot.tsv.gz
+        """
+        if not os.path.exists(self.root_dir / self.pdb_chain_cath_uniprot_filename):
+            log.info("Downloading Uniprot CATH map...")
+            wget.download(self.pdb_chain_cath_uniprot_url, out=str(self.root_dir))
+            log.debug("Downloaded Uniprot CATH map")
+
+    def _download_cath_id_cath_code_map(self):
+        """Download mapping from CATH IDs to CATH code from
+        http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-all.gz
+        """
+        if not os.path.exists(self.root_dir / self.cath_id_cath_code_filename):
+            log.info("Downloading CATH ID to CATH code map...")
+            wget.download(self.cath_id_cath_code_url, out=str(self.root_dir))
+            log.debug("Downloaded CATH ID to CATH code map")
+
+    def _download_pdb_chain_ec_number_map(self):
+        """Download mapping from PDB chains to EC number from
+        https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz
+        """
+        if not os.path.exists(self.root_dir / self.pdb_chain_ec_number_filename):
+            log.info("Downloading EC number map...")
+            wget.download(self.pdb_chain_ec_number_url, out=str(self.root_dir))
+            log.debug("Downloaded EC number map")

     def _parse_ligand_map(self) -> Dict[str, List[str]]:
         """Parse the ligand maps for all PDB records.
@@ -536,9 +581,90 @@ def _parse_pdb_availability(self) -> Dict[str, bool]:
         ids = {id: False for id in ids}
         return ids

-    def parse(self) -> pd.DataFrame:
+    def _parse_uniprot_id(self) -> Dict[str, str]:
+        """Parse the UniProt ID for all PDB chains.
+
+        :return: Dictionary of PDB chain ID with their
+            corresponding UniProt ID.
+        :rtype: Dict[str, str]
+        """
+        uniprot_mapping = {}
+        with gzip.open(self.root_dir / self.pdb_chain_cath_uniprot_filename, 'rt') as f:
+            for line in f:
+                try:
+                    pdb, chain, uniprot_id, cath_id = line.strip().split('\t')
+                    key = f"{pdb}_{chain}"
+                    uniprot_mapping[key] = uniprot_id
+                except ValueError:
+                    continue
+        return uniprot_mapping
+
+    def _parse_cath_id(self) -> Dict[str, str]:
+        """Parse the CATH ID for all PDB chains.
+
+        :return: Dictionary of PDB chain ID with their
+            corresponding CATH ID.
+        :rtype: Dict[str, str]
+        """
+        cath_mapping = {}
+        with gzip.open(self.root_dir / self.pdb_chain_cath_uniprot_filename, 'rt') as f:
+            next(f) # Skip header line
+            for line in f:
+                try:
+                    pdb, chain, uniprot_id, cath_id = line.strip().split('\t')
+                    key = f"{pdb}_{chain}"
+                    cath_mapping[key] = cath_id
+                except ValueError:
+                    continue
+        return cath_mapping
+
+    def _parse_cath_code(self) -> Dict[str, str]:
+        """Parse the CATH code for all CATH IDs.
+
+        :return: Dictionary of CATH ID with their
+            corresponding CATH code.
+        :rtype: Dict[str, str]
+        """
+        cath_mapping = {}
+        with gzip.open(self.root_dir / self.cath_id_cath_code_filename, 'rt') as f:
+            for line in f:
+                try:
+                    cath_id, cath_version, cath_code, cath_segment = line.strip().split()
+                    cath_mapping[cath_id] = cath_code
+                except ValueError:
+                    continue
+        return cath_mapping
+
+    def _parse_ec_number(self) -> Dict[str, str]:
+        """Parse the EC number for all PDB chains, storing ``None`` when no EC number is present.
+
+        :return: Dictionary of PDB chain ID with their
+            corresponding EC number.
+        :rtype: Dict[str, str]
+        """
+        ec_mapping = {}
+        with gzip.open(self.root_dir / self.pdb_chain_ec_number_filename, 'rt') as f:
+            next(f) # Skip header line
+            for line in f:
+                try:
+                    pdb, chain, uniprot_id, ec_number = line.strip().split('\t')
+                    key = f"{pdb}_{chain}"
+                    ec_number = None if ec_number == '?'
else ec_number + ec_mapping[key] = ec_number + except ValueError: + continue + return ec_mapping + + def parse(self, labels: List[str]) -> pd.DataFrame: """Parse all PDB sequence records. + :param labels: A list of names corresponding to metadata labels that should be included in PDB manager dataframe, + defaults to ``None``. + :type labels: Optional[List[str]], optional + :return: DataFrame containing PDB sequence entries with their corresponding metadata. :rtype: pd.DataFrame @@ -579,6 +705,14 @@ def parse(self) -> pd.DataFrame: df["experiment_type"] = df.pdb.map(self._parse_experiment_type()) df["pdb_file_available"] = df.pdb.map(self._parse_pdb_availability()) df.pdb_file_available.fillna(True, inplace=True) + if labels: + if "uniprot_id" in labels: + df["uniprot_id"] = df.id.map(self._parse_uniprot_id()) + if "cath_code" in labels: + df["cath_id"] = df.id.map(self._parse_cath_id()) + df["cath_code"] = df.cath_id.map(self._parse_cath_code()) + if "ec_number" in labels: + df["ec_number"] = df.id.map(self._parse_ec_number()) return df @@ -1150,6 +1284,82 @@ def select_complexes_with_grouped_molecule_types( if update: self.df = df + def has_uniprot_id( + self, + splits: Optional[List[str]] = None, + update: bool = False, + ) -> pd.DataFrame: + """ + Select entries that have a uniprot ID. + + :param splits: Names of splits for which to perform the operation, + defaults to ``None``. + :type splits: Optional[List[str]], optional + :param update: Whether to modify the DataFrame in place, defaults to + ``False``. + :type update: bool, optional + + :return: DataFrame of selected molecules. + :rtype: pd.DataFrame + """ + splits_df = self.get_splits(splits) + df = splits_df.dropna(subset=['uniprot_id']) + + if update: + self.df = df + return df + + + def has_cath_code( + self, + splits: Optional[List[str]] = None, + update: bool = False, + ) -> pd.DataFrame: + """ + Select entries that have a cath code. + + :param splits: Names of splits for which to perform the operation, + defaults to ``None``. + :type splits: Optional[List[str]], optional + :param update: Whether to modify the DataFrame in place, defaults to + ``False``. + :type update: bool, optional + + :return: DataFrame of selected molecules. + :rtype: pd.DataFrame + """ + splits_df = self.get_splits(splits) + df = splits_df.dropna(subset=['cath_code']) + + if update: + self.df = df + return df + + def has_ec_number( + self, + splits: Optional[List[str]] = None, + update: bool = False, + ) -> pd.DataFrame: + """ + Select entries that have an EC number. + + :param splits: Names of splits for which to perform the operation, + defaults to ``None``. + :type splits: Optional[List[str]], optional + :param update: Whether to modify the DataFrame in place, defaults to + ``False``. + :type update: bool, optional + + :return: DataFrame of selected molecules. + :rtype: pd.DataFrame + """ + splits_df = self.get_splits(splits) + df = splits_df.dropna(subset=['ec_number']) + + if update: + self.df = df + return df + def split_df_proportionally( self, df: pd.DataFrame, @@ -1572,7 +1782,7 @@ def download_pdbs( :param out_dir: Output directory, defaults to ``"."`` :type out_dir: str, optional - :param format: Filetype to download. ``pdb`` or ``mmtf``. + :param format: Filetype to download. ``pdb``, ``mmtf``, ``mmcif`` or ``bcif``. :type format: str :param splits: Names of splits for which to perform the operation, defaults to ``None``. 
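Taken together, the pdb_data.py changes above wire the three SIFTS/CATH metadata maps into the manager. A minimal usage sketch, not part of the patch; it assumes the class is the existing `PDBManager` exported from `graphein.ml.datasets`, and that the label strings follow the literals used above:

    from graphein.ml.datasets import PDBManager

    # Request the new metadata columns at construction time; each label
    # triggers the corresponding download/parse step defined above.
    manager = PDBManager(
        root_dir=".",
        labels=["uniprot_id", "cath_code", "ec_number"],
    )

    # Keep only chains that carry an EC annotation; with update=False a
    # filtered copy is returned and manager.df is left untouched.
    enzymes = manager.has_ec_number()
    print(enzymes[["id", "uniprot_id", "cath_code", "ec_number"]].head())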
diff --git a/graphein/protein/utils.py b/graphein/protein/utils.py
index 6c9b76e8..f69a0684 100644
--- a/graphein/protein/utils.py
+++ b/graphein/protein/utils.py
@@ -108,7 +108,7 @@ def download_pdb_multiprocessing(
     :type pdb_codes: List[str]
     :param out_dir: Path to directory to download PDB structures to.
     :type out_dir: Union[str, Path]
-    :param format: Filetype to download. ``pdb`` or ``mmtf``.
+    :param format: Filetype to download. ``pdb``, ``mmtf``, ``mmcif`` or ``bcif``.
     :type format: str
     :param overwrite: Whether to overwrite existing files, defaults to
         ``False``.
@@ -162,7 +162,7 @@ def download_pdb(
     :param out_dir: Path to directory to download PDB structure to. If
         ``None``, will download to a temporary directory.
     :type out_dir: Optional[Union[str, Path]]
-    :param format: Filetype to download. ``pdb`` or ``mmtf``.
+    :param format: Filetype to download. ``pdb``, ``mmtf``, ``mmcif`` or ``bcif``.
     :type format: str
     :param check_obsolete: Whether to check for obsolete PDB codes,
         defaults to ``False``. If an obsolete PDB code is found, the updated PDB
@@ -183,8 +183,14 @@ def download_pdb(
     elif format == "mmtf":
         BASE_URL = "https://mmtf.rcsb.org/v1.0/full/"
         extension = ".mmtf.gz"
+    elif format == "mmcif":
+        BASE_URL = "https://files.rcsb.org/download/"
+        extension = ".cif.gz"
+    elif format == "bcif":
+        BASE_URL = "https://models.rcsb.org/"
+        extension = ".bcif.gz"
     else:
-        raise ValueError(f"Invalid format: {format}. Must be 'pdb' or 'mmtf'.")
+        raise ValueError(f"Invalid format: {format}. Must be 'pdb', 'mmtf', 'mmcif' or 'bcif'.")

     # Make output directory if it doesn't exist or set it to tempdir if None
     if out_dir is not None:

From c4aea57eebafb125795cd7d70889c9abe549fe5a Mon Sep 17 00:00:00 2001
From: kierandidi
Date: Wed, 29 May 2024 15:00:32 +0100
Subject: [PATCH 2/7] [doc] added to changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e172426f..d0baa19e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@
 * Fix cluster file loading bug in `pdb_data.py` [#396](https://github.com/a-r-j/graphein/pull/396)

 #### Misc
+* add metadata options for UniProt ID, EC number and CATH code to the PDB manager [#398](https://github.com/a-r-j/graphein/pull/398)
 * bumped logging level down from `INFO` to `DEBUG` at several places to reduced output length [#391](https://github.com/a-r-j/graphein/pull/391)
 * exposed `fill_value` and `bfactor` option to `protein_to_pyg` function. [#385](https://github.com/a-r-j/graphein/pull/385) and [#388](https://github.com/a-r-j/graphein/pull/388)
 * Updated Foldcomp datasets with improved setup function and updated database choices such as ESMAtlas.
[#382](https://github.com/a-r-j/graphein/pull/382) From 0ca75eb0a4118a2800891e7fd6670e76f99edc0c Mon Sep 17 00:00:00 2001 From: kierandidi Date: Wed, 29 May 2024 23:12:32 +0100 Subject: [PATCH 3/7] [fix] address pandas warnings --- graphein/ml/datasets/pdb_data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py index e1482af4..71ebe3df 100644 --- a/graphein/ml/datasets/pdb_data.py +++ b/graphein/ml/datasets/pdb_data.py @@ -553,7 +553,7 @@ def _parse_entries(self) -> Dict[str, datetime]: df.dropna(subset=["id"], inplace=True) df.id = df.id.str.lower() - df.date = pd.to_datetime(df.date) + df.date = pd.to_datetime(df.date, format = "%m/%d/%y") return pd.Series(df["date"].values, index=df["id"]).to_dict() def _parse_experiment_type(self) -> Dict[str, str]: @@ -704,7 +704,7 @@ def parse(self, labels: List[str]) -> pd.DataFrame: df["deposition_date"] = df.pdb.map(self._parse_entries()) df["experiment_type"] = df.pdb.map(self._parse_experiment_type()) df["pdb_file_available"] = df.pdb.map(self._parse_pdb_availability()) - df.pdb_file_available.fillna(True, inplace=True) + df["pdb_file_available"] = df["pdb_file_available"].fillna(True) if labels: if "uniprot_id" in labels: df["uniprot_id"] = df.id.map(self._parse_uniprot_id()) @@ -1771,8 +1771,8 @@ def reset(self) -> pd.DataFrame: def download_pdbs( self, - out_dir=".", - format="pdb", + out_dir: str = ".", + format: str = "pdb", splits: Optional[List[str]] = None, overwrite: bool = False, max_workers: int = 8, From 680d16a3cd783e1c0b32cdd842e95becef961ae9 Mon Sep 17 00:00:00 2001 From: kierandidi Date: Thu, 30 May 2024 14:24:45 +0100 Subject: [PATCH 4/7] [feat] improve type hinting and add subselecting based on labels --- graphein/ml/datasets/pdb_data.py | 33 ++++++++++++++++++++++++++++---- graphein/protein/tensor/io.py | 2 +- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py index 71ebe3df..8567fb25 100644 --- a/graphein/ml/datasets/pdb_data.py +++ b/graphein/ml/datasets/pdb_data.py @@ -6,7 +6,7 @@ from datetime import datetime from io import StringIO from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union, Literal import numpy as np import pandas as pd @@ -36,7 +36,7 @@ def __init__( split_ratios: Optional[List[float]] = None, split_time_frames: Optional[List[np.datetime64]] = None, assign_leftover_rows_to_split_n: int = 0, - labels: Optional[List[str]] = None, + labels: Optional[List[Literal["uniprot_id", "cath_code", "ec_number"]]] = None, ): """Instantiate a selection of experimental PDB structures. @@ -61,7 +61,7 @@ def __init__( :type assign_leftover_rows_to_split_n: int, optional :param labels: A list of names corresponding to metadata labels that should be included in PDB manager dataframe, defaults to ``None``. - :type labels: Optional[List[str]], optional + :type labels: Optional[List[Literal["uniprot_id", "cath_code", "ec_number"]]], optional """ # Arguments self.root_dir = Path(root_dir) @@ -658,7 +658,7 @@ def _parse_ec_number(self) -> Dict[str, str]: continue return ec_mapping - def parse(self, labels: List[str]) -> pd.DataFrame: + def parse(self, labels: Optional[List[Literal["uniprot_id", "cath_code", "ec_number"]]] = None) -> pd.DataFrame: """Parse all PDB sequence records. 
     :param labels: A list of names corresponding to metadata labels that should be included in PDB manager dataframe,
@@ -1286,12 +1286,17 @@ def select_complexes_with_grouped_molecule_types(

     def has_uniprot_id(
         self,
+        select_ids: Optional[List[str]] = None,
         splits: Optional[List[str]] = None,
         update: bool = False,
     ) -> pd.DataFrame:
         """
         Select entries that have a uniprot ID.

+        :param select_ids: If present, filter for only these IDs. If not present, filter for entries
+            that have any UniProt ID, defaults to ``None``.
+        :type select_ids: Optional[List[str]], optional
         :param splits: Names of splits for which to perform the operation,
             defaults to ``None``.
         :type splits: Optional[List[str]], optional
         :param update: Whether to modify the DataFrame in place, defaults to
             ``False``.
         :type update: bool, optional

         :return: DataFrame of selected molecules.
         :rtype: pd.DataFrame
         """
         splits_df = self.get_splits(splits)
         df = splits_df.dropna(subset=['uniprot_id'])

+        if select_ids:
+            df = df[df['uniprot_id'].isin(select_ids)]
+
         if update:
             self.df = df
         return df

     def has_cath_code(
         self,
+        select_ids: Optional[List[str]] = None,
         splits: Optional[List[str]] = None,
         update: bool = False,
     ) -> pd.DataFrame:
         """
         Select entries that have a cath code.

+        :param select_ids: If present, filter for only these CATH codes. If not present, filter for entries
+            that have any CATH code, defaults to ``None``.
+        :type select_ids: Optional[List[str]], optional
         :param splits: Names of splits for which to perform the operation,
             defaults to ``None``.
         :type splits: Optional[List[str]], optional
         :param update: Whether to modify the DataFrame in place, defaults to
             ``False``.
         :type update: bool, optional

         :return: DataFrame of selected molecules.
         :rtype: pd.DataFrame
         """
         splits_df = self.get_splits(splits)
         df = splits_df.dropna(subset=['cath_code'])

+        if select_ids:
+            df = df[df['cath_code'].isin(select_ids)]
+
+
         if update:
             self.df = df
         return df

     def has_ec_number(
         self,
+        select_ids: Optional[List[str]] = None,
         splits: Optional[List[str]] = None,
         update: bool = False,
     ) -> pd.DataFrame:
         """
         Select entries that have an EC number.

+        :param select_ids: If present, filter for only these EC numbers. If not present, filter for entries
+            that have any EC number, defaults to ``None``.
+        :type select_ids: Optional[List[str]], optional
         :param splits: Names of splits for which to perform the operation,
             defaults to ``None``.
:type splits: Optional[List[str]], optional @@ -1356,6 +1378,9 @@ def has_ec_number( splits_df = self.get_splits(splits) df = splits_df.dropna(subset=['ec_number']) + if select_ids: + df = df[df['ec_number'].isin(select_ids)] + if update: self.df = df return df diff --git a/graphein/protein/tensor/io.py b/graphein/protein/tensor/io.py index dc7698bd..cdd2714a 100644 --- a/graphein/protein/tensor/io.py +++ b/graphein/protein/tensor/io.py @@ -349,7 +349,7 @@ def protein_df_to_tensor( """ num_residues = get_protein_length(df, insertions=insertions) df = df.loc[df["atom_name"].isin(atoms_to_keep)] - residue_indices = pd.factorize(get_residue_id(df, unique=False))[0] + residue_indices = pd.factorize(pd.Series(get_residue_id(df, unique=False)))[0] atom_indices = df["atom_name"].map(lambda x: atoms_to_keep.index(x)).values positions: AtomTensor = ( From 158b3aa46c650b8d76e5156462141f2437bd2e22 Mon Sep 17 00:00:00 2001 From: kierandidi Date: Thu, 30 May 2024 15:06:48 +0100 Subject: [PATCH 5/7] [fix] pin setuptools in setup.py --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 89553885..c69b44c3 100644 --- a/setup.py +++ b/setup.py @@ -156,6 +156,7 @@ def run(self): install_requires=INSTALL_REQUIRES, extras_require=EXTRA_REQUIRES, python_requires=">=3.7", + setup_requires=['setuptools==69.5.1'], license="MIT", platforms="any", classifiers=[ From c1d8c4a2ccea8ba88bdabe077fc03aae079e9654 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 30 May 2024 14:07:16 +0000 Subject: [PATCH 6/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- graphein/ml/datasets/pdb_data.py | 95 ++++++++++++++++++++------------ graphein/protein/tensor/io.py | 4 +- graphein/protein/utils.py | 4 +- setup.py | 2 +- 4 files changed, 67 insertions(+), 38 deletions(-) diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py index 8567fb25..6dbb4fc4 100644 --- a/graphein/ml/datasets/pdb_data.py +++ b/graphein/ml/datasets/pdb_data.py @@ -6,7 +6,7 @@ from datetime import datetime from io import StringIO from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union, Literal +from typing import Any, Callable, Dict, List, Literal, Optional, Union import numpy as np import pandas as pd @@ -36,7 +36,9 @@ def __init__( split_ratios: Optional[List[float]] = None, split_time_frames: Optional[List[np.datetime64]] = None, assign_leftover_rows_to_split_n: int = 0, - labels: Optional[List[Literal["uniprot_id", "cath_code", "ec_number"]]] = None, + labels: Optional[ + List[Literal["uniprot_id", "cath_code", "ec_number"]] + ] = None, ): """Instantiate a selection of experimental PDB structures. 
@@ -93,8 +95,6 @@ def __init__( self.pdb_chain_ec_number_url = "https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz" - - self.pdb_dir = self.root_dir / "pdb" if not os.path.exists(self.pdb_dir): os.makedirs(self.pdb_dir) @@ -111,9 +111,13 @@ def __init__( self.pdb_deposition_date_url ).name self.pdb_availability_filename = Path(self.pdb_availability_url).name - self.pdb_chain_cath_uniprot_filename = Path(self.pdb_chain_cath_uniprot_url).name + self.pdb_chain_cath_uniprot_filename = Path( + self.pdb_chain_cath_uniprot_url + ).name self.cath_id_cath_code_filename = Path(self.cath_id_cath_code_url).name - self.pdb_chain_ec_number_filename = Path(self.pdb_chain_ec_number_url).name + self.pdb_chain_ec_number_filename = Path( + self.pdb_chain_ec_number_url + ).name self.list_columns = ["ligands"] @@ -428,16 +432,20 @@ def _download_pdb_availability(self): log.info("Downloading PDB availability map...") wget.download(self.pdb_availability_url, out=str(self.root_dir)) log.debug("Downloaded PDB availability map") - + def _download_pdb_chain_cath_uniprot_map(self): """Download mapping from PDB chain to uniprot accession and CATH ID from https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_cath_uniprot.tsv.gz """ - if not os.path.exists(self.root_dir / self.pdb_chain_cath_uniprot_filename): + if not os.path.exists( + self.root_dir / self.pdb_chain_cath_uniprot_filename + ): log.info("Downloading Uniprot CATH map...") - wget.download(self.pdb_chain_cath_uniprot_url, out=str(self.root_dir)) + wget.download( + self.pdb_chain_cath_uniprot_url, out=str(self.root_dir) + ) log.debug("Downloaded Uniprot CATH map") - + def _download_cath_id_cath_code_map(self): """Download mapping from CATH IDs to CATH code from http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-all.gz @@ -451,7 +459,9 @@ def _download_pdb_chain_ec_number_map(self): """Download mapping from PDB chains to EC number from https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz """ - if not os.path.exists(self.root_dir / self.pdb_chain_ec_number_filename): + if not os.path.exists( + self.root_dir / self.pdb_chain_ec_number_filename + ): log.info("Downloading EC number map...") wget.download(self.pdb_chain_ec_number_url, out=str(self.root_dir)) log.debug("Downloaded EC number map") @@ -553,7 +563,7 @@ def _parse_entries(self) -> Dict[str, datetime]: df.dropna(subset=["id"], inplace=True) df.id = df.id.str.lower() - df.date = pd.to_datetime(df.date, format = "%m/%d/%y") + df.date = pd.to_datetime(df.date, format="%m/%d/%y") return pd.Series(df["date"].values, index=df["id"]).to_dict() def _parse_experiment_type(self) -> Dict[str, str]: @@ -589,16 +599,18 @@ def _parse_uniprot_id(self) -> Dict[str, str]: :rtype: Dict[str, str] """ uniprot_mapping = {} - with gzip.open(self.root_dir / self.pdb_chain_cath_uniprot_filename, 'rt') as f: + with gzip.open( + self.root_dir / self.pdb_chain_cath_uniprot_filename, "rt" + ) as f: for line in f: try: - pdb, chain, uniprot_id, cath_id = line.strip().split('\t') + pdb, chain, uniprot_id, cath_id = line.strip().split("\t") key = f"{pdb}_{chain}" uniprot_mapping[key] = uniprot_id except ValueError: continue return uniprot_mapping - + def _parse_cath_id(self) -> Dict[str, str]: """Parse the CATH ID for all PDB chains. 
@@ -607,17 +619,19 @@ def _parse_cath_id(self) -> Dict[str, str]:
         :rtype: Dict[str, str]
         """
         cath_mapping = {}
-        with gzip.open(self.root_dir / self.pdb_chain_cath_uniprot_filename, 'rt') as f:
-            next(f) # Skip header line
+        with gzip.open(
+            self.root_dir / self.pdb_chain_cath_uniprot_filename, "rt"
+        ) as f:
+            next(f)  # Skip header line
             for line in f:
                 try:
-                    pdb, chain, uniprot_id, cath_id = line.strip().split('\t')
+                    pdb, chain, uniprot_id, cath_id = line.strip().split("\t")
                     key = f"{pdb}_{chain}"
                     cath_mapping[key] = cath_id
                 except ValueError:
                     continue
         return cath_mapping
-
+
     def _parse_cath_code(self) -> Dict[str, str]:
         """Parse the CATH code for all CATH IDs.
@@ -626,18 +640,22 @@ def _parse_cath_code(self) -> Dict[str, str]:
         :rtype: Dict[str, str]
         """
         cath_mapping = {}
-        with gzip.open(self.root_dir / self.cath_id_cath_code_filename, 'rt') as f:
+        with gzip.open(
+            self.root_dir / self.cath_id_cath_code_filename, "rt"
+        ) as f:
             for line in f:
                 try:
-                    cath_id, cath_version, cath_code, cath_segment = line.strip().split()
+                    cath_id, cath_version, cath_code, cath_segment = (
+                        line.strip().split()
+                    )
                     cath_mapping[cath_id] = cath_code
                 except ValueError:
                     continue
         return cath_mapping
-
+
     def _parse_ec_number(self) -> Dict[str, str]:
         """Parse the EC number for all PDB chains, storing ``None`` when no EC number is present.
@@ -646,19 +664,28 @@ def _parse_ec_number(self) -> Dict[str, str]:
         :rtype: Dict[str, str]
         """
         ec_mapping = {}
-        with gzip.open(self.root_dir / self.pdb_chain_ec_number_filename, 'rt') as f:
-            next(f) # Skip header line
+        with gzip.open(
+            self.root_dir / self.pdb_chain_ec_number_filename, "rt"
+        ) as f:
+            next(f)  # Skip header line
             for line in f:
                 try:
-                    pdb, chain, uniprot_id, ec_number = line.strip().split('\t')
+                    pdb, chain, uniprot_id, ec_number = line.strip().split(
+                        "\t"
+                    )
                     key = f"{pdb}_{chain}"
                     ec_number = None if ec_number == "?" else ec_number
                     ec_mapping[key] = ec_number
                 except ValueError:
                     continue
         return ec_mapping

-    def parse(self, labels: Optional[List[Literal["uniprot_id", "cath_code", "ec_number"]]] = None) -> pd.DataFrame:
+    def parse(
+        self,
+        labels: Optional[
+            List[Literal["uniprot_id", "cath_code", "ec_number"]]
+        ] = None,
+    ) -> pd.DataFrame:
         """Parse all PDB sequence records.
:param labels: A list of names corresponding to metadata labels that should be included in PDB manager dataframe, @@ -1308,16 +1335,15 @@ def has_uniprot_id( :rtype: pd.DataFrame """ splits_df = self.get_splits(splits) - df = splits_df.dropna(subset=['uniprot_id']) + df = splits_df.dropna(subset=["uniprot_id"]) if select_ids: - df = df[df['uniprot_id'].isin(select_ids)] + df = df[df["uniprot_id"].isin(select_ids)] if update: self.df = df return df - def has_cath_code( self, select_ids: Optional[List[str]] = None, @@ -1342,11 +1368,10 @@ def has_cath_code( :rtype: pd.DataFrame """ splits_df = self.get_splits(splits) - df = splits_df.dropna(subset=['cath_code']) + df = splits_df.dropna(subset=["cath_code"]) if select_ids: - df = df[df['cath_code'].isin(select_ids)] - + df = df[df["cath_code"].isin(select_ids)] if update: self.df = df @@ -1376,10 +1401,10 @@ def has_ec_number( :rtype: pd.DataFrame """ splits_df = self.get_splits(splits) - df = splits_df.dropna(subset=['ec_number']) + df = splits_df.dropna(subset=["ec_number"]) if select_ids: - df = df[df['ec_number'].isin(select_ids)] + df = df[df["ec_number"].isin(select_ids)] if update: self.df = df diff --git a/graphein/protein/tensor/io.py b/graphein/protein/tensor/io.py index cdd2714a..58089158 100644 --- a/graphein/protein/tensor/io.py +++ b/graphein/protein/tensor/io.py @@ -349,7 +349,9 @@ def protein_df_to_tensor( """ num_residues = get_protein_length(df, insertions=insertions) df = df.loc[df["atom_name"].isin(atoms_to_keep)] - residue_indices = pd.factorize(pd.Series(get_residue_id(df, unique=False)))[0] + residue_indices = pd.factorize( + pd.Series(get_residue_id(df, unique=False)) + )[0] atom_indices = df["atom_name"].map(lambda x: atoms_to_keep.index(x)).values positions: AtomTensor = ( diff --git a/graphein/protein/utils.py b/graphein/protein/utils.py index f69a0684..c16669f1 100644 --- a/graphein/protein/utils.py +++ b/graphein/protein/utils.py @@ -190,7 +190,9 @@ def download_pdb( BASE_URL = "https://models.rcsb.org/" extension = ".bcif.gz" else: - raise ValueError(f"Invalid format: {format}. Must be 'pdb', 'mmtf', 'mmcif' or 'bcif'.") + raise ValueError( + f"Invalid format: {format}. Must be 'pdb', 'mmtf', 'mmcif' or 'bcif'." 
+ ) # Make output directory if it doesn't exist or set it to tempdir if None if out_dir is not None: diff --git a/setup.py b/setup.py index c69b44c3..47416adb 100644 --- a/setup.py +++ b/setup.py @@ -156,7 +156,7 @@ def run(self): install_requires=INSTALL_REQUIRES, extras_require=EXTRA_REQUIRES, python_requires=">=3.7", - setup_requires=['setuptools==69.5.1'], + setup_requires=["setuptools==69.5.1"], license="MIT", platforms="any", classifiers=[ From 459b001076b3405ede19fae296302ac79325b940 Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Mon, 3 Jun 2024 18:11:29 +0200 Subject: [PATCH 7/7] pin setuptools version for CI --- .github/workflows/build.yaml | 2 ++ setup.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 5ea4f233..e674be59 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -40,6 +40,8 @@ jobs: channels: "conda-forge, salilab, pytorch, pyg" python-version: ${{ matrix.python-version }} use-mamba: true + - name: Install setuptools + run: pip install setuptools==69.5.1 - name: Install Boost 1.7.3 (for DSSP) run: conda install -c anaconda libboost=1.73.0 - name: Install DSSP diff --git a/setup.py b/setup.py index 47416adb..89553885 100644 --- a/setup.py +++ b/setup.py @@ -156,7 +156,6 @@ def run(self): install_requires=INSTALL_REQUIRES, extras_require=EXTRA_REQUIRES, python_requires=">=3.7", - setup_requires=["setuptools==69.5.1"], license="MIT", platforms="any", classifiers=[
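To round off the series, a sketch of the extended format handling introduced in graphein/protein/utils.py (illustrative, not part of the patch; it assumes `download_pdb` keeps its existing signature with a positional PDB code and returns the path of the downloaded file):

    from graphein.protein.utils import download_pdb

    # "mmcif" resolves to https://files.rcsb.org/download/<code>.cif.gz and
    # "bcif" to https://models.rcsb.org/<code>.bcif.gz, per the new branches.
    for fmt in ["pdb", "mmtf", "mmcif", "bcif"]:
        path = download_pdb("4hhb", out_dir=".", format=fmt)
        print(fmt, "->", path)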