From 93b17eb8162d41843465054b51ee439766cacb30 Mon Sep 17 00:00:00 2001 From: kierandidi Date: Wed, 29 May 2024 14:55:48 +0100 Subject: [PATCH 1/7] [feat] add uniprot ecnumber and cath label options to pdb manager --- graphein/ml/datasets/pdb_data.py | 218 ++++++++++++++++++++++++++++++- graphein/protein/utils.py | 12 +- 2 files changed, 223 insertions(+), 7 deletions(-) diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py index 4ba8fbe7..e1482af4 100644 --- a/graphein/ml/datasets/pdb_data.py +++ b/graphein/ml/datasets/pdb_data.py @@ -36,13 +36,14 @@ def __init__( split_ratios: Optional[List[float]] = None, split_time_frames: Optional[List[np.datetime64]] = None, assign_leftover_rows_to_split_n: int = 0, + labels: Optional[List[str]] = None, ): """Instantiate a selection of experimental PDB structures. :param root_dir: The directory in which to store all PDB entries, defaults to ``"."``. :type root_dir: str, optional - :param structure_format: Whether to use ``.pdb`` or ``.mmtf`` file. + :param structure_format: Whether to use ``.pdb``, ``.mmtf`` or ``mmcif`` file. Defaults to ``"pdb"``. :type structure_format: str, optional :param splits: A list of names corresponding to each dataset split, @@ -58,6 +59,9 @@ def __init__( to assign any rows remaining after creation of new dataset splits, defaults to ``0``. :type assign_leftover_rows_to_split_n: int, optional + :param labels: A list of names corresponding to metadata labels that should be included in PDB manager dataframe, + defaults to ``None``. + :type labels: Optional[List[str]], optional """ # Arguments self.root_dir = Path(root_dir) @@ -83,6 +87,14 @@ def __init__( ) self.pdb_availability_url = "https://files.wwpdb.org/pub/pdb/compatible/pdb_bundle/pdb_bundle_index.txt" + self.pdb_chain_cath_uniprot_url = "https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_cath_uniprot.tsv.gz" + + self.cath_id_cath_code_url = "http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-all.gz" + + self.pdb_chain_ec_number_url = "https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz" + + + self.pdb_dir = self.root_dir / "pdb" if not os.path.exists(self.pdb_dir): os.makedirs(self.pdb_dir) @@ -99,12 +111,15 @@ def __init__( self.pdb_deposition_date_url ).name self.pdb_availability_filename = Path(self.pdb_availability_url).name + self.pdb_chain_cath_uniprot_filename = Path(self.pdb_chain_cath_uniprot_url).name + self.cath_id_cath_code_filename = Path(self.cath_id_cath_code_url).name + self.pdb_chain_ec_number_filename = Path(self.pdb_chain_ec_number_url).name self.list_columns = ["ligands"] # Data self.download_metadata() - self.df = self.parse() + self.df = self.parse(labels) self.source = self.df.copy() # Splits @@ -146,6 +161,9 @@ def download_metadata(self): self._download_entry_metadata() self._download_exp_type() self._download_pdb_availability() + self._download_pdb_chain_cath_uniprot_map() + self._download_cath_id_cath_code_map() + self._download_pdb_chain_ec_number_map() def get_unavailable_pdb_files( self, splits: Optional[List[str]] = None @@ -410,6 +428,33 @@ def _download_pdb_availability(self): log.info("Downloading PDB availability map...") wget.download(self.pdb_availability_url, out=str(self.root_dir)) log.debug("Downloaded PDB availability map") + + def _download_pdb_chain_cath_uniprot_map(self): + """Download mapping from PDB chain to uniprot accession and CATH ID from + 
+        https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_cath_uniprot.tsv.gz
+        """
+        if not os.path.exists(self.root_dir / self.pdb_chain_cath_uniprot_filename):
+            log.info("Downloading Uniprot CATH map...")
+            wget.download(self.pdb_chain_cath_uniprot_url, out=str(self.root_dir))
+            log.debug("Downloaded Uniprot CATH map")
+
+    def _download_cath_id_cath_code_map(self):
+        """Download mapping from CATH IDs to CATH code from
+        http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-all.gz
+        """
+        if not os.path.exists(self.root_dir / self.cath_id_cath_code_filename):
+            log.info("Downloading CATH ID to CATH code map...")
+            wget.download(self.cath_id_cath_code_url, out=str(self.root_dir))
+            log.debug("Downloaded CATH ID to CATH code map")
+
+    def _download_pdb_chain_ec_number_map(self):
+        """Download mapping from PDB chains to EC number from
+        https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz
+        """
+        if not os.path.exists(self.root_dir / self.pdb_chain_ec_number_filename):
+            log.info("Downloading EC number map...")
+            wget.download(self.pdb_chain_ec_number_url, out=str(self.root_dir))
+            log.debug("Downloaded EC number map")

     def _parse_ligand_map(self) -> Dict[str, List[str]]:
         """Parse the ligand maps for all PDB records.
@@ -536,9 +581,90 @@ def _parse_pdb_availability(self) -> Dict[str, bool]:
         ids = {id: False for id in ids}
         return ids

-    def parse(self) -> pd.DataFrame:
+    def _parse_uniprot_id(self) -> Dict[str, str]:
+        """Parse the UniProt ID for all PDB chains.
+
+        :return: Dictionary of PDB chain ID with their
+            corresponding UniProt ID.
+        :rtype: Dict[str, str]
+        """
+        uniprot_mapping = {}
+        with gzip.open(self.root_dir / self.pdb_chain_cath_uniprot_filename, 'rt') as f:
+            for line in f:
+                try:
+                    pdb, chain, uniprot_id, cath_id = line.strip().split('\t')
+                    key = f"{pdb}_{chain}"
+                    uniprot_mapping[key] = uniprot_id
+                except ValueError:
+                    continue
+        return uniprot_mapping
+
+    def _parse_cath_id(self) -> Dict[str, str]:
+        """Parse the CATH ID for all PDB chains.
+
+        :return: Dictionary of PDB chain ID with their
+            corresponding CATH ID.
+        :rtype: Dict[str, str]
+        """
+        cath_mapping = {}
+        with gzip.open(self.root_dir / self.pdb_chain_cath_uniprot_filename, 'rt') as f:
+            next(f) # Skip header line
+            for line in f:
+                try:
+                    pdb, chain, uniprot_id, cath_id = line.strip().split('\t')
+                    key = f"{pdb}_{chain}"
+                    cath_mapping[key] = cath_id
+                except ValueError:
+                    continue
+        return cath_mapping
+
+    def _parse_cath_code(self) -> Dict[str, str]:
+        """Parse the CATH code for all CATH IDs.
+
+        :return: Dictionary of CATH ID with their
+            corresponding CATH code.
+        :rtype: Dict[str, str]
+        """
+        cath_mapping = {}
+        with gzip.open(self.root_dir / self.cath_id_cath_code_filename, 'rt') as f:
+            for line in f:
+                try:
+                    cath_id, cath_version, cath_code, cath_segment = line.strip().split()
+                    cath_mapping[cath_id] = cath_code
+                except ValueError:
+                    continue
+        return cath_mapping
+
+    def _parse_ec_number(self) -> Dict[str, str]:
+        """Parse the EC number for all PDB chains, storing ``None`` when no EC number is present.
+
+        :return: Dictionary of PDB chain ID with their
+            corresponding EC number.
+        :rtype: Dict[str, str]
+        """
+        ec_mapping = {}
+        with gzip.open(self.root_dir / self.pdb_chain_ec_number_filename, 'rt') as f:
+            next(f) # Skip header line
+            for line in f:
+                try:
+                    pdb, chain, uniprot_id, ec_number = line.strip().split('\t')
+                    key = f"{pdb}_{chain}"
+                    ec_number = None if ec_number == '?'
else ec_number + ec_mapping[key] = ec_number + except ValueError: + continue + return ec_mapping + + def parse(self, labels: List[str]) -> pd.DataFrame: """Parse all PDB sequence records. + :param labels: A list of names corresponding to metadata labels that should be included in PDB manager dataframe, + defaults to ``None``. + :type labels: Optional[List[str]], optional + :return: DataFrame containing PDB sequence entries with their corresponding metadata. :rtype: pd.DataFrame @@ -579,6 +705,14 @@ def parse(self) -> pd.DataFrame: df["experiment_type"] = df.pdb.map(self._parse_experiment_type()) df["pdb_file_available"] = df.pdb.map(self._parse_pdb_availability()) df.pdb_file_available.fillna(True, inplace=True) + if labels: + if "uniprot_id" in labels: + df["uniprot_id"] = df.id.map(self._parse_uniprot_id()) + if "cath_code" in labels: + df["cath_id"] = df.id.map(self._parse_cath_id()) + df["cath_code"] = df.cath_id.map(self._parse_cath_code()) + if "ec_number" in labels: + df["ec_number"] = df.id.map(self._parse_ec_number()) return df @@ -1150,6 +1284,82 @@ def select_complexes_with_grouped_molecule_types( if update: self.df = df + def has_uniprot_id( + self, + splits: Optional[List[str]] = None, + update: bool = False, + ) -> pd.DataFrame: + """ + Select entries that have a uniprot ID. + + :param splits: Names of splits for which to perform the operation, + defaults to ``None``. + :type splits: Optional[List[str]], optional + :param update: Whether to modify the DataFrame in place, defaults to + ``False``. + :type update: bool, optional + + :return: DataFrame of selected molecules. + :rtype: pd.DataFrame + """ + splits_df = self.get_splits(splits) + df = splits_df.dropna(subset=['uniprot_id']) + + if update: + self.df = df + return df + + + def has_cath_code( + self, + splits: Optional[List[str]] = None, + update: bool = False, + ) -> pd.DataFrame: + """ + Select entries that have a cath code. + + :param splits: Names of splits for which to perform the operation, + defaults to ``None``. + :type splits: Optional[List[str]], optional + :param update: Whether to modify the DataFrame in place, defaults to + ``False``. + :type update: bool, optional + + :return: DataFrame of selected molecules. + :rtype: pd.DataFrame + """ + splits_df = self.get_splits(splits) + df = splits_df.dropna(subset=['cath_code']) + + if update: + self.df = df + return df + + def has_ec_number( + self, + splits: Optional[List[str]] = None, + update: bool = False, + ) -> pd.DataFrame: + """ + Select entries that have an EC number. + + :param splits: Names of splits for which to perform the operation, + defaults to ``None``. + :type splits: Optional[List[str]], optional + :param update: Whether to modify the DataFrame in place, defaults to + ``False``. + :type update: bool, optional + + :return: DataFrame of selected molecules. + :rtype: pd.DataFrame + """ + splits_df = self.get_splits(splits) + df = splits_df.dropna(subset=['ec_number']) + + if update: + self.df = df + return df + def split_df_proportionally( self, df: pd.DataFrame, @@ -1572,7 +1782,7 @@ def download_pdbs( :param out_dir: Output directory, defaults to ``"."`` :type out_dir: str, optional - :param format: Filetype to download. ``pdb`` or ``mmtf``. + :param format: Filetype to download. ``pdb``, ``mmtf``, ``mmcif`` or ``bcif``. :type format: str :param splits: Names of splits for which to perform the operation, defaults to ``None``. 
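Taken together, the pdb_data.py changes above wire the three SIFTS/CATH metadata maps into the manager. A minimal usage sketch, not part of the patch; it assumes the class is the existing `PDBManager` exported from `graphein.ml.datasets`, and that the label strings follow the literals used above:

    from graphein.ml.datasets import PDBManager

    # Request the new metadata columns at construction time; each label
    # triggers the corresponding download/parse step defined above.
    manager = PDBManager(
        root_dir=".",
        labels=["uniprot_id", "cath_code", "ec_number"],
    )

    # Keep only chains that carry an EC annotation; with update=False a
    # filtered copy is returned and manager.df is left untouched.
    enzymes = manager.has_ec_number()
    print(enzymes[["id", "uniprot_id", "cath_code", "ec_number"]].head())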
diff --git a/graphein/protein/utils.py b/graphein/protein/utils.py
index 6c9b76e8..f69a0684 100644
--- a/graphein/protein/utils.py
+++ b/graphein/protein/utils.py
@@ -108,7 +108,7 @@ def download_pdb_multiprocessing(
     :type pdb_codes: List[str]
     :param out_dir: Path to directory to download PDB structures to.
     :type out_dir: Union[str, Path]
-    :param format: Filetype to download. ``pdb`` or ``mmtf``.
+    :param format: Filetype to download. ``pdb``, ``mmtf``, ``mmcif`` or ``bcif``.
     :type format: str
     :param overwrite: Whether to overwrite existing files, defaults to
         ``False``.
@@ -162,7 +162,7 @@ def download_pdb(
     :param out_dir: Path to directory to download PDB structure to. If
         ``None``, will download to a temporary directory.
     :type out_dir: Optional[Union[str, Path]]
-    :param format: Filetype to download. ``pdb`` or ``mmtf``.
+    :param format: Filetype to download. ``pdb``, ``mmtf``, ``mmcif`` or ``bcif``.
     :type format: str
     :param check_obsolete: Whether to check for obsolete PDB codes,
         defaults to ``False``. If an obsolete PDB code is found, the updated PDB
@@ -183,8 +183,14 @@ def download_pdb(
     elif format == "mmtf":
         BASE_URL = "https://mmtf.rcsb.org/v1.0/full/"
         extension = ".mmtf.gz"
+    elif format == "mmcif":
+        BASE_URL = "https://files.rcsb.org/download/"
+        extension = ".cif.gz"
+    elif format == "bcif":
+        BASE_URL = "https://models.rcsb.org/"
+        extension = ".bcif.gz"
     else:
-        raise ValueError(f"Invalid format: {format}. Must be 'pdb' or 'mmtf'.")
+        raise ValueError(f"Invalid format: {format}. Must be 'pdb', 'mmtf', 'mmcif' or 'bcif'.")

     # Make output directory if it doesn't exist or set it to tempdir if None
     if out_dir is not None:

From c4aea57eebafb125795cd7d70889c9abe549fe5a Mon Sep 17 00:00:00 2001
From: kierandidi
Date: Wed, 29 May 2024 15:00:32 +0100
Subject: [PATCH 2/7] [doc] added to changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e172426f..d0baa19e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@
 * Fix cluster file loading bug in `pdb_data.py` [#396](https://github.com/a-r-j/graphein/pull/396)

 #### Misc
+* add metadata options for UniProt ID, EC number and CATH code to the PDB manager [#398](https://github.com/a-r-j/graphein/pull/398)
 * bumped logging level down from `INFO` to `DEBUG` at several places to reduced output length [#391](https://github.com/a-r-j/graphein/pull/391)
 * exposed `fill_value` and `bfactor` option to `protein_to_pyg` function. [#385](https://github.com/a-r-j/graphein/pull/385) and [#388](https://github.com/a-r-j/graphein/pull/388)
 * Updated Foldcomp datasets with improved setup function and updated database choices such as ESMAtlas.
[#382](https://github.com/a-r-j/graphein/pull/382) From 0ca75eb0a4118a2800891e7fd6670e76f99edc0c Mon Sep 17 00:00:00 2001 From: kierandidi Date: Wed, 29 May 2024 23:12:32 +0100 Subject: [PATCH 3/7] [fix] address pandas warnings --- graphein/ml/datasets/pdb_data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py index e1482af4..71ebe3df 100644 --- a/graphein/ml/datasets/pdb_data.py +++ b/graphein/ml/datasets/pdb_data.py @@ -553,7 +553,7 @@ def _parse_entries(self) -> Dict[str, datetime]: df.dropna(subset=["id"], inplace=True) df.id = df.id.str.lower() - df.date = pd.to_datetime(df.date) + df.date = pd.to_datetime(df.date, format = "%m/%d/%y") return pd.Series(df["date"].values, index=df["id"]).to_dict() def _parse_experiment_type(self) -> Dict[str, str]: @@ -704,7 +704,7 @@ def parse(self, labels: List[str]) -> pd.DataFrame: df["deposition_date"] = df.pdb.map(self._parse_entries()) df["experiment_type"] = df.pdb.map(self._parse_experiment_type()) df["pdb_file_available"] = df.pdb.map(self._parse_pdb_availability()) - df.pdb_file_available.fillna(True, inplace=True) + df["pdb_file_available"] = df["pdb_file_available"].fillna(True) if labels: if "uniprot_id" in labels: df["uniprot_id"] = df.id.map(self._parse_uniprot_id()) @@ -1771,8 +1771,8 @@ def reset(self) -> pd.DataFrame: def download_pdbs( self, - out_dir=".", - format="pdb", + out_dir: str = ".", + format: str = "pdb", splits: Optional[List[str]] = None, overwrite: bool = False, max_workers: int = 8, From 680d16a3cd783e1c0b32cdd842e95becef961ae9 Mon Sep 17 00:00:00 2001 From: kierandidi Date: Thu, 30 May 2024 14:24:45 +0100 Subject: [PATCH 4/7] [feat] improve type hinting and add subselecting based on labels --- graphein/ml/datasets/pdb_data.py | 33 ++++++++++++++++++++++++++++---- graphein/protein/tensor/io.py | 2 +- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py index 71ebe3df..8567fb25 100644 --- a/graphein/ml/datasets/pdb_data.py +++ b/graphein/ml/datasets/pdb_data.py @@ -6,7 +6,7 @@ from datetime import datetime from io import StringIO from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union, Literal import numpy as np import pandas as pd @@ -36,7 +36,7 @@ def __init__( split_ratios: Optional[List[float]] = None, split_time_frames: Optional[List[np.datetime64]] = None, assign_leftover_rows_to_split_n: int = 0, - labels: Optional[List[str]] = None, + labels: Optional[List[Literal["uniprot_id", "cath_code", "ec_number"]]] = None, ): """Instantiate a selection of experimental PDB structures. @@ -61,7 +61,7 @@ def __init__( :type assign_leftover_rows_to_split_n: int, optional :param labels: A list of names corresponding to metadata labels that should be included in PDB manager dataframe, defaults to ``None``. - :type labels: Optional[List[str]], optional + :type labels: Optional[List[Literal["uniprot_id", "cath_code", "ec_number"]]], optional """ # Arguments self.root_dir = Path(root_dir) @@ -658,7 +658,7 @@ def _parse_ec_number(self) -> Dict[str, str]: continue return ec_mapping - def parse(self, labels: List[str]) -> pd.DataFrame: + def parse(self, labels: Optional[List[Literal["uniprot_id", "cath_code", "ec_number"]]] = None) -> pd.DataFrame: """Parse all PDB sequence records. 
     :param labels: A list of names corresponding to metadata labels that should be included in PDB manager dataframe,
@@ -1286,12 +1286,17 @@ def select_complexes_with_grouped_molecule_types(

     def has_uniprot_id(
         self,
+        select_ids: Optional[List[str]] = None,
         splits: Optional[List[str]] = None,
         update: bool = False,
     ) -> pd.DataFrame:
         """
         Select entries that have a uniprot ID.

+        :param select_ids: If present, filter for only these IDs. If not present, filter for entries
+            that have any UniProt ID, defaults to ``None``.
+        :type select_ids: Optional[List[str]], optional
         :param splits: Names of splits for which to perform the operation,
             defaults to ``None``.
         :type splits: Optional[List[str]], optional
         :param update: Whether to modify the DataFrame in place, defaults to
             ``False``.
         :type update: bool, optional

         :return: DataFrame of selected molecules.
         :rtype: pd.DataFrame
         """
         splits_df = self.get_splits(splits)
         df = splits_df.dropna(subset=['uniprot_id'])

+        if select_ids:
+            df = df[df['uniprot_id'].isin(select_ids)]
+
         if update:
             self.df = df
         return df

     def has_cath_code(
         self,
+        select_ids: Optional[List[str]] = None,
         splits: Optional[List[str]] = None,
         update: bool = False,
     ) -> pd.DataFrame:
         """
         Select entries that have a cath code.

+        :param select_ids: If present, filter for only these CATH codes. If not present, filter for entries
+            that have any CATH code, defaults to ``None``.
+        :type select_ids: Optional[List[str]], optional
         :param splits: Names of splits for which to perform the operation,
             defaults to ``None``.
         :type splits: Optional[List[str]], optional
         :param update: Whether to modify the DataFrame in place, defaults to
             ``False``.
         :type update: bool, optional

         :return: DataFrame of selected molecules.
         :rtype: pd.DataFrame
         """
         splits_df = self.get_splits(splits)
         df = splits_df.dropna(subset=['cath_code'])

+        if select_ids:
+            df = df[df['cath_code'].isin(select_ids)]
+
+
         if update:
             self.df = df
         return df

     def has_ec_number(
         self,
+        select_ids: Optional[List[str]] = None,
         splits: Optional[List[str]] = None,
         update: bool = False,
     ) -> pd.DataFrame:
         """
         Select entries that have an EC number.

+        :param select_ids: If present, filter for only these EC numbers. If not present, filter for entries
+            that have any EC number, defaults to ``None``.
+        :type select_ids: Optional[List[str]], optional
         :param splits: Names of splits for which to perform the operation,
             defaults to ``None``.
:type splits: Optional[List[str]], optional @@ -1356,6 +1378,9 @@ def has_ec_number( splits_df = self.get_splits(splits) df = splits_df.dropna(subset=['ec_number']) + if select_ids: + df = df[df['ec_number'].isin(select_ids)] + if update: self.df = df return df diff --git a/graphein/protein/tensor/io.py b/graphein/protein/tensor/io.py index dc7698bd..cdd2714a 100644 --- a/graphein/protein/tensor/io.py +++ b/graphein/protein/tensor/io.py @@ -349,7 +349,7 @@ def protein_df_to_tensor( """ num_residues = get_protein_length(df, insertions=insertions) df = df.loc[df["atom_name"].isin(atoms_to_keep)] - residue_indices = pd.factorize(get_residue_id(df, unique=False))[0] + residue_indices = pd.factorize(pd.Series(get_residue_id(df, unique=False)))[0] atom_indices = df["atom_name"].map(lambda x: atoms_to_keep.index(x)).values positions: AtomTensor = ( From 158b3aa46c650b8d76e5156462141f2437bd2e22 Mon Sep 17 00:00:00 2001 From: kierandidi Date: Thu, 30 May 2024 15:06:48 +0100 Subject: [PATCH 5/7] [fix] pin setuptools in setup.py --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 89553885..c69b44c3 100644 --- a/setup.py +++ b/setup.py @@ -156,6 +156,7 @@ def run(self): install_requires=INSTALL_REQUIRES, extras_require=EXTRA_REQUIRES, python_requires=">=3.7", + setup_requires=['setuptools==69.5.1'], license="MIT", platforms="any", classifiers=[ From c1d8c4a2ccea8ba88bdabe077fc03aae079e9654 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 30 May 2024 14:07:16 +0000 Subject: [PATCH 6/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- graphein/ml/datasets/pdb_data.py | 95 ++++++++++++++++++++------------ graphein/protein/tensor/io.py | 4 +- graphein/protein/utils.py | 4 +- setup.py | 2 +- 4 files changed, 67 insertions(+), 38 deletions(-) diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py index 8567fb25..6dbb4fc4 100644 --- a/graphein/ml/datasets/pdb_data.py +++ b/graphein/ml/datasets/pdb_data.py @@ -6,7 +6,7 @@ from datetime import datetime from io import StringIO from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union, Literal +from typing import Any, Callable, Dict, List, Literal, Optional, Union import numpy as np import pandas as pd @@ -36,7 +36,9 @@ def __init__( split_ratios: Optional[List[float]] = None, split_time_frames: Optional[List[np.datetime64]] = None, assign_leftover_rows_to_split_n: int = 0, - labels: Optional[List[Literal["uniprot_id", "cath_code", "ec_number"]]] = None, + labels: Optional[ + List[Literal["uniprot_id", "cath_code", "ec_number"]] + ] = None, ): """Instantiate a selection of experimental PDB structures. 
@@ -93,8 +95,6 @@ def __init__( self.pdb_chain_ec_number_url = "https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz" - - self.pdb_dir = self.root_dir / "pdb" if not os.path.exists(self.pdb_dir): os.makedirs(self.pdb_dir) @@ -111,9 +111,13 @@ def __init__( self.pdb_deposition_date_url ).name self.pdb_availability_filename = Path(self.pdb_availability_url).name - self.pdb_chain_cath_uniprot_filename = Path(self.pdb_chain_cath_uniprot_url).name + self.pdb_chain_cath_uniprot_filename = Path( + self.pdb_chain_cath_uniprot_url + ).name self.cath_id_cath_code_filename = Path(self.cath_id_cath_code_url).name - self.pdb_chain_ec_number_filename = Path(self.pdb_chain_ec_number_url).name + self.pdb_chain_ec_number_filename = Path( + self.pdb_chain_ec_number_url + ).name self.list_columns = ["ligands"] @@ -428,16 +432,20 @@ def _download_pdb_availability(self): log.info("Downloading PDB availability map...") wget.download(self.pdb_availability_url, out=str(self.root_dir)) log.debug("Downloaded PDB availability map") - + def _download_pdb_chain_cath_uniprot_map(self): """Download mapping from PDB chain to uniprot accession and CATH ID from https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_cath_uniprot.tsv.gz """ - if not os.path.exists(self.root_dir / self.pdb_chain_cath_uniprot_filename): + if not os.path.exists( + self.root_dir / self.pdb_chain_cath_uniprot_filename + ): log.info("Downloading Uniprot CATH map...") - wget.download(self.pdb_chain_cath_uniprot_url, out=str(self.root_dir)) + wget.download( + self.pdb_chain_cath_uniprot_url, out=str(self.root_dir) + ) log.debug("Downloaded Uniprot CATH map") - + def _download_cath_id_cath_code_map(self): """Download mapping from CATH IDs to CATH code from http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-all.gz @@ -451,7 +459,9 @@ def _download_pdb_chain_ec_number_map(self): """Download mapping from PDB chains to EC number from https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz """ - if not os.path.exists(self.root_dir / self.pdb_chain_ec_number_filename): + if not os.path.exists( + self.root_dir / self.pdb_chain_ec_number_filename + ): log.info("Downloading EC number map...") wget.download(self.pdb_chain_ec_number_url, out=str(self.root_dir)) log.debug("Downloaded EC number map") @@ -553,7 +563,7 @@ def _parse_entries(self) -> Dict[str, datetime]: df.dropna(subset=["id"], inplace=True) df.id = df.id.str.lower() - df.date = pd.to_datetime(df.date, format = "%m/%d/%y") + df.date = pd.to_datetime(df.date, format="%m/%d/%y") return pd.Series(df["date"].values, index=df["id"]).to_dict() def _parse_experiment_type(self) -> Dict[str, str]: @@ -589,16 +599,18 @@ def _parse_uniprot_id(self) -> Dict[str, str]: :rtype: Dict[str, str] """ uniprot_mapping = {} - with gzip.open(self.root_dir / self.pdb_chain_cath_uniprot_filename, 'rt') as f: + with gzip.open( + self.root_dir / self.pdb_chain_cath_uniprot_filename, "rt" + ) as f: for line in f: try: - pdb, chain, uniprot_id, cath_id = line.strip().split('\t') + pdb, chain, uniprot_id, cath_id = line.strip().split("\t") key = f"{pdb}_{chain}" uniprot_mapping[key] = uniprot_id except ValueError: continue return uniprot_mapping - + def _parse_cath_id(self) -> Dict[str, str]: """Parse the CATH ID for all PDB chains. 
@@ -607,17 +619,19 @@ def _parse_cath_id(self) -> Dict[str, str]:
         :rtype: Dict[str, str]
         """
         cath_mapping = {}
-        with gzip.open(self.root_dir / self.pdb_chain_cath_uniprot_filename, 'rt') as f:
-            next(f) # Skip header line
+        with gzip.open(
+            self.root_dir / self.pdb_chain_cath_uniprot_filename, "rt"
+        ) as f:
+            next(f)  # Skip header line
             for line in f:
                 try:
-                    pdb, chain, uniprot_id, cath_id = line.strip().split('\t')
+                    pdb, chain, uniprot_id, cath_id = line.strip().split("\t")
                     key = f"{pdb}_{chain}"
                     cath_mapping[key] = cath_id
                 except ValueError:
                     continue
         return cath_mapping
-
+
     def _parse_cath_code(self) -> Dict[str, str]:
         """Parse the CATH code for all CATH IDs.
@@ -626,18 +640,22 @@ def _parse_cath_code(self) -> Dict[str, str]:
         :rtype: Dict[str, str]
         """
         cath_mapping = {}
-        with gzip.open(self.root_dir / self.cath_id_cath_code_filename, 'rt') as f:
+        with gzip.open(
+            self.root_dir / self.cath_id_cath_code_filename, "rt"
+        ) as f:
             for line in f:
                 try:
-                    cath_id, cath_version, cath_code, cath_segment = line.strip().split()
+                    cath_id, cath_version, cath_code, cath_segment = (
+                        line.strip().split()
+                    )
                     cath_mapping[cath_id] = cath_code
                 except ValueError:
                     continue
         return cath_mapping
-
+
     def _parse_ec_number(self) -> Dict[str, str]:
         """Parse the EC number for all PDB chains, storing ``None`` when no EC number is present.
@@ -646,19 +664,28 @@ def _parse_ec_number(self) -> Dict[str, str]:
         :rtype: Dict[str, str]
         """
         ec_mapping = {}
-        with gzip.open(self.root_dir / self.pdb_chain_ec_number_filename, 'rt') as f:
-            next(f) # Skip header line
+        with gzip.open(
+            self.root_dir / self.pdb_chain_ec_number_filename, "rt"
+        ) as f:
+            next(f)  # Skip header line
             for line in f:
                 try:
-                    pdb, chain, uniprot_id, ec_number = line.strip().split('\t')
+                    pdb, chain, uniprot_id, ec_number = line.strip().split(
+                        "\t"
+                    )
                     key = f"{pdb}_{chain}"
                     ec_number = None if ec_number == "?" else ec_number
                     ec_mapping[key] = ec_number
                 except ValueError:
                     continue
         return ec_mapping

-    def parse(self, labels: Optional[List[Literal["uniprot_id", "cath_code", "ec_number"]]] = None) -> pd.DataFrame:
+    def parse(
+        self,
+        labels: Optional[
+            List[Literal["uniprot_id", "cath_code", "ec_number"]]
+        ] = None,
+    ) -> pd.DataFrame:
         """Parse all PDB sequence records.
:param labels: A list of names corresponding to metadata labels that should be included in PDB manager dataframe, @@ -1308,16 +1335,15 @@ def has_uniprot_id( :rtype: pd.DataFrame """ splits_df = self.get_splits(splits) - df = splits_df.dropna(subset=['uniprot_id']) + df = splits_df.dropna(subset=["uniprot_id"]) if select_ids: - df = df[df['uniprot_id'].isin(select_ids)] + df = df[df["uniprot_id"].isin(select_ids)] if update: self.df = df return df - def has_cath_code( self, select_ids: Optional[List[str]] = None, @@ -1342,11 +1368,10 @@ def has_cath_code( :rtype: pd.DataFrame """ splits_df = self.get_splits(splits) - df = splits_df.dropna(subset=['cath_code']) + df = splits_df.dropna(subset=["cath_code"]) if select_ids: - df = df[df['cath_code'].isin(select_ids)] - + df = df[df["cath_code"].isin(select_ids)] if update: self.df = df @@ -1376,10 +1401,10 @@ def has_ec_number( :rtype: pd.DataFrame """ splits_df = self.get_splits(splits) - df = splits_df.dropna(subset=['ec_number']) + df = splits_df.dropna(subset=["ec_number"]) if select_ids: - df = df[df['ec_number'].isin(select_ids)] + df = df[df["ec_number"].isin(select_ids)] if update: self.df = df diff --git a/graphein/protein/tensor/io.py b/graphein/protein/tensor/io.py index cdd2714a..58089158 100644 --- a/graphein/protein/tensor/io.py +++ b/graphein/protein/tensor/io.py @@ -349,7 +349,9 @@ def protein_df_to_tensor( """ num_residues = get_protein_length(df, insertions=insertions) df = df.loc[df["atom_name"].isin(atoms_to_keep)] - residue_indices = pd.factorize(pd.Series(get_residue_id(df, unique=False)))[0] + residue_indices = pd.factorize( + pd.Series(get_residue_id(df, unique=False)) + )[0] atom_indices = df["atom_name"].map(lambda x: atoms_to_keep.index(x)).values positions: AtomTensor = ( diff --git a/graphein/protein/utils.py b/graphein/protein/utils.py index f69a0684..c16669f1 100644 --- a/graphein/protein/utils.py +++ b/graphein/protein/utils.py @@ -190,7 +190,9 @@ def download_pdb( BASE_URL = "https://models.rcsb.org/" extension = ".bcif.gz" else: - raise ValueError(f"Invalid format: {format}. Must be 'pdb', 'mmtf', 'mmcif' or 'bcif'.") + raise ValueError( + f"Invalid format: {format}. Must be 'pdb', 'mmtf', 'mmcif' or 'bcif'." 
+ ) # Make output directory if it doesn't exist or set it to tempdir if None if out_dir is not None: diff --git a/setup.py b/setup.py index c69b44c3..47416adb 100644 --- a/setup.py +++ b/setup.py @@ -156,7 +156,7 @@ def run(self): install_requires=INSTALL_REQUIRES, extras_require=EXTRA_REQUIRES, python_requires=">=3.7", - setup_requires=['setuptools==69.5.1'], + setup_requires=["setuptools==69.5.1"], license="MIT", platforms="any", classifiers=[ From 459b001076b3405ede19fae296302ac79325b940 Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Mon, 3 Jun 2024 18:11:29 +0200 Subject: [PATCH 7/7] pin setuptools version for CI --- .github/workflows/build.yaml | 2 ++ setup.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 5ea4f233..e674be59 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -40,6 +40,8 @@ jobs: channels: "conda-forge, salilab, pytorch, pyg" python-version: ${{ matrix.python-version }} use-mamba: true + - name: Install setuptools + run: pip install setuptools==69.5.1 - name: Install Boost 1.7.3 (for DSSP) run: conda install -c anaconda libboost=1.73.0 - name: Install DSSP diff --git a/setup.py b/setup.py index 47416adb..89553885 100644 --- a/setup.py +++ b/setup.py @@ -156,7 +156,6 @@ def run(self): install_requires=INSTALL_REQUIRES, extras_require=EXTRA_REQUIRES, python_requires=">=3.7", - setup_requires=["setuptools==69.5.1"], license="MIT", platforms="any", classifiers=[
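To round off the series, a sketch of the extended format handling introduced in graphein/protein/utils.py (illustrative, not part of the patch; it assumes `download_pdb` keeps its existing signature with a positional PDB code and returns the path of the downloaded file):

    from graphein.protein.utils import download_pdb

    # "mmcif" resolves to https://files.rcsb.org/download/<code>.cif.gz and
    # "bcif" to https://models.rcsb.org/<code>.bcif.gz, per the new branches.
    for fmt in ["pdb", "mmtf", "mmcif", "bcif"]:
        path = download_pdb("4hhb", out_dir=".", format=fmt)
        print(fmt, "->", path)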