From e6e9658c589416b36ef9f0b83c6d5f41a710690a Mon Sep 17 00:00:00 2001
From: Alex Morehead <acmwhb@missouri.edu>
Date: Fri, 18 Aug 2023 13:22:37 -0700
Subject: [PATCH] Add the ability for the `PDBManager` to perform
 interface-based chain filtering

---
 graphein/ml/datasets/pdb_data.py | 221 ++++++++++++++++++++++++++++++-
 1 file changed, 216 insertions(+), 5 deletions(-)

diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py
index 9082746a..c1c82612 100644
--- a/graphein/ml/datasets/pdb_data.py
+++ b/graphein/ml/datasets/pdb_data.py
@@ -1,3 +1,4 @@
+import copy
 import gzip
 import os
 import shutil
@@ -13,6 +14,7 @@
 from biopandas.pdb import PandasPdb
 from loguru import logger as log
 from pandas.core.groupby.generic import DataFrameGroupBy
+from scipy.spatial.distance import cdist
 from tqdm import tqdm
 
 from graphein.protein.utils import (
@@ -23,6 +25,11 @@
 )
 from graphein.utils.dependencies import is_tool
 
+PRIMARY_INTERCHAIN_CONTACT_ATOMS_FOR_FILTERING: List[str] = ["CA", "C4'"]
+SECONDARY_INTERCHAIN_CONTACT_ATOMS_NOT_FOR_FILTERING: List[str] = ["H"]
+PRIMARY_HYDROGEN_BOND_ATOMS_FOR_FILTERING: List[str] = ["N", "O", "N1", "N9", "N3", "C2", "C4", "C5", "C6"]
+SECONDARY_HYDROGEN_BOND_ATOMS_FOR_FILTERING: List[str] = ["N", "O", "N1", "N9", "N3", "C2", "C4", "C5", "C6"]
+
 
 class PDBManager:
     """A utility for creating selections of experimental PDB structures."""
@@ -1818,6 +1825,99 @@ def select_pdb_by_criterion(
                 pdb.df[key] = filtered_pdb
         return pdb
 
+    def filter_chains_by_interface_criteria(
+        self,
+        pdb: PandasPdb,
+        primary_interchain_contact_atoms_for_filtering: List[str] = PRIMARY_INTERCHAIN_CONTACT_ATOMS_FOR_FILTERING,
+        secondary_interchain_contact_atoms_not_for_filtering: List[str] = SECONDARY_INTERCHAIN_CONTACT_ATOMS_NOT_FOR_FILTERING,
+        primary_hydrogen_bond_atoms_for_filtering: List[str] = PRIMARY_HYDROGEN_BOND_ATOMS_FOR_FILTERING,
+        secondary_hydrogen_bond_atoms_for_filtering: List[str] = SECONDARY_HYDROGEN_BOND_ATOMS_FOR_FILTERING,
+        interface_contact_criterion: float = 7.0,
+        hydrogen_bond_criterion: float = 3.5,
+        interface_contact_count: int = 16,
+        hydrogen_bond_count: int = 10,
+        chain_id_col: str = "chain_id",
+        atom_name_col: str = "atom_name",
+        atom_df_name: str = "ATOM",
+    ) -> PandasPdb:
+        """Filter a PDB using interface criteria.
+
+        :param pdb: The PDB object to filter by interface criteria.
+        :type pdb: PandasPdb
+        :param primary_interchain_contact_atoms_for_filtering: The main atoms in each residue with
+            which to measure inter-chain pairwise residue distances.
+        :type primary_interchain_contact_atoms_for_filtering: List[str], optional
+        :param secondary_interchain_contact_atoms_not_for_filtering: The secondary atoms in each residue without
+            which to measure inter-chain pairwise residue distances.
+        :type secondary_interchain_contact_atoms_not_for_filtering: List[str], optional
+        :param primary_hydrogen_bond_atoms_for_filtering: The main atoms in each residue with
+            which to measure inter-chain atom distances for hydrogen bonding.
+        :type primary_hydrogen_bond_atoms_for_filtering: List[str], optional
+        :param secondary_hydrogen_bond_atoms_for_filtering: The secondary atoms in each residue with
+            which to measure inter-chain atom distances for hydrogen bonding.
+        :type secondary_hydrogen_bond_atoms_for_filtering: List[str], optional
+        :param interface_contact_criterion: Distance between two inter-chain
+            residues at which to classify a residue pair as interface contacts.
+        :type interface_contact_criterion: float, optional
+        :param hydrogen_bond_criterion: Distance between two inter-chain
+            atoms at which to classify an atom pair as hydrogen-bonded.
+        :type hydrogen_bond_criterion: float, optional
+        :param interface_contact_count: Number of interface contacts required
+            to select a chain to be exported.
+        :type interface_contact_count: int, optional
+        :param hydrogen_bond_count: Number of hydrogen bonds required
+            to select a chain to be exported.
+        :type hydrogen_bond_count: int, optional
+        :param chain_id_col: Name of the chain ID DataFrame column.
+        type: chain_id_col: str, optional
+        :param atom_name_col: Name of the atom name DataFrame column.
+        type: atom_name_col: str, optional
+        :param atom_df_name: Name of the DataFrame by which to access
+            ATOM entries within a PandasPdb object.
+        :type atom_df_name: str, defaults to ``ATOM``
+
+        :return: The filtered PDB object.
+        :rtype: PandasPdb
+        """
+        filtered_pdb = copy.deepcopy(pdb)
+    
+        atom_data = pdb.df[atom_df_name]
+        unique_chain_ids = atom_data[chain_id_col].unique()
+        
+        interface_contact_atom_mask = atom_data[atom_name_col].isin(primary_interchain_contact_atoms_for_filtering)
+        interface_contact_other_atom_mask = ~atom_data[atom_name_col].isin(secondary_interchain_contact_atoms_not_for_filtering)
+        hydrogen_bond_atom_mask = atom_data[atom_name_col].isin(primary_hydrogen_bond_atoms_for_filtering)
+        hydrogen_bond_other_atom_mask = atom_data[atom_name_col].isin(secondary_hydrogen_bond_atoms_for_filtering)
+        
+        for chain1 in unique_chain_ids:
+            interface_contact_chain1_mask = (atom_data[chain_id_col] == chain1) & interface_contact_atom_mask
+            hydrogen_bond_chain1_mask = (atom_data[chain_id_col] == chain1) & hydrogen_bond_atom_mask
+            interface_contact_chain1_residues = atom_data[interface_contact_chain1_mask]
+            hydrogen_bond_chain1_atoms = atom_data[hydrogen_bond_chain1_mask]
+            
+            if np.sum(interface_contact_chain1_mask) == 0 or np.sum(hydrogen_bond_chain1_mask) == 0:
+                continue
+            
+            interface_contact_chain1_coords = interface_contact_chain1_residues[["x_coord", "y_coord", "z_coord"]].to_numpy()
+            interface_contact_non_chain1_coords = atom_data.loc[interface_contact_other_atom_mask & (atom_data[chain_id_col] != chain1), ["x_coord", "y_coord", "z_coord"]].to_numpy()
+            hydrogen_bond_chain1_coords = hydrogen_bond_chain1_atoms[["x_coord", "y_coord", "z_coord"]].to_numpy()
+            hydrogen_bond_non_chain1_coords = atom_data.loc[hydrogen_bond_other_atom_mask & (atom_data[chain_id_col] != chain1), ["x_coord", "y_coord", "z_coord"]].to_numpy()
+
+            interface_contact_distances = cdist(interface_contact_chain1_coords, interface_contact_non_chain1_coords, metric="euclidean")
+            hydrogen_bond_distances = cdist(hydrogen_bond_chain1_coords, hydrogen_bond_non_chain1_coords, metric="euclidean")
+            
+            num_interface_contacts = np.sum(interface_contact_distances <= interface_contact_criterion, axis=1).sum()
+            chain_within_interface = (num_interface_contacts >= interface_contact_count).item()
+
+            num_hydrogen_bonds = np.sum(hydrogen_bond_distances <= hydrogen_bond_criterion, axis=1).sum()
+            chain_with_sufficient_bond_count = (num_hydrogen_bonds >= hydrogen_bond_count).item()
+            
+            if not chain_within_interface or not chain_with_sufficient_bond_count:
+                log.info(f"Filtering out chain {chain1} within PDB {pdb.pdb_path}, as it contains {num_interface_contacts} (of {interface_contact_count} required) interface contacts and {num_hydrogen_bonds} (of {hydrogen_bond_count} required) hydrogen bonds")
+                filtered_pdb.df[atom_df_name] = filtered_pdb.df[atom_df_name][filtered_pdb.df[atom_df_name][chain_id_col] != chain1]
+        
+        return filtered_pdb
+
     def write_out_pdb_chain_groups(
         self,
         df: pd.DataFrame,
@@ -1828,6 +1928,11 @@ def write_out_pdb_chain_groups(
         atom_df_name: str = "ATOM",
         max_num_chains_per_pdb_code: int = -1,
         models: List[int] = [1],
+        filter_for_interface_contacts: bool = False,
+        interface_contact_criterion: float = 7.0,
+        hydrogen_bond_criterion: float = 3.5,
+        interface_contact_count: int = 16,
+        hydrogen_bond_count: int = 10,
     ):
         """Record groups of PDB codes and associated chains
         as collated PDB files.
@@ -1852,6 +1957,28 @@ def write_out_pdb_chain_groups(
         :param models: List of indices of models from which to extract chains,
             defaults to ``[1]``.
         :type models: List[int], optional
+        :param filter_for_interface_contacts: Whether to filter for complex
+            chains that constitute at least one inter-chain interface, as
+            defined by the subsequent parameters ``interface_contact_criterion``,
+            ``hydrogen_bond_criterion``, ``interface_contact_count``,
+            and ``hydrogen_bond_count``.
+        :param filter_for_interface_contacts: bool, optional
+        :param interface_contact_criterion: Distance between two inter-chain
+            residues at which to classify a residue pair as interface contacts.
+            Only referenced if ``filter_for_interface_contacts`` is ``True``.
+        :type interface_contact_criterion: float, optional
+        :param hydrogen_bond_criterion: Distance between two inter-chain
+            atoms at which to classify an atom pair as hydrogen-bonded.
+            Only referenced if ``filter_for_interface_contacts`` is ``True``.
+        :type hydrogen_bond_criterion: float, optional
+        :param interface_contact_count: Number of interface contacts required
+            to select a chain to be exported. Only referenced if
+            ``filter_for_interface_contacts`` is ``True``.
+        :type interface_contact_count: int, optional
+        :param hydrogen_bond_count: Number of hydrogen bonds required
+            to select a chain to be exported. Only referenced if
+            ``filter_for_interface_contacts`` is ``True``.
+        :type hydrogen_bond_count: int, optional
         """
         if len(df) > 0:
             split_dir = Path(out_dir) / split
@@ -1896,14 +2023,29 @@ def write_out_pdb_chain_groups(
                         for chain in entry_chains
                         if chain in pdb_atom_chains
                     ]
-                    chains = (
-                        chains
-                        if max_num_chains_per_pdb_code == -1
-                        else chains[:max_num_chains_per_pdb_code]
-                    )
+                    if not filter_for_interface_contacts:
+                        chains = (
+                            chains
+                            if max_num_chains_per_pdb_code == -1
+                            else chains[:max_num_chains_per_pdb_code]
+                        )
                     pdb_chains = self.select_pdb_by_criterion(
                         pdb, "chain_id", chains, entry_pdb_code
                     )
+                    num_pdb_chains = len(pdb_chains.df[atom_df_name].chain_id.unique().tolist())
+                    if filter_for_interface_contacts and num_pdb_chains > 1:
+                        pdb_chains = self.filter_chains_by_interface_criteria(
+                            pdb=pdb_chains,
+                            interface_contact_criterion=interface_contact_criterion,
+                            hydrogen_bond_criterion=hydrogen_bond_criterion,
+                            interface_contact_count=interface_contact_count,
+                            hydrogen_bond_count=hydrogen_bond_count,
+                        )
+                        pdb_chains = (
+                            pdb_chains
+                            if max_num_chains_per_pdb_code == -1
+                            else pdb_chains[:max_num_chains_per_pdb_code]
+                        )
                     # export selected chains within the same PDB file
                     pdb_chains.to_pdb(str(output_pdb_filepath))
 
@@ -1915,6 +2057,11 @@ def write_df_pdbs(
         splits: Optional[List[str]] = None,
         max_num_chains_per_pdb_code: int = -1,
         models: List[int] = [1],
+        filter_for_interface_contacts: bool = False,
+        interface_contact_criterion: float = 7.0,
+        hydrogen_bond_criterion: float = 3.5,
+        interface_contact_count: int = 16,
+        hydrogen_bond_count: int = 10,
     ):
         """Write the given selection as a collection of PDB files.
 
@@ -1935,6 +2082,28 @@ def write_df_pdbs(
         :param models: List of indices of models from which to extract chains,
             defaults to ``[1]``.
         :type models: List[int], optional
+        :param filter_for_interface_contacts: Whether to filter for complex
+            chains that constitute at least one inter-chain interface, as
+            defined by the subsequent parameters ``interface_contact_criterion``,
+            ``hydrogen_bond_criterion``, ``interface_contact_count``,
+            and ``hydrogen_bond_count``.
+        :param filter_for_interface_contacts: bool, optional
+        :param interface_contact_criterion: Distance between two inter-chain
+            residues at which to classify a residue pair as interface contacts.
+            Only referenced if ``filter_for_interface_contacts`` is ``True``.
+        :type interface_contact_criterion: float, optional
+        :param hydrogen_bond_criterion: Distance between two inter-chain
+            atoms at which to classify an atom pair as hydrogen-bonded.
+            Only referenced if ``filter_for_interface_contacts`` is ``True``.
+        :type hydrogen_bond_criterion: float, optional
+        :param interface_contact_count: Number of interface contacts required
+            to select a chain to be exported. Only referenced if
+            ``filter_for_interface_contacts`` is ``True``.
+        :type interface_contact_count: int, optional
+        :param hydrogen_bond_count: Number of hydrogen bonds required
+            to select a chain to be exported. Only referenced if
+            ``filter_for_interface_contacts`` is ``True``.
+        :type hydrogen_bond_count: int, optional
         """
         out_dir = Path(pdb_dir) / out_dir
         os.makedirs(out_dir, exist_ok=True)
@@ -1950,6 +2119,11 @@ def write_df_pdbs(
                     merge_fn=self.merge_pdb_chain_groups,
                     max_num_chains_per_pdb_code=max_num_chains_per_pdb_code,
                     models=models,
+                    filter_for_interface_contacts=filter_for_interface_contacts,
+                    interface_contact_criterion=interface_contact_criterion,
+                    hydrogen_bond_criterion=hydrogen_bond_criterion,
+                    interface_contact_count=interface_contact_count,
+                    hydrogen_bond_count=hydrogen_bond_count,
                 )
         else:
             self.write_out_pdb_chain_groups(
@@ -1960,6 +2134,11 @@ def write_df_pdbs(
                 merge_fn=self.merge_pdb_chain_groups,
                 max_num_chains_per_pdb_code=max_num_chains_per_pdb_code,
                 models=models,
+                filter_for_interface_contacts=filter_for_interface_contacts,
+                interface_contact_criterion=interface_contact_criterion,
+                hydrogen_bond_criterion=hydrogen_bond_criterion,
+                interface_contact_count=interface_contact_count,
+                hydrogen_bond_count=hydrogen_bond_count,
             )
 
     def export_pdbs(
@@ -1968,6 +2147,11 @@ def export_pdbs(
         splits: Optional[List[str]] = None,
         max_num_chains_per_pdb_code: int = -1,
         models: List[int] = [1],
+        filter_for_interface_contacts: bool = False,
+        interface_contact_criterion: float = 7.0,
+        hydrogen_bond_criterion: float = 3.5,
+        interface_contact_count: int = 16,
+        hydrogen_bond_count: int = 10,
         force: bool = False,
     ):
         """Write the selection as a collection of PDB files.
@@ -1983,6 +2167,28 @@ def export_pdbs(
         :param models: List of indices of models from which to extract chains,
             defaults to ``[1]``.
         :type models: List[int], optional
+        :param filter_for_interface_contacts: Whether to filter for complex
+            chains that constitute at least one inter-chain interface, as
+            defined by the subsequent parameters ``interface_contact_criterion``,
+            ``hydrogen_bond_criterion``, ``interface_contact_count``,
+            and ``hydrogen_bond_count``.
+        :param filter_for_interface_contacts: bool, optional
+        :param interface_contact_criterion: Distance between two inter-chain
+            residues at which to classify a residue pair as interface contacts.
+            Only referenced if ``filter_for_interface_contacts`` is ``True``.
+        :type interface_contact_criterion: float, optional
+        :param hydrogen_bond_criterion: Distance between two inter-chain
+            atoms at which to classify an atom pair as hydrogen-bonded.
+            Only referenced if ``filter_for_interface_contacts`` is ``True``.
+        :type hydrogen_bond_criterion: float, optional
+        :param interface_contact_count: Number of interface contacts required
+            to select a chain to be exported. Only referenced if
+            ``filter_for_interface_contacts`` is ``True``.
+        :type interface_contact_count: int, optional
+        :param hydrogen_bond_count: Number of hydrogen bonds required
+            to select a chain to be exported. Only referenced if
+            ``filter_for_interface_contacts`` is ``True``.
+        :type hydrogen_bond_count: int, optional
         :param force: Whether to raise an error if the download selection
             contains PDBs which are not available in PDB format.
         """
@@ -1999,5 +2205,10 @@ def export_pdbs(
             splits=splits,
             max_num_chains_per_pdb_code=max_num_chains_per_pdb_code,
             models=models,
+            filter_for_interface_contacts=filter_for_interface_contacts,
+            interface_contact_criterion=interface_contact_criterion,
+            hydrogen_bond_criterion=hydrogen_bond_criterion,
+            interface_contact_count=interface_contact_count,
+            hydrogen_bond_count=hydrogen_bond_count,
         )
         log.info("Done writing selection of PDB chains")