From 3d8e17af1ca56bae350ad5f73f504609811573c4 Mon Sep 17 00:00:00 2001
From: Alex Morehead <amorehead@profluent.bio>
Date: Tue, 23 May 2023 18:57:08 +0000
Subject: [PATCH 01/12] Add name parsing function to PDBDataset API

---
 .gitignore                       |  3 +++
 graphein/ml/datasets/pdb_data.py | 29 +++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/.gitignore b/.gitignore
index 941201e0..e141ee95 100644
--- a/.gitignore
+++ b/.gitignore
@@ -173,3 +173,6 @@ afdb_swissprot_v4.*
 
 # Local test directories
 tmp/
+
+# Local Conda/Mamba environment
+graphein-env/
\ No newline at end of file
diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py
index d7ba7dcb..4538b4a9 100644
--- a/graphein/ml/datasets/pdb_data.py
+++ b/graphein/ml/datasets/pdb_data.py
@@ -727,6 +727,35 @@ def experiment_type(
         if update:
             self.df = df
         return df
+    
+    def name(
+        self,
+        substrings: List[str],
+        splits: Optional[List[str]] = None,
+        update: bool = False,
+    ) -> pd.DataFrame:
+        """
+        Select molecules by substrings present in their names:
+        e.g., [``DNA``, ``RNA``]
+
+        :param substrings: Substrings (OR-ed together with ``|`` into one regex) to search for in each molecule's name.
+        :type substrings: List[str]
+        :param splits: Names of splits for which to perform the operation,
+            defaults to ``None``.
+        :type splits: Optional[List[str]], optional
+        :param update: Whether to modify the DataFrame in place, defaults to
+            ``False``.
+        :type update: bool, optional
+
+        :return: DataFrame of selected molecules.
+        :rtype: pd.DataFrame
+        """
+        splits_df = self.get_splits(splits)
+        df = splits_df.loc[splits_df.name.str.contains("|".join(substrings))]
+
+        if update:
+            self.df = df
+        return df
 
     def compare_length(
         self,

From 99e329c33afcf91abbd82dda5a301245624fe020 Mon Sep 17 00:00:00 2001
From: Alex Morehead <amorehead@profluent.bio>
Date: Tue, 23 May 2023 22:37:54 +0000
Subject: [PATCH 02/12] Add new helper functions for working with non-protein
 molecule types with the PDBManager

---
 graphein/ml/datasets/pdb_data.py | 56 ++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py
index 4538b4a9..4111b159 100644
--- a/graphein/ml/datasets/pdb_data.py
+++ b/graphein/ml/datasets/pdb_data.py
@@ -728,6 +728,35 @@ def experiment_type(
             self.df = df
         return df
     
+    def experiment_types(
+        self,
+        types: List[str] = ["diffraction"],
+        splits: Optional[List[str]] = None,
+        update: bool = False,
+    ) -> pd.DataFrame:
+        """
+        Select molecules by experiment types:
+        [``diffraction``, ``NMR``, ``EM``, ``other``]
+
+        :param types: Experiment types of molecules, defaults to ``["diffraction"]``.
+        :type types: List[str], optional
+        :param splits: Names of splits for which to perform the operation,
+            defaults to ``None``.
+        :type splits: Optional[List[str]], optional
+        :param update: Whether to modify the DataFrame in place, defaults to
+            ``False``.
+        :type update: bool, optional
+
+        :return: DataFrame of selected molecules.
+        :rtype: pd.DataFrame
+        """
+        splits_df = self.get_splits(splits)
+        df = splits_df.loc[splits_df.experiment_type.isin(types)]
+
+        if update:
+            self.df = df
+        return df
+    
     def name(
         self,
         substrings: List[str],
@@ -1084,6 +1113,33 @@ def remove_non_standard_alphabet_sequences(
         if update:
             self.df = df
         return df
+    
+    def select_complexes_with_grouped_molecule_types(
+        self, types_to_group: List[str], splits: Optional[List[str]] = None, update: bool = False
+    ):
+        """
+        Select complexes containing at least one instance of each
+        provided molecule type.
+
+        :param types_to_group: Names of molecule types by which to assemble complexes.
+        :type types_to_group: List[str]
+        :param splits: Names of splits for which to perform the operation,
+            defaults to ``None``.
+        :type splits: Optional[List[str]], optional
+        :param update: Whether to update the DataFrame in place, defaults to
+            ``False``.
+        :type update: bool, optional
+
+        :return: DataFrame containing only complexes with at least one instance
+          of each provided molecule type.
+        :rtype: pd.DataFrame
+        """
+        splits_df = self.get_splits(splits)
+        df = splits_df.groupby("pdb").filter(
+            lambda group: all(type_to_group in group["molecule_type"].values for type_to_group in types_to_group)
+        )
+        if update:
+            self.df = df
 
     def split_df_proportionally(
         self,

From 72573e6b464c93228c47e76a8cbdebc8b358a47b Mon Sep 17 00:00:00 2001
From: Alex Morehead <amorehead@profluent.bio>
Date: Wed, 24 May 2023 19:48:26 +0000
Subject: [PATCH 03/12] Update variable names and default values

---
 graphein/ml/datasets/pdb_data.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py
index 4111b159..99710c21 100644
--- a/graphein/ml/datasets/pdb_data.py
+++ b/graphein/ml/datasets/pdb_data.py
@@ -1115,14 +1115,14 @@ def remove_non_standard_alphabet_sequences(
         return df
     
     def select_complexes_with_grouped_molecule_types(
-        self, types_to_group: List[str], splits: Optional[List[str]] = None, update: bool = False
+        self, molecule_types_to_group: List[str], splits: Optional[List[str]] = None, update: bool = False
     ):
         """
         Select complexes containing at least one instance of each
         provided molecule type.
 
-        :param types_to_group: Names of molecule types by which to assemble complexes.
-        :type types_to_group: List[str]
+        :param molecule_types_to_group: Names of molecule types by which to assemble complexes.
+        :type molecule_types_to_group: List[str]
         :param splits: Names of splits for which to perform the operation,
             defaults to ``None``.
         :type splits: Optional[List[str]], optional
@@ -1136,7 +1136,7 @@ def select_complexes_with_grouped_molecule_types(
         """
         splits_df = self.get_splits(splits)
         df = splits_df.groupby("pdb").filter(
-            lambda group: all(type_to_group in group["molecule_type"].values for type_to_group in types_to_group)
+            lambda group: all(molecule_type_to_group in group["molecule_type"].values for molecule_type_to_group in molecule_types_to_group)
         )
         if update:
             self.df = df
@@ -1812,7 +1812,7 @@ def write_out_pdb_chain_groups(
         split: str,
         merge_fn: Callable,
         atom_df_name: str = "ATOM",
-        max_num_chains_per_pdb_code: int = 1,
+        max_num_chains_per_pdb_code: int = -1,
         models: List[int] = [1],
     ):
         """Record groups of PDB codes and associated chains
@@ -1833,7 +1833,7 @@ def write_out_pdb_chain_groups(
             ATOM entries within a PandasPdb object.
         :type atom_df_name: str, defaults to ``ATOM``
         :param max_num_chains_per_pdb_code: Maximum number of chains
-            to collate into a matching PDB file.
+            to collate into a matching PDB file, defaults to ``-1``.
         :type max_num_chains_per_pdb_code: int, optional
         :param models: List of indices of models from which to extract chains,
             defaults to ``[1]``.
@@ -1899,7 +1899,7 @@ def write_df_pdbs(
         df: pd.DataFrame,
         out_dir: str = "collated_pdb",
         splits: Optional[List[str]] = None,
-        max_num_chains_per_pdb_code: int = 1,
+        max_num_chains_per_pdb_code: int = -1,
         models: List[int] = [1],
     ):
         """Write the given selection as a collection of PDB files.
@@ -1916,7 +1916,7 @@ def write_df_pdbs(
             defaults to ``None``.
         :type splits: Optional[List[str]], optional
         :param max_num_chains_per_pdb_code: Maximum number of chains
-            to collate into a matching PDB file.
+            to collate into a matching PDB file, defaults to ``-1``.
         :type max_num_chains_per_pdb_code: int, optional
         :param models: List of indices of models from which to extract chains,
             defaults to ``[1]``.
@@ -1952,7 +1952,7 @@ def export_pdbs(
         self,
         pdb_dir: str,
         splits: Optional[List[str]] = None,
-        max_num_chains_per_pdb_code: int = 1,
+        max_num_chains_per_pdb_code: int = -1,
         models: List[int] = [1],
         force: bool = False,
     ):
@@ -1964,7 +1964,7 @@ def export_pdbs(
             defaults to ``None``.
         :type splits: Optional[List[str]], optional
         :param max_num_chains_per_pdb_code: Maximum number of chains
-            to collate into a matching PDB file.
+            to collate into a matching PDB file, defaults to ``-1``.
         :type max_num_chains_per_pdb_code: int, optional
         :param models: List of indices of models from which to extract chains,
             defaults to ``[1]``.

From a0353843687d41781ed1498dc70c3833a64ba039 Mon Sep 17 00:00:00 2001
From: Alex Morehead <amorehead@profluent.bio>
Date: Wed, 24 May 2023 20:44:32 +0000
Subject: [PATCH 04/12] Update syntax

---
 graphein/ml/datasets/pdb_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py
index 99710c21..394122a7 100644
--- a/graphein/ml/datasets/pdb_data.py
+++ b/graphein/ml/datasets/pdb_data.py
@@ -1136,7 +1136,7 @@ def select_complexes_with_grouped_molecule_types(
         """
         splits_df = self.get_splits(splits)
         df = splits_df.groupby("pdb").filter(
-            lambda group: all(molecule_type_to_group in group["molecule_type"].values for molecule_type_to_group in molecule_types_to_group)
+            lambda group: all([molecule_type_to_group in group["molecule_type"].values for molecule_type_to_group in molecule_types_to_group])
         )
         if update:
             self.df = df

From c07bea191d3e81304688d20149ab546fde842ffe Mon Sep 17 00:00:00 2001
From: Alex Morehead <amorehead@profluent.bio>
Date: Wed, 24 May 2023 22:38:45 +0000
Subject: [PATCH 05/12] Restore .gitignore

---
 .gitignore | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index e141ee95..65941213 100644
--- a/.gitignore
+++ b/.gitignore
@@ -172,7 +172,4 @@ afdb_swissprot_v4
 afdb_swissprot_v4.*
 
 # Local test directories
-tmp/
-
-# Local Conda/Mamba environment
-graphein-env/
\ No newline at end of file
+tmp/
\ No newline at end of file

From 361b9405a0c134e81be5888be55561b010e9c5b0 Mon Sep 17 00:00:00 2001
From: Alex Morehead <amorehead@profluent.bio>
Date: Wed, 24 May 2023 22:38:59 +0000
Subject: [PATCH 06/12] Restore .gitignore

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 65941213..941201e0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -172,4 +172,4 @@ afdb_swissprot_v4
 afdb_swissprot_v4.*
 
 # Local test directories
-tmp/
\ No newline at end of file
+tmp/

From efc2d3d2d194902d79034ae5771eaa62546a10ec Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 24 May 2023 22:40:00 +0000
Subject: [PATCH 07/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 graphein/ml/datasets/pdb_data.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py
index 394122a7..bc2da68a 100644
--- a/graphein/ml/datasets/pdb_data.py
+++ b/graphein/ml/datasets/pdb_data.py
@@ -727,7 +727,7 @@ def experiment_type(
         if update:
             self.df = df
         return df
-    
+
     def experiment_types(
         self,
         types: List[str] = ["diffraction"],
@@ -756,7 +756,7 @@ def experiment_types(
         if update:
             self.df = df
         return df
-    
+
     def name(
         self,
         substrings: List[str],
@@ -1113,9 +1113,12 @@ def remove_non_standard_alphabet_sequences(
         if update:
             self.df = df
         return df
-    
+
     def select_complexes_with_grouped_molecule_types(
-        self, molecule_types_to_group: List[str], splits: Optional[List[str]] = None, update: bool = False
+        self,
+        molecule_types_to_group: List[str],
+        splits: Optional[List[str]] = None,
+        update: bool = False,
     ):
         """
         Select complexes containing at least one instance of each
@@ -1136,7 +1139,12 @@ def select_complexes_with_grouped_molecule_types(
         """
         splits_df = self.get_splits(splits)
         df = splits_df.groupby("pdb").filter(
-            lambda group: all([molecule_type_to_group in group["molecule_type"].values for molecule_type_to_group in molecule_types_to_group])
+            lambda group: all(
+                [
+                    molecule_type_to_group in group["molecule_type"].values
+                    for molecule_type_to_group in molecule_types_to_group
+                ]
+            )
         )
         if update:
             self.df = df

From a6c4a6e4f84d537282c28fedb9c60ddd5ba4ce0c Mon Sep 17 00:00:00 2001
From: Alex Morehead <amorehead@profluent.bio>
Date: Wed, 24 May 2023 23:38:49 +0000
Subject: [PATCH 08/12] Handle edge case

---
 graphein/ml/datasets/pdb_data.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py
index 394122a7..5207f723 100644
--- a/graphein/ml/datasets/pdb_data.py
+++ b/graphein/ml/datasets/pdb_data.py
@@ -1778,7 +1778,7 @@ def merge_pdb_chain_groups(self, group: DataFrameGroupBy) -> pd.DataFrame:
 
     def select_pdb_by_criterion(
         self, pdb: PandasPdb, field: str, field_values: List[Any]
-    ) -> PandasPdb:
+    ) -> Optional[PandasPdb]:
         """Filter a PDB using a field selection.
 
         :param pdb: The PDB object to filter by a field.
@@ -1789,18 +1789,18 @@ def select_pdb_by_criterion(
             the PDB.
         :type field_values: List[Any]
 
-        :return: The filtered PDB object.
-        :rtype: PandasPdb
+        :return: The filtered PDB object or instead `None` to signify
+        that no atoms within the PDB object were found after filtering.
+        :rtype: Optional[PandasPdb], optional
         """
         for key in pdb.df:
             if field in pdb.df[key]:
                 filtered_pdb = pdb.df[key][
                     pdb.df[key][field].isin(field_values)
                 ]
-                if "ATOM" in key:
-                    assert (
-                        len(filtered_pdb) > 0
-                    ), "Filtered DataFrame must contain atoms."
+                if "ATOM" in key and len(filtered_pdb) == 0:
+                    log.warning("Filtered DataFrame does not contain any atoms. Skipping DataFrame...")
+                    return None
                 pdb.df[key] = filtered_pdb
         return pdb
 
@@ -1891,7 +1891,8 @@ def write_out_pdb_chain_groups(
                         pdb, "chain_id", chains
                     )
                     # export selected chains within the same PDB file
-                    pdb_chains.to_pdb(str(output_pdb_filepath))
+                    if pdb_chains:
+                        pdb_chains.to_pdb(str(output_pdb_filepath))
 
     def write_df_pdbs(
         self,

From 768f15fb468d40d517860f0af2c7ef0fb5acc1b5 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 24 May 2023 23:39:25 +0000
Subject: [PATCH 09/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 graphein/ml/datasets/pdb_data.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py
index 93372933..052fc273 100644
--- a/graphein/ml/datasets/pdb_data.py
+++ b/graphein/ml/datasets/pdb_data.py
@@ -1807,7 +1807,9 @@ def select_pdb_by_criterion(
                     pdb.df[key][field].isin(field_values)
                 ]
                 if "ATOM" in key and len(filtered_pdb) == 0:
-                    log.warning("Filtered DataFrame does not contain any atoms. Skipping DataFrame...")
+                    log.warning(
+                        "Filtered DataFrame does not contain any atoms. Skipping DataFrame..."
+                    )
                     return None
                 pdb.df[key] = filtered_pdb
         return pdb

From 6b1879ed47205a0ff8aad92b85e328f4527fb3da Mon Sep 17 00:00:00 2001
From: Alex Morehead <amorehead@profluent.bio>
Date: Thu, 25 May 2023 01:19:41 +0000
Subject: [PATCH 10/12] Handle edge cases in export function

---
 graphein/ml/datasets/pdb_data.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py
index 052fc273..07bf2ef4 100644
--- a/graphein/ml/datasets/pdb_data.py
+++ b/graphein/ml/datasets/pdb_data.py
@@ -1785,8 +1785,8 @@ def merge_pdb_chain_groups(self, group: DataFrameGroupBy) -> pd.DataFrame:
         )
 
     def select_pdb_by_criterion(
-        self, pdb: PandasPdb, field: str, field_values: List[Any]
-    ) -> Optional[PandasPdb]:
+        self, pdb: PandasPdb, field: str, field_values: List[Any], pdb_code: str
+    ) -> PandasPdb:
         """Filter a PDB using a field selection.
 
         :param pdb: The PDB object to filter by a field.
@@ -1796,10 +1796,11 @@ def select_pdb_by_criterion(
         :param field_values: The field values by which to filter
             the PDB.
         :type field_values: List[Any]
+        :param pdb_code: The PDB code associated with a given PDB object.
+        :type pdb_code: str
 
-        :return: The filtered PDB object or instead `None` to signify
-        that no atoms within the PDB object were found after filtering.
-        :rtype: Optional[PandasPdb], optional
+        :return: The filtered PDB object.
+        :rtype: PandasPdb
         """
         for key in pdb.df:
             if field in pdb.df[key]:
@@ -1808,9 +1809,8 @@ def select_pdb_by_criterion(
                 ]
                 if "ATOM" in key and len(filtered_pdb) == 0:
                     log.warning(
-                        "Filtered DataFrame does not contain any atoms. Skipping DataFrame..."
+                        f"DataFrame for PDB {pdb_code} does not contain any standard atoms after filtering"
                     )
-                    return None
                 pdb.df[key] = filtered_pdb
         return pdb
 
@@ -1898,11 +1898,10 @@ def write_out_pdb_chain_groups(
                         else chains[:max_num_chains_per_pdb_code]
                     )
                     pdb_chains = self.select_pdb_by_criterion(
-                        pdb, "chain_id", chains
+                        pdb, "chain_id", chains, entry_pdb_code
                     )
                     # export selected chains within the same PDB file
-                    if pdb_chains:
-                        pdb_chains.to_pdb(str(output_pdb_filepath))
+                    pdb_chains.to_pdb(str(output_pdb_filepath))
 
     def write_df_pdbs(
         self,

From c5d7d6ea1b01fe6134f79a64c800ece2af04cc5a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 25 May 2023 01:20:04 +0000
Subject: [PATCH 11/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 graphein/ml/datasets/pdb_data.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py
index 07bf2ef4..9082746a 100644
--- a/graphein/ml/datasets/pdb_data.py
+++ b/graphein/ml/datasets/pdb_data.py
@@ -1785,7 +1785,11 @@ def merge_pdb_chain_groups(self, group: DataFrameGroupBy) -> pd.DataFrame:
         )
 
     def select_pdb_by_criterion(
-        self, pdb: PandasPdb, field: str, field_values: List[Any], pdb_code: str
+        self,
+        pdb: PandasPdb,
+        field: str,
+        field_values: List[Any],
+        pdb_code: str,
     ) -> PandasPdb:
         """Filter a PDB using a field selection.
 

From e527f7414aecdfe4c0e8300215cb82595a067943 Mon Sep 17 00:00:00 2001
From: Arian Jamasb <arjamasb@gmail.com>
Date: Thu, 25 May 2023 19:45:05 +0100
Subject: [PATCH 12/12] update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 797a32e4..71ed5a4b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,7 @@
 * Insertions retained by default in the `graphein.protein.tensor` module. I.e. `insertions=True` is now the default behaviour.[#307](https://github.com/a-r-j/graphein/pull/307)
 * Adds transform composition to FoldComp Dataset [#312](https://github.com/a-r-j/graphein/pull/312)
 * Improve FoldComp dataloading performance and include B factors (pLDDT) in output. [#313](https://github.com/a-r-j/graphein/pull/313) [#315](https://github.com/a-r-j/graphein/pull/315)
+* Add new helper functions to PDBManager [#322](https://github.com/a-r-j/graphein/pull/322) (@amorehead)
 
 ### 1.7.0 - UNRELEASED