From 3d8e17af1ca56bae350ad5f73f504609811573c4 Mon Sep 17 00:00:00 2001 From: Alex Morehead Date: Tue, 23 May 2023 18:57:08 +0000 Subject: [PATCH 01/12] Add name parsing function to PDBDataset API --- .gitignore | 3 +++ graphein/ml/datasets/pdb_data.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/.gitignore b/.gitignore index 941201e0..e141ee95 100644 --- a/.gitignore +++ b/.gitignore @@ -173,3 +173,6 @@ afdb_swissprot_v4.* # Local test directories tmp/ + +# Local Conda/Mamba environment +graphein-env/ \ No newline at end of file diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py index d7ba7dcb..4538b4a9 100644 --- a/graphein/ml/datasets/pdb_data.py +++ b/graphein/ml/datasets/pdb_data.py @@ -727,6 +727,35 @@ def experiment_type( if update: self.df = df return df + + def name( + self, + substrings: List[str], + splits: Optional[List[str]] = None, + update: bool = False, + ) -> pd.DataFrame: + """ + Select molecules by substrings present in their names: + e.g., [``DNA``, ``RNA``] + + :param substrings: Substrings to be found within the name field of each molecule. + :type substrings: List[str] + :param splits: Names of splits for which to perform the operation, + defaults to ``None``. + :type splits: Optional[List[str]], optional + :param update: Whether to modify the DataFrame in place, defaults to + ``False``. + :type update: bool, optional + + :return: DataFrame of selected molecules. 
+ :rtype: pd.DataFrame + """ + splits_df = self.get_splits(splits) + df = splits_df.loc[splits_df.name.str.contains("|".join(substrings))] + + if update: + self.df = df + return df def compare_length( self, From 99e329c33afcf91abbd82dda5a301245624fe020 Mon Sep 17 00:00:00 2001 From: Alex Morehead Date: Tue, 23 May 2023 22:37:54 +0000 Subject: [PATCH 02/12] Add new helper functions for working with non-protein molecule types with the PDBManager --- graphein/ml/datasets/pdb_data.py | 56 ++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py index 4538b4a9..4111b159 100644 --- a/graphein/ml/datasets/pdb_data.py +++ b/graphein/ml/datasets/pdb_data.py @@ -728,6 +728,35 @@ def experiment_type( self.df = df return df + def experiment_types( + self, + types: List[str] = ["diffraction"], + splits: Optional[List[str]] = None, + update: bool = False, + ) -> pd.DataFrame: + """ + Select molecules by experiment types: + [``diffraction``, ``NMR``, ``EM``, ``other``] + + :param types: Experiment types of molecules, defaults to ``["diffraction"]``. + :type types: List[str], optional + :param splits: Names of splits for which to perform the operation, + defaults to ``None``. + :type splits: Optional[List[str]], optional + :param update: Whether to modify the DataFrame in place, defaults to + ``False``. + :type update: bool, optional + + :return: DataFrame of selected molecules. 
+ :rtype: pd.DataFrame + """ + splits_df = self.get_splits(splits) + df = splits_df.loc[splits_df.experiment_type.isin(types)] + + if update: + self.df = df + return df + def name( self, substrings: List[str], @@ -1084,6 +1113,33 @@ def remove_non_standard_alphabet_sequences( if update: self.df = df return df + + def select_complexes_with_grouped_molecule_types( + self, types_to_group: List[str], splits: Optional[List[str]] = None, update: bool = False + ): + """ + Select complexes containing at least one instance of each + provided molecule type. + + :param types_to_group: Names of molecule types by which to assemble complexes. + :type types_to_group: List[str] + :param splits: Names of splits for which to perform the operation, + defaults to ``None``. + :type splits: Optional[List[str]], optional + :param update: Whether to update the DataFrame in place, defaults to + ``False``. + :type update: bool, optional + + :return: DataFrame containing only complexes with at least one instance + of each provided molecule type. 
+ :rtype: pd.DataFrame + """ + splits_df = self.get_splits(splits) + df = splits_df.groupby("pdb").filter( + lambda group: all(type_to_group in group["molecule_type"].values for type_to_group in types_to_group) + ) + if update: + self.df = df def split_df_proportionally( self, From 72573e6b464c93228c47e76a8cbdebc8b358a47b Mon Sep 17 00:00:00 2001 From: Alex Morehead Date: Wed, 24 May 2023 19:48:26 +0000 Subject: [PATCH 03/12] Update variable names and default values --- graphein/ml/datasets/pdb_data.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py index 4111b159..99710c21 100644 --- a/graphein/ml/datasets/pdb_data.py +++ b/graphein/ml/datasets/pdb_data.py @@ -1115,14 +1115,14 @@ def remove_non_standard_alphabet_sequences( return df def select_complexes_with_grouped_molecule_types( - self, types_to_group: List[str], splits: Optional[List[str]] = None, update: bool = False + self, molecule_types_to_group: List[str], splits: Optional[List[str]] = None, update: bool = False ): """ Select complexes containing at least one instance of each provided molecule type. - :param types_to_group: Names of molecule types by which to assemble complexes. - :type types_to_group: List[str] + :param molecule_types_to_group: Names of molecule types by which to assemble complexes. + :type molecule_types_to_group: List[str] :param splits: Names of splits for which to perform the operation, defaults to ``None``. 
:type splits: Optional[List[str]], optional @@ -1136,7 +1136,7 @@ def select_complexes_with_grouped_molecule_types( """ splits_df = self.get_splits(splits) df = splits_df.groupby("pdb").filter( - lambda group: all(type_to_group in group["molecule_type"].values for type_to_group in types_to_group) + lambda group: all(molecule_type_to_group in group["molecule_type"].values for molecule_type_to_group in molecule_types_to_group) ) if update: self.df = df @@ -1812,7 +1812,7 @@ def write_out_pdb_chain_groups( split: str, merge_fn: Callable, atom_df_name: str = "ATOM", - max_num_chains_per_pdb_code: int = 1, + max_num_chains_per_pdb_code: int = -1, models: List[int] = [1], ): """Record groups of PDB codes and associated chains @@ -1833,7 +1833,7 @@ def write_out_pdb_chain_groups( ATOM entries within a PandasPdb object. :type atom_df_name: str, defaults to ``ATOM`` :param max_num_chains_per_pdb_code: Maximum number of chains - to collate into a matching PDB file. + to collate into a matching PDB file, defaults to ``-1``. :type max_num_chains_per_pdb_code: int, optional :param models: List of indices of models from which to extract chains, defaults to ``[1]``. @@ -1899,7 +1899,7 @@ def write_df_pdbs( df: pd.DataFrame, out_dir: str = "collated_pdb", splits: Optional[List[str]] = None, - max_num_chains_per_pdb_code: int = 1, + max_num_chains_per_pdb_code: int = -1, models: List[int] = [1], ): """Write the given selection as a collection of PDB files. @@ -1916,7 +1916,7 @@ def write_df_pdbs( defaults to ``None``. :type splits: Optional[List[str]], optional :param max_num_chains_per_pdb_code: Maximum number of chains - to collate into a matching PDB file. + to collate into a matching PDB file, defaults to ``-1``. :type max_num_chains_per_pdb_code: int, optional :param models: List of indices of models from which to extract chains, defaults to ``[1]``. 
@@ -1952,7 +1952,7 @@ def export_pdbs( self, pdb_dir: str, splits: Optional[List[str]] = None, - max_num_chains_per_pdb_code: int = 1, + max_num_chains_per_pdb_code: int = -1, models: List[int] = [1], force: bool = False, ): @@ -1964,7 +1964,7 @@ def export_pdbs( defaults to ``None``. :type splits: Optional[List[str]], optional :param max_num_chains_per_pdb_code: Maximum number of chains - to collate into a matching PDB file. + to collate into a matching PDB file, defaults to ``-1``. :type max_num_chains_per_pdb_code: int, optional :param models: List of indices of models from which to extract chains, defaults to ``[1]``. From a0353843687d41781ed1498dc70c3833a64ba039 Mon Sep 17 00:00:00 2001 From: Alex Morehead Date: Wed, 24 May 2023 20:44:32 +0000 Subject: [PATCH 04/12] Update syntax --- graphein/ml/datasets/pdb_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py index 99710c21..394122a7 100644 --- a/graphein/ml/datasets/pdb_data.py +++ b/graphein/ml/datasets/pdb_data.py @@ -1136,7 +1136,7 @@ def select_complexes_with_grouped_molecule_types( """ splits_df = self.get_splits(splits) df = splits_df.groupby("pdb").filter( - lambda group: all(molecule_type_to_group in group["molecule_type"].values for molecule_type_to_group in molecule_types_to_group) + lambda group: all([molecule_type_to_group in group["molecule_type"].values for molecule_type_to_group in molecule_types_to_group]) ) if update: self.df = df From c07bea191d3e81304688d20149ab546fde842ffe Mon Sep 17 00:00:00 2001 From: Alex Morehead Date: Wed, 24 May 2023 22:38:45 +0000 Subject: [PATCH 05/12] Restore .gitignore --- .gitignore | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index e141ee95..65941213 100644 --- a/.gitignore +++ b/.gitignore @@ -172,7 +172,4 @@ afdb_swissprot_v4 afdb_swissprot_v4.* # Local test directories -tmp/ - -# Local Conda/Mamba environment 
-graphein-env/ \ No newline at end of file +tmp/ \ No newline at end of file From 361b9405a0c134e81be5888be55561b010e9c5b0 Mon Sep 17 00:00:00 2001 From: Alex Morehead Date: Wed, 24 May 2023 22:38:59 +0000 Subject: [PATCH 06/12] Restore .gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 65941213..941201e0 100644 --- a/.gitignore +++ b/.gitignore @@ -172,4 +172,4 @@ afdb_swissprot_v4 afdb_swissprot_v4.* # Local test directories -tmp/ \ No newline at end of file +tmp/ From efc2d3d2d194902d79034ae5771eaa62546a10ec Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 May 2023 22:40:00 +0000 Subject: [PATCH 07/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- graphein/ml/datasets/pdb_data.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py index 394122a7..bc2da68a 100644 --- a/graphein/ml/datasets/pdb_data.py +++ b/graphein/ml/datasets/pdb_data.py @@ -727,7 +727,7 @@ def experiment_type( if update: self.df = df return df - + def experiment_types( self, types: List[str] = ["diffraction"], @@ -756,7 +756,7 @@ def experiment_types( if update: self.df = df return df - + def name( self, substrings: List[str], @@ -1113,9 +1113,12 @@ def remove_non_standard_alphabet_sequences( if update: self.df = df return df - + def select_complexes_with_grouped_molecule_types( - self, molecule_types_to_group: List[str], splits: Optional[List[str]] = None, update: bool = False + self, + molecule_types_to_group: List[str], + splits: Optional[List[str]] = None, + update: bool = False, ): """ Select complexes containing at least one instance of each @@ -1136,7 +1139,12 @@ def select_complexes_with_grouped_molecule_types( """ splits_df = self.get_splits(splits) df = splits_df.groupby("pdb").filter( 
- lambda group: all([molecule_type_to_group in group["molecule_type"].values for molecule_type_to_group in molecule_types_to_group]) + lambda group: all( + [ + molecule_type_to_group in group["molecule_type"].values + for molecule_type_to_group in molecule_types_to_group + ] + ) ) if update: self.df = df From a6c4a6e4f84d537282c28fedb9c60ddd5ba4ce0c Mon Sep 17 00:00:00 2001 From: Alex Morehead Date: Wed, 24 May 2023 23:38:49 +0000 Subject: [PATCH 08/12] Handle edge case --- graphein/ml/datasets/pdb_data.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py index 394122a7..5207f723 100644 --- a/graphein/ml/datasets/pdb_data.py +++ b/graphein/ml/datasets/pdb_data.py @@ -1778,7 +1778,7 @@ def merge_pdb_chain_groups(self, group: DataFrameGroupBy) -> pd.DataFrame: def select_pdb_by_criterion( self, pdb: PandasPdb, field: str, field_values: List[Any] - ) -> PandasPdb: + ) -> Optional[PandasPdb]: """Filter a PDB using a field selection. :param pdb: The PDB object to filter by a field. @@ -1789,18 +1789,18 @@ def select_pdb_by_criterion( the PDB. :type field_values: List[Any] - :return: The filtered PDB object. - :rtype: PandasPdb + :return: The filtered PDB object or instead `None` to signify + that no atoms within the PDB object were found after filtering. + :rtype: Optional[PandasPdb], optional """ for key in pdb.df: if field in pdb.df[key]: filtered_pdb = pdb.df[key][ pdb.df[key][field].isin(field_values) ] - if "ATOM" in key: - assert ( - len(filtered_pdb) > 0 - ), "Filtered DataFrame must contain atoms." + if "ATOM" in key and len(filtered_pdb) == 0: + log.warning("Filtered DataFrame does not contain any atoms. 
Skipping DataFrame...") + return None pdb.df[key] = filtered_pdb return pdb @@ -1891,7 +1891,8 @@ def write_out_pdb_chain_groups( pdb, "chain_id", chains ) # export selected chains within the same PDB file - pdb_chains.to_pdb(str(output_pdb_filepath)) + if pdb_chains: + pdb_chains.to_pdb(str(output_pdb_filepath)) def write_df_pdbs( self, From 768f15fb468d40d517860f0af2c7ef0fb5acc1b5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 May 2023 23:39:25 +0000 Subject: [PATCH 09/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- graphein/ml/datasets/pdb_data.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py index 93372933..052fc273 100644 --- a/graphein/ml/datasets/pdb_data.py +++ b/graphein/ml/datasets/pdb_data.py @@ -1807,7 +1807,9 @@ def select_pdb_by_criterion( pdb.df[key][field].isin(field_values) ] if "ATOM" in key and len(filtered_pdb) == 0: - log.warning("Filtered DataFrame does not contain any atoms. Skipping DataFrame...") + log.warning( + "Filtered DataFrame does not contain any atoms. Skipping DataFrame..." 
+ ) return None pdb.df[key] = filtered_pdb return pdb From 6b1879ed47205a0ff8aad92b85e328f4527fb3da Mon Sep 17 00:00:00 2001 From: Alex Morehead Date: Thu, 25 May 2023 01:19:41 +0000 Subject: [PATCH 10/12] Handle edge cases in export function --- graphein/ml/datasets/pdb_data.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py index 052fc273..07bf2ef4 100644 --- a/graphein/ml/datasets/pdb_data.py +++ b/graphein/ml/datasets/pdb_data.py @@ -1785,8 +1785,8 @@ def merge_pdb_chain_groups(self, group: DataFrameGroupBy) -> pd.DataFrame: ) def select_pdb_by_criterion( - self, pdb: PandasPdb, field: str, field_values: List[Any] - ) -> Optional[PandasPdb]: + self, pdb: PandasPdb, field: str, field_values: List[Any], pdb_code: str + ) -> PandasPdb: """Filter a PDB using a field selection. :param pdb: The PDB object to filter by a field. @@ -1796,10 +1796,11 @@ def select_pdb_by_criterion( :param field_values: The field values by which to filter the PDB. :type field_values: List[Any] + :param pdb_code: The PDB code associated with a given PDB object. + :type pdb_code: str - :return: The filtered PDB object or instead `None` to signify - that no atoms within the PDB object were found after filtering. - :rtype: Optional[PandasPdb], optional + :return: The filtered PDB object. + :rtype: PandasPdb """ for key in pdb.df: if field in pdb.df[key]: @@ -1808,9 +1809,8 @@ def select_pdb_by_criterion( ] if "ATOM" in key and len(filtered_pdb) == 0: log.warning( - "Filtered DataFrame does not contain any atoms. Skipping DataFrame..." 
+ f"DataFrame for PDB {pdb_code} does not contain any standard atoms after filtering" ) - return None pdb.df[key] = filtered_pdb return pdb @@ -1898,11 +1898,10 @@ def write_out_pdb_chain_groups( else chains[:max_num_chains_per_pdb_code] ) pdb_chains = self.select_pdb_by_criterion( - pdb, "chain_id", chains + pdb, "chain_id", chains, entry_pdb_code ) # export selected chains within the same PDB file - if pdb_chains: - pdb_chains.to_pdb(str(output_pdb_filepath)) + pdb_chains.to_pdb(str(output_pdb_filepath)) def write_df_pdbs( self, From c5d7d6ea1b01fe6134f79a64c800ece2af04cc5a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 May 2023 01:20:04 +0000 Subject: [PATCH 11/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- graphein/ml/datasets/pdb_data.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py index 07bf2ef4..9082746a 100644 --- a/graphein/ml/datasets/pdb_data.py +++ b/graphein/ml/datasets/pdb_data.py @@ -1785,7 +1785,11 @@ def merge_pdb_chain_groups(self, group: DataFrameGroupBy) -> pd.DataFrame: ) def select_pdb_by_criterion( - self, pdb: PandasPdb, field: str, field_values: List[Any], pdb_code: str + self, + pdb: PandasPdb, + field: str, + field_values: List[Any], + pdb_code: str, ) -> PandasPdb: """Filter a PDB using a field selection. From e527f7414aecdfe4c0e8300215cb82595a067943 Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Thu, 25 May 2023 19:45:05 +0100 Subject: [PATCH 12/12] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 797a32e4..71ed5a4b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ * Insertions retained by default in the `graphein.protein.tensor` module. I.e. 
`insertions=True` is now the default behaviour.[#307](https://github.com/a-r-j/graphein/pull/307) * Adds transform composition to FoldComp Dataset [#312](https://github.com/a-r-j/graphein/pull/312) * Improve FoldComp dataloading performance and include B factors (pLDDT) in output. [#313](https://github.com/a-r-j/graphein/pull/313) [#315](https://github.com/a-r-j/graphein/pull/315) +* Add new helper functions to PDBManager [#322](https://github.com/a-r-j/graphein/pull/322) (@amorehead) ### 1.7.0 - UNRELEASED