diff --git a/CHANGELOG.md b/CHANGELOG.md index 9de21328..35dad4f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ * [Logging] - [#242](https://github.com/a-r-j/graphein/pull/242) Adds control of protein graph construction logging. Resolves [#238](https://github.com/a-r-j/graphein/issues/238) #### Protein +* [Bugfix] - [#222](https://github.com/a-r-j/graphein/pull/222) Fixes entrypoint for user-defined `df_processing_funcs` ([#216](https://github.com/a-r-j/graphein/issues/216)) * [Feature] = [#263](https://github.com/a-r-j/graphein/pull/263) Adds control of Alt Loc selection strategy. N.b. Default `ProteinGraphConfig` changed to include insertions by default (`insertions=True`) and `alt_locs="max_occupancy"`. * [Feature] - [#264](https://github.com/a-r-j/graphein/pull/264) Adds entrypoint to `graphein.protein.graphs.construct_graph` for passing in a BioPandas dataframe directly. * [Feature] - [#229](https://github.com/a-r-j/graphein/pull/220) Adds support for filtering KNN edges based on self-loops and chain membership. Contribution by @anton-bushuiev. diff --git a/graphein/protein/config.py b/graphein/protein/config.py index 23925db0..3c9c7917 100644 --- a/graphein/protein/config.py +++ b/graphein/protein/config.py @@ -27,17 +27,23 @@ class DSSPConfig(BaseModel): class GetContactsConfig(BaseModel): """ Config object for parameters relating to running ``GetContacts``. - ``GetContacts`` is an optional dependency from which intramolecular interactions can be computed and used as edges in the graph. + ``GetContacts`` is an optional dependency from which intramolecular + interactions can be computed and used as edges in the graph. 
- More information about ``GetContacts`` can be found at https://getcontacts.github.io/ + More information about ``GetContacts`` can be found at + https://getcontacts.github.io/ :param get_contacts_path: Path to ``GetContacts`` installation :type get_contacts_path: pathlib.Path :param contacts_dir: Path to store output of ``GetContacts`` :type contacts_dir: pathlib.Path - :param pdb_dir: Path to PDB files to be used to compute intramolecular interactions. + :param pdb_dir: Path to PDB files to be used to compute intramolecular + interactions. :type pdb_dir: pathlib.Path - :param granularity: Specifies the node types of the graph, defaults to ``"CA"`` for alpha-carbons as nodes. Other options are ``"CB"`` (beta-carbon), ``"atom"`` for all-atom graphs, and ``"centroid"`` for nodes positioned as residue centroids. + :param granularity: Specifies the node types of the graph, defaults to + ``"CA"`` for alpha-carbons as nodes. Other options are ``"CB"`` + (beta-carbon), ``"atom"`` for all-atom graphs, and ``"centroid"`` + for nodes positioned as residue centroids. :type granularity: str """ @@ -110,19 +116,26 @@ class ProteinGraphConfig(BaseModel): """ Config Object for Protein Structure Graph Construction. - If you encounter a problematic structure, perusing https://www.umass.edu/microbio/chime/pe_beta/pe/protexpl/badpdbs.htm may provide some additional insight. - PDBs are notoriously troublesome and this is an excellent overview. + If you encounter a problematic structure, perusing + https://www.umass.edu/microbio/chime/pe_beta/pe/protexpl/badpdbs.htm + may provide some additional insight. PDBs are notoriously troublesome and + this is an excellent overview. - :param granularity: Controls the granularity of the graph construction. ``"atom"`` builds an atomic-scale graph where - nodes are constituent atoms. Residue-level graphs can be build by specifying which constituent atom should - represent node positions (see :const:`~graphein.protein.config.GraphAtoms`). 
Additionally, ``"centroids"`` can be specified to - compute the centre of gravity for a given atom (Specified in :const:`~graphein.protein.config.GranularityOpts`). - Defaults to ``"CA"`` (alpha-Carbon). - :type granularity: str (Union[graphein.protein.config.GraphAtoms, graphein.protein.config.GranularityOpts]) - :param keep_hets: Controls whether or not heteroatoms are removed from the PDB file. These are typically modified - residues, bound ligands, crystallographic adjuvants, ions or water molecules. - - For more information, see: https://proteopedia.org/wiki/index.php/Hetero_atoms + :param granularity: Controls the granularity of the graph construction. + ``"atom"`` builds an atomic-scale graph where nodes are constituent + atoms. Residue-level graphs can be built by specifying which constituent + atom should represent node positions + (see :const:`~graphein.protein.config.GraphAtoms`). Additionally, + ``"centroids"`` can be specified to + compute the centre of gravity for a given atom (Specified in + :const:`~graphein.protein.config.GranularityOpts`). Defaults to + ``"CA"`` (alpha-Carbon). + :type granularity: str (Union[graphein.protein.config.GraphAtoms, + graphein.protein.config.GranularityOpts]) + :param keep_hets: Controls whether or not heteroatoms are removed from the + PDB file. These are typically modified residues, bound ligands, + crystallographic adjuvants, ions or water molecules. For more + information, see: https://proteopedia.org/wiki/index.php/Hetero_atoms :type keep_hets: List[str] :param insertions: Controls whether or not insertions are allowed. :type insertions: bool @@ -136,30 +149,39 @@ class ProteinGraphConfig(BaseModel): :type pdb_dir: pathlib.Path. Optional. :param verbose: Specifies verbosity of graph creation process. 
:type verbose: bool - :param exclude_waters: Specifies whether or not water molecules are excluded from the structure + :param exclude_waters: Specifies whether or not water molecules are excluded + from the structure :type excluded_waters: bool - :param deprotonate: Specifies whether or not to remove ``H`` atoms from the graph. + :param deprotonate: Specifies whether or not to remove ``H`` atoms from the + graph. :type deprotonate: bool - :param protein_df_processing_functions: List of functions that take a ``pd.DataFrame`` and return a ``pd.DataFrame``. - This allows users to define their own series of processing functions for the protein structure DataFrame and - override the default sequencing of processing steps provided by Graphein. We refer users to our low-level API + :param protein_df_processing_functions: List of functions that take a + ``pd.DataFrame`` and return a ``pd.DataFrame``. This allows users to + define their own series of processing functions for the protein + structure DataFrame and override the default sequencing of processing + steps provided by Graphein. We refer users to our low-level API tutorial for more details. :type protein_df_processing_functions: Optional[List[Callable]] - :param edge_construction_functions: List of functions that take an ``nx.Graph`` and return an ``nx.Graph`` with desired - edges added. Prepared edge constructions can be found in :ref:`graphein.protein.edges` + :param edge_construction_functions: List of functions that take an + ``nx.Graph`` and return an ``nx.Graph`` with desired edges added. + Prepared edge constructions can be found in + :ref:`graphein.protein.edges`. 
:type edge_construction_functions: List[Callable] :param node_metadata_functions: List of functions that take an ``nx.Graph`` :type node_metadata_functions: List[Callable], optional :param edge_metadata_functions: List of functions that take an :type edge_metadata_functions: List[Callable], optional - :param graph_metadata_functions: List of functions that take an ``nx.Graph`` and return an ``nx.Graph`` with added - graph-level features and metadata. + :param graph_metadata_functions: List of functions that take an ``nx.Graph`` + and return an ``nx.Graph`` with added graph-level features and metadata. :type graph_metadata_functions: List[Callable], optional - :param get_contacts_config: Config object containing parameters for running ``GetContacts`` for computing intramolecular - contact-based edges. Defaults to None. + :param get_contacts_config: Config object containing parameters for running + ``GetContacts`` for computing intramolecular contact-based edges. + Defaults to ``None``. :type get_contacts_config: GetContactsConfig, optional - :param dssp_config: Config Object containing reference to ``DSSP`` executable. Defaults to None. - **NB** DSSP must be installed. See installation instructions: https://graphein.ai/getting_started/installation.html#optional-dependencies + :param dssp_config: Config Object containing reference to ``DSSP`` + executable. Defaults to ``None``. **NB** DSSP must be installed. See + installation instructions: + https://graphein.ai/getting_started/installation.html#optional-dependencies :type dssp_config: DSSPConfig, optional """ @@ -197,7 +219,10 @@ def convert_alt_locs_aliases(cls, v): return v def __eq__(self, other: Any) -> bool: - """Overwrites the BaseModel __eq__ function in order to check more specific cases (like partial functions).""" + """ + Overwrites the BaseModel __eq__ function in order to check more + specific cases (like partial functions). 
+ """ if isinstance(other, ProteinGraphConfig): return ( DeepDiff( @@ -215,14 +240,18 @@ def __eq__(self, other: Any) -> bool: class ProteinMeshConfig(BaseModel): """ - Config object for parameters relating to Protein Mesh construction with ``PyMol`` + Config object for parameters relating to Protein Mesh construction with + ``PyMol`` - **NB** PyMol must be installed. See: https://graphein.ai/getting_started/installation.html#optional-dependencies + **NB** PyMol must be installed. See: + https://graphein.ai/getting_started/installation.html#optional-dependencies :param pymol_command_line_options: List of CLI args for running PyMol. - See: https://pymolwiki.org/index.php/Command_Line_Options. Defaults to ``"-cKq"`` () + See: https://pymolwiki.org/index.php/Command_Line_Options. + Defaults to ``"-cKq"`` () :type pymol_command_line_options: str, optional - :param pymol_commands: List of Commands passed to PyMol in surface construction. + :param pymol_commands: List of Commands passed to PyMol in surface + construction. :type pymol_commands: List[str], optional """ diff --git a/graphein/protein/graphs.py b/graphein/protein/graphs.py index d71adfe2..b780b34d 100644 --- a/graphein/protein/graphs.py +++ b/graphein/protein/graphs.py @@ -76,9 +76,9 @@ def read_pdb_to_dataframe( """ Reads PDB file to ``PandasPDB`` object. - Returns ``atomic_df``, which is a DataFrame enumerating all atoms and their - cartesian coordinates in 3D space. Also contains associated metadata from - the PDB file. + Returns ``atomic_df``, which is a DataFrame enumerating all atoms and + their cartesian coordinates in 3D space. Also contains associated metadata + from the PDB file. :param pdb_path: path to PDB file. Defaults to ``None``. :type pdb_path: str, optional @@ -90,12 +90,6 @@ def read_pdb_to_dataframe( :param model_index: Index of model to read. Only relevant for structures containing ensembles. Defaults to ``1``. :type model_index: int, optional - :param verbose: print dataframe? 
- :type verbose: bool - :param granularity: Specifies granularity of dataframe. See - :class:`~graphein.protein.config.ProteinGraphConfig` for further - details. - :type granularity: str :returns: ``pd.DataFrame`` containing protein structure :rtype: pd.DataFrame """ @@ -175,7 +169,7 @@ def label_node_id( def deprotonate_structure(df: pd.DataFrame) -> pd.DataFrame: - """Remove protons from PDB dataframe. + """Remove protons from PDB DataFrame. :param df: Atomic dataframe. :type df: pd.DataFrame @@ -223,7 +217,7 @@ def subset_structure_to_atom_type( :param df: Protein Structure dataframe to subset. :type df: pd.DataFrame - :returns: Subsetted protein structure dataframe. + :returns: Subset protein structure dataframe. :rtype: pd.DataFrame """ return filter_dataframe( @@ -280,8 +274,7 @@ def remove_insertions( :param df: Protein Structure dataframe to remove insertions from. :type df: pd.DataFrame :param keep: Specifies which insertion to keep. Options are ``"first"`` or - ``"last"``. - Default is ``"first"`` + ``"last"``. Default is ``"first"``. :type keep: Literal["first", "last"] :return: Protein structure dataframe with insertions removed :rtype: pd.DataFrame @@ -309,6 +302,7 @@ def filter_hetatms( :param df: Protein Structure dataframe to filter hetatoms from. :type df: pd.DataFrame :param keep_hets: List of hetero atom names to keep. + :type keep_hets: List[str] :returns: Protein structure dataframe with heteroatoms removed :rtype: pd.DataFrame """ @@ -335,7 +329,7 @@ def process_dataframe( Should be the object returned from :func:`~graphein.protein.graphs.read_pdb_to_dataframe`. :type protein_df: pd.DataFrame - :param atom_df_processing_funcs: List of functions to process dataframe. + :param atom_df_processing_funcs: List of functions to process DataFrame. These must take in a DataFrame and return a DataFrame. Defaults to ``None``. 
:type atom_df_processing_funcs: List[Callable], optional @@ -349,14 +343,14 @@ def process_dataframe( See: :const:`~graphein.protein.config.GRAPH_ATOMS` and :const:`~graphein.protein.config.GRANULARITY_OPTS`. :type granularity: str - :param insertions: Whether or not to keep insertions. + :param insertions: Whether or not to keep insertions. Defaults to ``False``. :param insertions: bool :param alt_locs: Whether or not to keep alternatively located atoms. :param alt_locs: bool :param deprotonate: Whether or not to remove hydrogen atoms (i.e. deprotonation). :type deprotonate: bool - :param keep_hets: Hetatoms to keep. Defaults to an empty list. + :param keep_hets: Hetatoms to keep. Defaults to an empty list (``[]``). To keep a hetatom, pass it inside a list of hetatom names to keep. :type keep_hets: List[str] :param verbose: Verbosity level. @@ -387,7 +381,7 @@ def process_dataframe( ) # This block enables processing via a list of supplied functions operating - # on the atom and hetatom dataframes If these are provided, the dataframe + # on the atom and hetatom DataFrames. If these are provided, the DataFrame # returned will be computed only from these and the default workflow # below this block will not execute. if atom_df_processing_funcs is not None: @@ -512,10 +506,10 @@ def initialise_graph_with_metadata( :type name: Optional[str], defaults to ``None`` :param pdb_code: PDB ID / Accession code, if the PDB is available on the PDB database. - :type pdb_code: Optional[str], defaults to ``None`` + :type pdb_code: Optional[str], defaults to ``None``. :param pdb_path: path to local PDB file, if constructing a graph from a local file. - :type pdb_path: Optional[str], defaults to ``None`` + :type pdb_path: Optional[str], defaults to ``None``. :return: Returns initial protein structure graph with metadata. :rtype: nx.Graph """ @@ -578,15 +572,16 @@ def add_nodes_to_graph( :param G: ``nx.Graph`` with metadata to populate with nodes. 
:type G: nx.Graph :param protein_df: DataFrame of protein structure containing nodes & initial - node metadata to add to the graph. + node metadata to add to the graph. Defaults to ``None``. :type protein_df: pd.DataFrame, optional - :param verbose: Controls verbosity of this step. + :param verbose: Controls verbosity of this step. Defaults to ``False``. :type verbose: bool :returns: nx.Graph with nodes added. :rtype: nx.Graph """ - # If no protein dataframe is supplied, use the one stored in the Graph object + # If no protein dataframe is supplied, use the one stored in the Graph + # object if protein_df is None: protein_df = G.graph["pdb_df"] # Assign intrinsic node attributes @@ -823,6 +818,8 @@ def construct_graph( insertions=config.insertions, alt_locs=config.alt_locs, keep_hets=config.keep_hets, + atom_df_processing_funcs=config.protein_df_processing_functions, + hetatom_df_processing_funcs=config.protein_df_processing_functions, ) if verbose: @@ -884,8 +881,8 @@ def _mp_graph_constructor( :param use_pdb_code: Whether we are using ``"pdb_code"``s, ``pdb_path``s or ``"uniprot_id"``s. :type use_pdb_code: bool - :param config: Protein structure graph construction config (see: - :class:`graphein.protein.config.ProteinGraphConfig`). + :param config: Protein structure graph construction config + (see: :class:`graphein.protein.config.ProteinGraphConfig`). :type config: ProteinGraphConfig :return: Protein structure graph or ``None`` if an error is encountered. :rtype: Union[nx.Graph, None] @@ -913,7 +910,8 @@ def _mp_graph_constructor( except Exception as ex: log.info( - f"Graph construction error (PDB={args[0]})! {traceback.format_exc()}" + f"Graph construction error (PDB={args[0]})! \ + {traceback.format_exc()}" ) log.info(ex) return None @@ -958,7 +956,7 @@ def construct_graphs_mp( :param out_path: Path to save the graphs to. If ``None``, graphs are not saved to disk. :type out_path: Optional[str], defaults to ``None`` - :return: Iterable of protein graphs. 
None values indicate there was a + :return: Iterable of protein graphs. ``None`` values indicate there was a problem in constructing the graph for this particular pdb. :rtype: Union[List[nx.Graph], Dict[str, nx.Graph]] """ diff --git a/tests/protein/test_graphs.py b/tests/protein/test_graphs.py index b9d8e464..ae575116 100644 --- a/tests/protein/test_graphs.py +++ b/tests/protein/test_graphs.py @@ -5,6 +5,7 @@ import networkx as nx import numpy as np +import pandas as pd import pytest from graphein.protein.config import DSSPConfig, ProteinGraphConfig @@ -204,7 +205,8 @@ def test_chain_selection(): # Removed - testing with GetContacts as a dependency is not a priority right now """ def test_intramolecular_edges(): - Example-based test that intramolecular edge construction using GetContacts works correctly. + Example-based test that intramolecular edge construction using GetContacts + works correctly. Uses 4hhb PDB file as an example test case. @@ -271,7 +273,8 @@ def test_node_features(): config_params = { "node_metadata_functions": [ - expasy_protein_scale, # Todo we need to refactor node data assingment flow + expasy_protein_scale, # Todo we need to refactor node data + # assignment flow meiler_embedding, ], "graph_metadata_functions": [ @@ -320,7 +323,8 @@ def test_sequence_features(): # Check for existence on sequence-based features as node-level features # for n, d in G.nodes(data=True): # Todo this can probably be improved. 
- # This only checks for the existence and shape of the esm_embedding for each node + # This only checks for the existence and shape of the esm_embedding for each + # node # assert "esm_embedding" in d # assert len(d["esm_embedding"]) == 1280 @@ -497,7 +501,8 @@ def test_secondary_structure_graphs(): res_counts = sum(d["residue_counts"] for _, d in h.nodes(data=True)) assert res_counts == len( g - ), "Residue counts in SS graph should match number of residues in original graph" + ), "Residue counts in SS graph should match number of residues in original \ + graph" assert nx.is_connected( h ), "SS graph should be connected in this configuration" @@ -534,3 +539,31 @@ def test_chain_graph(): h = compute_chain_graph(g, return_weighted_graph=True) node_sum = sum(d["num_residues"] for _, d in h.nodes(data=True)) assert node_sum == len(g), "Number of residues do not match" + + +def test_df_processing(): + def return_even_df(df: pd.DataFrame) -> pd.DataFrame: + return df.loc[df["residue_number"] % 2 == 0] + + def remove_hetatms(df: pd.DataFrame) -> pd.DataFrame: + return df.loc[df["record_name"] == "ATOM"] + + params_to_change = { + "protein_df_processing_functions": [return_even_df, remove_hetatms], + "granularity": "atom", + } + + config = ProteinGraphConfig(**params_to_change) + config.dict() + + config2 = ProteinGraphConfig(granularity="atom") + + g1 = construct_graph(config=config, pdb_code="3eiy") + g2 = construct_graph(config=config2, pdb_code="3eiy") + + for n, d in g1.nodes(data=True): + assert ( + int(d["residue_number"]) % 2 == 0 + ), "Only even residues should be present" + + assert len(g1) != len(g2), "Graphs should not be equal"