Development of PDBManager Class (WIP) (#272)

* add PDB manager #270

* add download method

* add clustering utilities

* Add dataset splits functionality and add new documentation

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Resolve merge conflicts with remote

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Remove unused test

* Address lingering SonarCloud concerns

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add deposition date parsing

* remove pdb.py

* add chain extraction util

* add chain writing method

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* After fixing merge conflicts, add more filters and add time-based splits

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix up SonarCloud concerns

* Improve verbiage surrounding PDB resolutions

* Simplify code and improve variable names

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Track names of splits in df_splits

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix column naming during merging of DataFrame splits

* add additional properties

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* refactor clustering to allow file caching and overwriting

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add description to assert statements

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add extra documentation around clustering function, and address small formatting issues

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add method to write selection to CSV

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* improve from_fasta documentation

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Enable code reuse for length filters

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Minor documentation changes to FASTA write-out function

* Add ability to perform most API calls for a subset of splits

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update .gitignore

* Fix missing download call, and add more documentation to download functions

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix small bug when merging different splits together

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix bug in length filtering functions, fix print bugs in utils, and add ability to write-out PDB files after selecting a subset of chains to include in them

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix string formatting

* Update PDB write-out logic and documentation

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add PDB download workaround for PDBs that can no longer be downloaded

* Make exception more specific

* Add TQDM for data split exporting

* Enable PDBManager root to be set to an arbitrary location

* add initial tests

* update changelog

* add tutorial notebook

* Allow all chains in a complex to be exported together

* add module-level import

* Remove old, unused PDBManager prototype file

* add parsing & checks for unavailable PDB structures

* fix download checker

* actually fix download checker

* add availability filter

* Default to export model 1's chains only in PDBManager, and clean-up notebook and utilities

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add tutorial nblink

* add tutorial to datasets sections

* mv pdb data to ml API

* rm pyg dataset import

* rm unused code

* fix annotation

* add MMTF download format

* refactor dependency utils

* refactor graphein.utils.utils.import_message

* refactor graphein.protein.utils.is_tool

* update .gitignore

* ignore cif too

* ignore cif too

* ignore foldcomp files

* catch straggling erroneous imports

* ignore mol2

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update folding utils

* add max batch option

* add foldcomp utils

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add notebook updates [WIP]

* move manager class into graphein.ml

* remove datasets init

* fix import util refactor I didn't catch

* add PDBmanager to __init__

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix oligomeric filtering

* update notebook

* fix dataset init

* fix protein.coord renaming in tensor module

* add try/except to pyg-related datasets

* add try/except to pyg-related datasets

* add mmseqs to CI build

* rollback dssp install to conda

* ignore pdb manager notebook in minimal tests

* fix code smell

* fix metrics

* shorten line lengths

* add minimum scipy version

* remove python 3.7 from CI

* Add Torch 2.0.0 to CI

* add note about multiple split strategies

* add torch cluster install to CI

* update dockerfile to torch 2.0

* switch docker pytorch 1.13 for VMD python version conflict

* switch out torchtyping for jaxtyping

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update tensor shape syntax for jaxtyping

* remove torch-dependent tests from minimal install testing

* update test ignores

* install dssp from apt, rather than conda in docker

* update typing extensions version

---------

Co-authored-by: Arian Jamasb <arjamasb@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
3 people authored Mar 31, 2023
1 parent 0b24a20 commit 83295a8
Showing 52 changed files with 387,028 additions and 385,129 deletions.
10 changes: 7 additions & 3 deletions .github/workflows/build.yaml
@@ -17,7 +17,7 @@ jobs:
strategy:
matrix:
python-version: [3.8, 3.9]
torch: [1.12.0, 1.13.0]
torch: [1.12.0, 1.13.0, 2.0.0]
#include:
# - torch: 1.6.0
# torchvision: 0.7.0
@@ -62,11 +62,15 @@ jobs:
# run: source activate graphein-dev
- name: Install DSSP
run: conda install -c salilab dssp
- name: Install mmseqs
run: mamba install -c conda-forge -c bioconda mmseqs2
- name: Install PyTorch
run: conda install -c pytorch pytorch==${{matrix.torch}} cpuonly
run: mamba install -c pytorch pytorch==${{matrix.torch}} cpuonly
#run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html
- name: Install PyG
run: conda install -c pyg pyg
run: mamba install -c pyg pyg
- name: Install torch-cluster
run: mamba install pytorch-cluster -c pyg
- name: Install BLAST
run: sudo apt install ncbi-blast+
- name: Install Graphein
6 changes: 3 additions & 3 deletions .github/workflows/minimal__install.yaml
@@ -25,7 +25,7 @@ jobs:

strategy:
matrix:
python-version: [3.7, 3.8, 3.9, 3.11]
python-version: [3.8, 3.9, 3.11]
steps:
- name: Checkout repository
uses: actions/checkout@v3
@@ -45,6 +45,6 @@
- name: Install Dev Dependencies
run: pip install -r .requirements/dev.in
- name: Run unit tests and generate coverage report
run: pytest . --ignore-glob="tests/protein/tensor"
run: pytest . --ignore-glob="tests/protein/tensor" --ignore="tests/ml/test_conversion.py" --ignore="tests/ml/test_torch_geometric_dataset.py"
- name: Test notebook execution
run: pytest --nbval-lax notebooks/ --current-env --ignore-glob="notebooks/dataloader_tutorial.ipynb" --ignore-glob="notebooks/higher_order_graphs.ipynb" --ignore-glob="notebooks/protein_graph_analytics.ipynb" --ignore-glob="notebooks/subgraphing_tutorial.ipynb" --ignore-glob="notebooks/splitting_a_dataset.ipynb" --ignore-glob="notebooks/protein_tensors.ipynb" --ignore-glob="notebooks/datasets_and_dataloaders.ipynb" --ignore-glob="notebooks/foldcomp.ipynb"
run: pytest --nbval-lax notebooks/ --current-env --ignore-glob="notebooks/dataloader_tutorial.ipynb" --ignore-glob="notebooks/higher_order_graphs.ipynb" --ignore-glob="notebooks/protein_graph_analytics.ipynb" --ignore-glob="notebooks/subgraphing_tutorial.ipynb" --ignore-glob="notebooks/splitting_a_dataset.ipynb" --ignore-glob="notebooks/protein_tensors.ipynb" --ignore-glob="notebooks/datasets_and_dataloaders.ipynb" --ignore-glob="notebooks/foldcomp.ipynb" --ignore-glob="notebooks/creating_datasets_from_the_pdb.ipynb"
25 changes: 25 additions & 0 deletions .gitignore
@@ -145,6 +145,31 @@ dmypy.json
# Local test files
datasets/examples/*
*.ent
*.pdb
*.pt
*.dbn
*.cif
*.zip
*.mol2
datasets/regnetwork/human
notebooks/lightning_logs
pdb/
cc-to-pdb.tdd
entries.idx
pdb_cluster_all_seqs.fasta
pdb_cluster_cluster.tsv
pdb_cluster_rep_seq_id_*_c_*.fasta
pdb_bundle_index.txt
pdb_entry_type.txt
pdb_seqres.txt
pdb_seqres.txt.gz
pdb.fasta
resolu.idx
source.idx

# Foldcomp files
afdb_swissprot_v4
afdb_swissprot_v4.*

# Local test directories
tmp/
6 changes: 3 additions & 3 deletions .requirements/base.in
@@ -15,9 +15,9 @@ rich-click
seaborn
pyyaml>=5.1,<6.0
scikit-learn
scipy
scipy>=1.8
tqdm
typing_extensions
typing_extensions==4.5.0
wget
xarray
torchtyping
jaxtyping
29 changes: 20 additions & 9 deletions CHANGELOG.md
@@ -1,25 +1,34 @@
### 1.6.1 - UNRELEASED

* `Protein` tensors have coordinates renamed from `Protein.x` to `Protein.coords`. [#272](https://github.com/a-r-j/graphein/pull/272)
* Tensor types are now defined using [`jaxtyping`](https://github.com/google/jaxtyping), removing the `torchtyping` dependency [#272](https://github.com/a-r-j/graphein/pull/272)
* Drops explicit Python 3.7 support. Colab now runs on 3.8+. [#272](https://github.com/a-r-j/graphein/pull/272)
* Dockerfile now builds from `pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime` (replaces `pytorch/pytorch:1.9.1-cuda11.1-cudnn8-runtime`) [#272](https://github.com/a-r-j/graphein/pull/272)

#### New Features
* [FoldComp Dataset] - [#284](https://github.com/a-r-j/graphein/pull/284) - Create ML datasets from FoldComp databases.

* [PDBManager] - [#272](https://github.com/a-r-j/graphein/pull/272) Adds a utility for creating custom dataset splits from the PDB.
* [FoldComp Dataset] - [#284](https://github.com/a-r-j/graphein/pull/284) - Create ML datasets from FoldComp databases.
* [ESM] - [#284](https://github.com/a-r-j/graphein/pull/284) - Wrapper for ESMFold batch folding & embedding.
* [Downloads] MMTF downloading now supported in download utilities. [#272](https://github.com/a-r-j/graphein/pull/272)

### 1.6.0dev - UNRELEASED
### 1.6.0 - 18/03/2023

#### New Features

* [Metrics] - [#245](https://github.com/a-r-j/graphein/pull/221) Adds a selection of structural metrics relevant to protein structures.
* [Tensor Operations] - [#244](https://github.com/a-r-j/graphein/pull/244) Adds suite of utilities for working directly with tensor-based representations of proteins (graphein.protein.tensor).
* [Tensor Operations] - [#244](https://github.com/a-r-j/graphein/pull/244) Adds suite of utilities for working with ESMfold (graphein.protein.folding_utils).



#### Improvements

* [Feature] = [#277](https://github.com/a-r-j/graphein/pull/227) Adds support for pathlib paths for protein graph creation. [#269](https://github.com/a-r-j/graphein/issues/269)
* [Logging] - [#221](https://github.com/a-r-j/graphein/pull/221) Adds global control of logging with `graphein.verbose(enabled=False)`.
* [Logging] - [#242](https://github.com/a-r-j/graphein/pull/242) Adds control of protein graph construction logging. Resolves [#238](https://github.com/a-r-j/graphein/issues/238)

#### Protein
* [Bugfix] - [#222]https://github.com/a-r-j/graphein/pull/222) Fixes entrypoint for user-defined `df_processing_funcs` ([#216](https://github.com/a-r-j/graphein/issues/216))

* [Bugfix] - [#222]<https://github.com/a-r-j/graphein/pull/222)> Fixes entrypoint for user-defined `df_processing_funcs` ([#216](https://github.com/a-r-j/graphein/issues/216))
* [Feature] = [#263](https://github.com/a-r-j/graphein/pull/263) Adds control of Alt Loc selection strategy. N.b. Default `ProteinGraphConfig` changed to include insertions by default (`insertions=True`) and `alt_locs="max_occupancy"`.
* [Feature] - [#264](https://github.com/a-r-j/graphein/pull/264) Adds entrypoint to `graphein.protein.graphs.construct_graph` for passing in a BioPandas dataframe directly.
* [Feature] - [#229](https://github.com/a-r-j/graphein/pull/220) Adds support for filtering KNN edges based on self-loops and chain membership. Contribution by @anton-bushuiev.
@@ -36,31 +36,45 @@

* [Bugfix] - [#268](https://github.com/a-r-j/graphein/pull/268) Fixes 'sequence' metadata feature for atomistic graphs, removing duplicate residues. Contribution by @kamurani.


#### ML

* [Bugfix] - [#234](https://github.com/a-r-j/graphein/pull/234) - Fixes bugs and improves `conversion.convert_nx_to_pyg` and `visualisation.plot_pyg_data`. Removes distance matrix (`dist_mat`) from defualt set of features converted to tensor.

#### Utils

* [Improvement] - [#234](https://github.com/a-r-j/graphein/pull/234) - Adds `parse_aggregation_type` to retrieve aggregation functions.

#### RNA

* [Bugfix] - [#281](https://github.com/a-r-j/graphein/pull/234) - Bugfix for nx->PyG conversion for graphs containing edges without "kind" attributes. Contribution by @rg314.

#### Constants
* [Improvement] - [#234](https://github.com/a-r-j/graphein/pull/234) - Adds 1 to 3 mappings to `graphein.protein.resi_atoms`.

* [Improvement] - [#234](https://github.com/a-r-j/graphein/pull/234) - Adds 1 to 3 mappings to `graphein.protein.resi_atoms`.

#### Documentation

* [Tensor Module] - [#244](https://github.com/a-r-j/graphein/pull/244) Documents new graphein.protein.tensor module.
* [CI] - [#244](https://github.com/a-r-j/graphein/pull/244) Updates to intersphinx maps


#### Package

* [CI] - [#244](https://github.com/a-r-j/graphein/pull/244) CI now runs for python 3.8, 3.9 and torch 1.12.0 and 1.13.0
* [CI] - [#244](https://github.com/a-r-j/graphein/pull/244) Separate builds for core library and library with DL dependencies.
* [Licence] - [#244](https://github.com/a-r-j/graphein/pull/244) Bump to 2023


### 1.5.2 - 19/9/2022

#### Protein
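The changelog entries above note the move from torchtyping to jaxtyping for tensor shape annotations. The following sketch illustrates the new annotation style; the `CoordTensor` alias and the `centre_coords` helper are illustrative examples, not code from this PR.

```python
# Illustration of jaxtyping-style shape annotations (alias and helper names
# are examples, not part of this PR).
import torch
from jaxtyping import Float
from torch import Tensor

# torchtyping: TensorType["nodes", 3]  ->  jaxtyping: Float[Tensor, "nodes 3"]
CoordTensor = Float[Tensor, "nodes 3"]


def centre_coords(coords: CoordTensor) -> CoordTensor:
    """Translate coordinates so their centroid sits at the origin."""
    return coords - coords.mean(dim=0, keepdim=True)


coords: CoordTensor = torch.randn(10, 3)
print(centre_coords(coords).mean(dim=0))  # approximately the zero vector
```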
7 changes: 5 additions & 2 deletions Dockerfile
@@ -1,4 +1,4 @@
FROM pytorch/pytorch:1.9.1-cuda11.1-cudnn8-runtime
FROM pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime

RUN apt-get update \
&& apt-get -y install build-essential ffmpeg libsm6 libxext6 wget git \
@@ -12,6 +12,10 @@ RUN apt-get update && apt-get install -y iputils-ping && apt-get clean \
RUN apt-get update && apt-get install -y ncbi-blast+ && apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Install DSSP
RUN apt-get update && apt-get install -y dssp && apt-get clean \
&& rm -rf /var/lib/apt/lists/*

ENV CONDA_ALWAYS_YES=true


@@ -41,7 +45,6 @@ ENV PATH /getcontacts:$PATH
RUN conda install -c fvcore -c iopath -c conda-forge fvcore iopath
RUN conda install -c pytorch3d pytorch3d
RUN conda install -c dglteam dgl
RUN conda install -c salilab dssp
RUN conda install -c conda-forge ipywidgets

RUN export CUDA=$(python -c "import torch; print('cu'+torch.version.cuda.replace('.',''))") \
1 change: 1 addition & 0 deletions docs/source/datasets.rst
@@ -13,6 +13,7 @@ Summaries

notebooks/dataloader_tutorial.nblink
notebooks/foldcomp.nblink
notebooks/creating_datasets_from_the_pdb.nblink
datasets/pscdb
notebooks/pscdb_processing.nblink
notebooks/pscdb_baselines.nblink
3 changes: 3 additions & 0 deletions docs/source/notebooks/creating_datasets_from_the_pdb.nblink
@@ -0,0 +1,3 @@
{
"path": "../../../notebooks/creating_datasets_from_the_pdb.ipynb"
}
14 changes: 1 addition & 13 deletions graphein/grn/features/node_features.py
@@ -1,21 +1,9 @@
"""Node featurisation utilities for Gene Regulatory Networks."""
from typing import Any, Dict

from bioservices import HGNC, UniProt
from loguru import logger as log

from graphein.utils.utils import import_message

try:
from bioservices import HGNC, UniProt
except ImportError:
message = import_message(
submodule="graphein.grn.features.node_features",
package="bioservices",
conda_channel="bioconda",
pip_install=True,
)
log.warning(message)


def add_sequence_to_nodes(n: str, d: Dict[str, Any]):
"""
13 changes: 4 additions & 9 deletions graphein/ml/clustering.py
@@ -16,11 +16,8 @@

import networkx as nx
import numpy as np
import pandas as pd
from Bio import SeqIO

from graphein.protein.utils import is_tool


def build_fasta_file_from_mapping(
pdb_sequence_mapping: Dict[str, str],
@@ -46,7 +43,7 @@ def build_fasta_file_from_graphs(
if chains is None:
chains = ["A"] * len(graphs)
mapping = {
g.name + "_" + chain: g.graph[f"sequence_{chain}"]
f"{g.name}_{chain}": g.graph[f"sequence_{chain}"]
for g, chain in zip(graphs, chains)
}

@@ -104,9 +101,7 @@ def get_seq_records(
f"WARNING in {get_seq_records.__name__} sequence {record.seq.id} from file "
f"{filename} is not compatible with declared alphabet {str(alphabet)}\n"
)
if return_as_dictionary:
return SeqIO.to_dict(records)
return records
return SeqIO.to_dict(records) if return_as_dictionary else records


def create_pairs_for_clustering(
@@ -497,8 +492,8 @@ def generate_random_sets(
n = 0
in_other_tests = []
while n < number_of_sets:
train_set_name = train_set_key + f"_{n:02}"
test_set_name = test_set_key + f"_{n:02}"
train_set_name = f"{train_set_key}_{n:02}"
test_set_name = f"{test_set_key}_{n:02}"
with open(train_set_name, mode="w") as train:
with open(test_set_name, mode="w") as test:
ids_in_test = []
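For orientation, the `f"{g.name}_{chain}"` keys built above mean that the sequence mapping consumed by `build_fasta_file_from_mapping` is indexed by `<pdb_name>_<chain>`. A minimal, self-contained illustration of that mapping and the FASTA text it corresponds to (plain Python here, not the graphein helper, whose remaining parameters are elided in this diff):

```python
# Illustrative only: the pdb/chain -> sequence mapping shape used above and
# the FASTA text it corresponds to. Sequences are made up.
mapping = {"1abc_A": "MKTAYIAKQR", "2xyz_B": "GSSGSSGENLYFQ"}

fasta = "".join(f">{identifier}\n{sequence}\n" for identifier, sequence in mapping.items())
print(fasta)
```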
2 changes: 1 addition & 1 deletion graphein/ml/conversion.py
@@ -15,7 +15,7 @@
import torch
from loguru import logger as log

from graphein.utils.utils import import_message
from graphein.utils.dependencies import import_message

try:
import torch
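As the diff above shows, `import_message` now lives in `graphein.utils.dependencies` rather than `graphein.utils.utils`. A sketch of the optional-import guard it supports, mirroring the pattern used in this file and in `foldcomp_dataset.py` below; the guarded package and keyword names follow the calls visible in this PR, and the choice of `torch_geometric` is for illustration only.

```python
# Sketch of the optional-dependency guard pattern used after the refactor;
# the guarded package (torch_geometric) is an illustrative choice.
from loguru import logger as log

from graphein.utils.dependencies import import_message

try:
    import torch_geometric
except ImportError:
    message = import_message(
        submodule="graphein.ml.conversion",
        package="torch_geometric",
        conda_channel="pyg",
        pip_install=True,
    )
    log.warning(message)
```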
15 changes: 10 additions & 5 deletions graphein/ml/datasets/__init__.py
@@ -1,5 +1,10 @@
from .torch_geometric_dataset import (
InMemoryProteinGraphDataset,
ProteinGraphDataset,
ProteinGraphListDataset,
)
from .pdb_data import PDBManager

try:
from .torch_geometric_dataset import (
InMemoryProteinGraphDataset,
ProteinGraphDataset,
ProteinGraphListDataset,
)
except (NameError, ImportError):
pass
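The guarded import above means the package now degrades gracefully on a minimal install: `PDBManager` is always exported, while the PyG-backed dataset classes only resolve when `torch`/`torch_geometric` are present. A small sketch of what downstream code sees (the fallback handling is illustrative, not part of the diff):

```python
# Illustrative: behaviour of graphein.ml.datasets after this change.
from graphein.ml.datasets import PDBManager  # always importable

try:
    # Only exported when torch / torch_geometric are installed.
    from graphein.ml.datasets import ProteinGraphDataset
except ImportError:
    ProteinGraphDataset = None  # minimal install: the name was never defined

print("PyG datasets available:", ProteinGraphDataset is not None)
```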
4 changes: 2 additions & 2 deletions graphein/ml/datasets/foldcomp_dataset.py
@@ -22,13 +22,13 @@
from tqdm import tqdm

from graphein.protein.tensor import Protein
from graphein.utils.utils import import_message
from graphein.utils.dependencies import import_message

try:
import foldcomp
except ImportError:
message = import_message(
"graphein.ml.datasets.foldcomp", "foldcomp", None, True
"graphein.ml.datasets.foldcomp", "foldcomp", None, True, extras=True
)
log.warning(message)

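The FoldComp dataset work guarded by this import builds on the `foldcomp` Python bindings. A rough sketch of iterating over a database, based on the bindings' documented open/iterate interface and using the `afdb_swissprot_v4` database named in the new `.gitignore` entries (the database files are assumed to have been downloaded already):

```python
# Rough sketch using the foldcomp bindings guarded above; assumes the
# afdb_swissprot_v4 database files are already present locally.
import foldcomp

with foldcomp.open("afdb_swissprot_v4") as db:
    for name, pdb_string in db:
        # Each entry decompresses to an ordinary PDB-format string.
        with open(f"{name}.pdb", "w") as handle:
            handle.write(pdb_string)
        break  # demonstrate the first entry only
```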