Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[feat] add uniprot ecnumber and cath label options to pdb manager #398

Merged
merged 7 commits into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ jobs:
channels: "conda-forge, salilab, pytorch, pyg"
python-version: ${{ matrix.python-version }}
use-mamba: true
- name: Install setuptools
run: pip install setuptools==69.5.1
- name: Install Boost 1.7.3 (for DSSP)
run: conda install -c anaconda libboost=1.73.0
- name: Install DSSP
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
* Fix cluster file loading bug in `pdb_data.py` [#396](https://github.com/a-r-j/graphein/pull/396)

#### Misc
* Add metadata options for UniProt ID, EC number and CATH code to pdb manager [#398](https://github.com/a-r-j/graphein/pull/398)
* bumped logging level down from `INFO` to `DEBUG` at several places to reduce output length [#391](https://github.com/a-r-j/graphein/pull/391)
* exposed `fill_value` and `bfactor` option to `protein_to_pyg` function. [#385](https://github.com/a-r-j/graphein/pull/385) and [#388](https://github.com/a-r-j/graphein/pull/388)
* Updated Foldcomp datasets with improved setup function and updated database choices such as ESMAtlas. [#382](https://github.com/a-r-j/graphein/pull/382)
Expand Down
278 changes: 269 additions & 9 deletions graphein/ml/datasets/pdb_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from datetime import datetime
from io import StringIO
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union
from typing import Any, Callable, Dict, List, Literal, Optional, Union

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -36,13 +36,16 @@ def __init__(
split_ratios: Optional[List[float]] = None,
split_time_frames: Optional[List[np.datetime64]] = None,
assign_leftover_rows_to_split_n: int = 0,
labels: Optional[
List[Literal["uniprot_id", "cath_code", "ec_number"]]
] = None,
):
"""Instantiate a selection of experimental PDB structures.

:param root_dir: The directory in which to store all PDB entries,
defaults to ``"."``.
:type root_dir: str, optional
:param structure_format: Whether to use ``.pdb`` or ``.mmtf`` file.
:param structure_format: Whether to use ``.pdb``, ``.mmtf`` or ``.mmcif`` file.
Defaults to ``"pdb"``.
:type structure_format: str, optional
:param splits: A list of names corresponding to each dataset split,
Expand All @@ -58,6 +61,9 @@ def __init__(
to assign any rows remaining after creation of new dataset splits,
defaults to ``0``.
:type assign_leftover_rows_to_split_n: int, optional
:param labels: A list of names corresponding to metadata labels that should be included in PDB manager dataframe,
defaults to ``None``.
:type labels: Optional[List[Literal["uniprot_id", "cath_code", "ec_number"]]], optional
"""
# Arguments
self.root_dir = Path(root_dir)
Expand All @@ -83,6 +89,12 @@ def __init__(
)
self.pdb_availability_url = "https://files.wwpdb.org/pub/pdb/compatible/pdb_bundle/pdb_bundle_index.txt"

self.pdb_chain_cath_uniprot_url = "https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_cath_uniprot.tsv.gz"

self.cath_id_cath_code_url = "http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-all.gz"

self.pdb_chain_ec_number_url = "https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz"

self.pdb_dir = self.root_dir / "pdb"
if not os.path.exists(self.pdb_dir):
os.makedirs(self.pdb_dir)
Expand All @@ -99,12 +111,19 @@ def __init__(
self.pdb_deposition_date_url
).name
self.pdb_availability_filename = Path(self.pdb_availability_url).name
self.pdb_chain_cath_uniprot_filename = Path(
self.pdb_chain_cath_uniprot_url
).name
self.cath_id_cath_code_filename = Path(self.cath_id_cath_code_url).name
self.pdb_chain_ec_number_filename = Path(
self.pdb_chain_ec_number_url
).name

self.list_columns = ["ligands"]

# Data
self.download_metadata()
self.df = self.parse()
self.df = self.parse(labels)
self.source = self.df.copy()

# Splits
Expand Down Expand Up @@ -146,6 +165,9 @@ def download_metadata(self):
self._download_entry_metadata()
self._download_exp_type()
self._download_pdb_availability()
self._download_pdb_chain_cath_uniprot_map()
a-r-j marked this conversation as resolved.
Show resolved Hide resolved
self._download_cath_id_cath_code_map()
self._download_pdb_chain_ec_number_map()

def get_unavailable_pdb_files(
self, splits: Optional[List[str]] = None
Expand Down Expand Up @@ -411,6 +433,39 @@ def _download_pdb_availability(self):
wget.download(self.pdb_availability_url, out=str(self.root_dir))
log.debug("Downloaded PDB availability map")

def _download_pdb_chain_cath_uniprot_map(self):
"""Download mapping from PDB chain to uniprot accession and CATH ID from
https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_cath_uniprot.tsv.gz
"""
if not os.path.exists(
self.root_dir / self.pdb_chain_cath_uniprot_filename
):
log.info("Downloading Uniprot CATH map...")
wget.download(
self.pdb_chain_cath_uniprot_url, out=str(self.root_dir)
)
log.debug("Downloaded Uniprot CATH map")

def _download_cath_id_cath_code_map(self):
"""Download mapping from CATH IDs to CATH code from
http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-all.gz
"""
if not os.path.exists(self.root_dir / self.cath_id_cath_code_filename):
log.info("Downloading CATH ID to CATH code map...")
wget.download(self.cath_id_cath_code_url, out=str(self.root_dir))
log.debug("Downloaded CATH ID to CATH code map")

def _download_pdb_chain_ec_number_map(self):
"""Download mapping from PDB chains to EC number from
https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz
"""
if not os.path.exists(
self.root_dir / self.pdb_chain_ec_number_filename
):
log.info("Downloading EC number map...")
wget.download(self.pdb_chain_ec_number_url, out=str(self.root_dir))
log.debug("Downloaded EC number map")

def _parse_ligand_map(self) -> Dict[str, List[str]]:
"""Parse the ligand maps for all PDB records.

Expand Down Expand Up @@ -508,7 +563,7 @@ def _parse_entries(self) -> Dict[str, datetime]:
df.dropna(subset=["id"], inplace=True)

df.id = df.id.str.lower()
df.date = pd.to_datetime(df.date)
df.date = pd.to_datetime(df.date, format="%m/%d/%y")
return pd.Series(df["date"].values, index=df["id"]).to_dict()

def _parse_experiment_type(self) -> Dict[str, str]:
Expand Down Expand Up @@ -536,9 +591,107 @@ def _parse_pdb_availability(self) -> Dict[str, bool]:
ids = {id: False for id in ids}
return ids

def parse(self) -> pd.DataFrame:
def _parse_uniprot_id(self) -> Dict[str, str]:
"""Parse the uniprot ID for all PDB chains.

:return: Dictionary of PDB chain ID with their
corresponding uniprot ID.
:rtype: Dict[str, str]
"""
uniprot_mapping = {}
with gzip.open(
self.root_dir / self.pdb_chain_cath_uniprot_filename, "rt"
) as f:
for line in f:
try:
pdb, chain, uniprot_id, cath_id = line.strip().split("\t")
key = f"{pdb}_{chain}"
uniprot_mapping[key] = uniprot_id
except ValueError:
continue
return uniprot_mapping

def _parse_cath_id(self) -> Dict[str, str]:
"""Parse the CATH ID for all PDB chains.

:return: Dictionary of PDB chain ID with their
corresponding CATH ID.
:rtype: Dict[str, str]
"""
cath_mapping = {}
with gzip.open(
self.root_dir / self.pdb_chain_cath_uniprot_filename, "rt"
) as f:
next(f) # Skip header line
for line in f:
try:
pdb, chain, uniprot_id, cath_id = line.strip().split("\t")
key = f"{pdb}_{chain}"
cath_mapping[key] = cath_id
except ValueError:
continue
return cath_mapping

def _parse_cath_code(self) -> Dict[str, str]:
"""Parse the CATH code for all CATH IDs.

:return: Dictionary of CATH ID with their
corresponding CATH code.
:rtype: Dict[str, str]
"""
cath_mapping = {}
with gzip.open(
self.root_dir / self.cath_id_cath_code_filename, "rt"
) as f:
print(f)
for line in f:
print(line)
try:
cath_id, cath_version, cath_code, cath_segment = (
line.strip().split()
)
cath_mapping[cath_id] = cath_code
print(cath_id, cath_code)
except ValueError:
continue
return cath_mapping

def _parse_ec_number(self) -> Dict[str, str]:
"""Parse the CATH ID for all PDB chains and adds None when no EC number is present.

:return: Dictionary of PDB chain ID with their
corresponding EC number.
:rtype: Dict[str, str]
"""
ec_mapping = {}
with gzip.open(
self.root_dir / self.pdb_chain_ec_number_filename, "rt"
) as f:
next(f) # Skip header line
for line in f:
try:
pdb, chain, uniprot_id, ec_number = line.strip().split(
"\t"
)
key = f"{pdb}_{chain}"
ec_number = None if ec_number == "?" else ec_number
ec_mapping[key] = ec_number
except ValueError:
continue
return ec_mapping

def parse(
self,
labels: Optional[
List[Literal["uniprot_id", "cath_code", "ec_number"]]
] = None,
) -> pd.DataFrame:
"""Parse all PDB sequence records.

:param labels: A list of names corresponding to metadata labels that should be included in PDB manager dataframe,
defaults to ``None``.
:type labels: Optional[List[str]], optional

:return: DataFrame containing PDB sequence entries
with their corresponding metadata.
:rtype: pd.DataFrame
Expand Down Expand Up @@ -578,7 +731,15 @@ def parse(self) -> pd.DataFrame:
df["deposition_date"] = df.pdb.map(self._parse_entries())
df["experiment_type"] = df.pdb.map(self._parse_experiment_type())
df["pdb_file_available"] = df.pdb.map(self._parse_pdb_availability())
df.pdb_file_available.fillna(True, inplace=True)
df["pdb_file_available"] = df["pdb_file_available"].fillna(True)
if labels:
if "uniprot_id" in labels:
df["uniprot_id"] = df.id.map(self._parse_uniprot_id())
if "cath_code" in labels:
df["cath_id"] = df.id.map(self._parse_cath_id())
df["cath_code"] = df.cath_id.map(self._parse_cath_code())
if "ec_number" in labels:
df["ec_number"] = df.id.map(self._parse_ec_number())

return df

Expand Down Expand Up @@ -1150,6 +1311,105 @@ def select_complexes_with_grouped_molecule_types(
if update:
self.df = df

def has_uniprot_id(
    self,
    select_ids: Optional[List[str]] = None,
    splits: Optional[List[str]] = None,
    update: bool = False,
) -> pd.DataFrame:
    """
    Select entries that have a uniprot ID.

    :param select_ids: If present, filter for only these IDs. If not present, filter for entries
        that have any uniprot ID.
        defaults to ``None``.
    :type select_ids: Optional[List[str]], optional
    :param splits: Names of splits for which to perform the operation,
        defaults to ``None``.
    :type splits: Optional[List[str]], optional
    :param update: Whether to modify the DataFrame in place, defaults to
        ``False``.
    :type update: bool, optional

    :return: DataFrame of selected molecules.
    :rtype: pd.DataFrame
    """
    # Rows lacking a uniprot ID are dropped up front.
    df = self.get_splits(splits).dropna(subset=["uniprot_id"])

    # Optionally narrow to an explicit set of accessions.
    if select_ids:
        df = df[df["uniprot_id"].isin(select_ids)]

    if update:
        self.df = df
    return df

def has_cath_code(
    self,
    select_ids: Optional[List[str]] = None,
    splits: Optional[List[str]] = None,
    update: bool = False,
) -> pd.DataFrame:
    """
    Select entries that have a cath code.

    :param select_ids: If present, filter for only these CATH codes. If not present, filter for entries
        that have any cath code.
        defaults to ``None``.
    :type select_ids: Optional[List[str]], optional
    :param splits: Names of splits for which to perform the operation,
        defaults to ``None``.
    :type splits: Optional[List[str]], optional
    :param update: Whether to modify the DataFrame in place, defaults to
        ``False``.
    :type update: bool, optional

    :return: DataFrame of selected molecules.
    :rtype: pd.DataFrame
    """
    # Rows lacking a CATH code are dropped up front.
    df = self.get_splits(splits).dropna(subset=["cath_code"])

    # Optionally narrow to an explicit set of CATH codes.
    if select_ids:
        df = df[df["cath_code"].isin(select_ids)]

    if update:
        self.df = df
    return df

def has_ec_number(
    self,
    select_ids: Optional[List[str]] = None,
    splits: Optional[List[str]] = None,
    update: bool = False,
) -> pd.DataFrame:
    """
    Select entries that have an EC number.

    :param select_ids: If present, filter for only these ec_numbers. If not present, filter for entries
        that have any EC number
        defaults to ``None``.
    :type select_ids: Optional[List[str]], optional
    :param splits: Names of splits for which to perform the operation,
        defaults to ``None``.
    :type splits: Optional[List[str]], optional
    :param update: Whether to modify the DataFrame in place, defaults to
        ``False``.
    :type update: bool, optional

    :return: DataFrame of selected molecules.
    :rtype: pd.DataFrame
    """
    # Rows lacking an EC number are dropped up front.
    df = self.get_splits(splits).dropna(subset=["ec_number"])

    # Optionally narrow to an explicit set of EC numbers.
    if select_ids:
        df = df[df["ec_number"].isin(select_ids)]

    if update:
        self.df = df
    return df

def split_df_proportionally(
self,
df: pd.DataFrame,
Expand Down Expand Up @@ -1561,8 +1821,8 @@ def reset(self) -> pd.DataFrame:

def download_pdbs(
self,
out_dir=".",
format="pdb",
out_dir: str = ".",
format: str = "pdb",
splits: Optional[List[str]] = None,
overwrite: bool = False,
max_workers: int = 8,
Expand All @@ -1572,7 +1832,7 @@ def download_pdbs(

:param out_dir: Output directory, defaults to ``"."``
:type out_dir: str, optional
:param format: Filetype to download. ``pdb`` or ``mmtf``.
:param format: Filetype to download. ``pdb``, ``mmtf``, ``mmcif`` or ``bcif``.
:type format: str
:param splits: Names of splits for which to perform the operation,
defaults to ``None``.
Expand Down
Loading
Loading