diff --git a/CHANGELOG.md b/CHANGELOG.md index 80ccadf3..57f7d4b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,20 @@ +### 1.5.2 + +#### GRN +* [Bugfix] - [#208](https://github.com/a-r-j/graphein/pull/208) - Resolves SSL issues with RegNetwork. + + +#### ML +* [Feature] - [#208](https://github.com/a-r-j/graphein/pull/208) - Adds support for loading local PDB files in ``ProteinGraphDataset`` and ``InMemoryProteinGraphDataset``. +> This adds a `pdb_paths` parameter and sets `self.raw_dir` to the parent directory (`self.pdb_path`) of the files listed in `pdb_paths` (only a single directory is supported, so all PDB files must be in the same folder). +> +> PDB files are then loaded from `self.pdb_path` instead of from the default `root/raw` directory. +> To download structures from AlphaFold (AF2) or the PDB instead, leave `pdb_paths` set to `None` and the previous behaviour is retained. + +#### CI +* [Bugfix] - [#208](https://github.com/a-r-j/graphein/pull/208) - Explicitly installs `jupyter_contrib_nbextensions` in Docker. + + ### 1.5.1 #### Protein diff --git a/Dockerfile b/Dockerfile index c86d49b1..ec0d6904 100644 --- a/Dockerfile +++ b/Dockerfile @@ -42,9 +42,7 @@ RUN conda install -c fvcore -c iopath -c conda-forge fvcore iopath RUN conda install -c pytorch3d pytorch3d RUN conda install -c dglteam dgl RUN conda install -c salilab dssp - RUN conda install -c conda-forge ipywidgets -RUN jupyter nbextension enable --py widgetsnbextension RUN export CUDA=$(python -c "import torch; print('cu'+torch.version.cuda.replace('.',''))") \ && export TORCH=$(python -c "import torch; print(torch.__version__)") \ @@ -54,6 +52,8 @@ RUN export CUDA=$(python -c "import torch; print('cu'+torch.version.cuda.replace && pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-${TORCH}+${CUDA}.html --no-cache-dir \ && pip install torch-geometric --no-cache-dir +RUN pip install jupyter_contrib_nbextensions +RUN jupyter nbextension enable --py widgetsnbextension # Testing # docker-compose -f docker-compose.cpu.yml up -d --build 
diff --git a/graphein/grn/parse_regnetwork.py b/graphein/grn/parse_regnetwork.py index 8b0a2282..8677866b 100644 --- a/graphein/grn/parse_regnetwork.py +++ b/graphein/grn/parse_regnetwork.py @@ -14,6 +14,7 @@ import pandas as pd import wget +import ssl from graphein.utils.utils import filter_dataframe, ping @@ -41,10 +42,10 @@ def _download_RegNetwork( "RegNetwork is not available. Please check your internet connection or verify at: http://www.regnetworkweb.org" ) - mouse_url = "http://regnetworkweb.org/download/mouse.zip" + mouse_url = "https://regnetworkweb.org/download/mouse.zip" if network_type == "human": - human_url = "http://www.regnetworkweb.org/download/human.zip" + human_url = "https://regnetworkweb.org/download/human.zip" url = human_url elif network_type == "mouse": url = mouse_url @@ -66,8 +67,12 @@ def _download_RegNetwork( # Download data and unzip if not os.path.exists(file): log.info("Downloading RegNetwork ...") + # switch ssl context for unverified download + default_https_context = ssl._create_default_https_context + ssl._create_default_https_context = ssl._create_unverified_context wget.download(url, compressed_file) - + # switch ssl context back to default + ssl._create_default_https_context = default_https_context with zipfile.ZipFile(compressed_file, "r") as zip_ref: zip_ref.extractall(out_dir) @@ -80,7 +85,7 @@ def _download_RegNetwork_regtypes(root_dir: Optional[Path] = None) -> str: :param root_dir: Path object specifying the location to download RegNetwork to """ - url = "http://www.regnetworkweb.org/download/RegulatoryDirections.zip" + url = "https://regnetworkweb.org/download/RegulatoryDirections.zip" if root_dir is None: root_dir = Path(__file__).parent.parent.parent / "datasets" @@ -94,7 +99,12 @@ def _download_RegNetwork_regtypes(root_dir: Optional[Path] = None) -> str: # Download data and unzip if not os.path.exists(file): log.info("Downloading RegNetwork reg types ...") + # switch ssl context for unverified download + 
default_https_context = ssl._create_default_https_context + ssl._create_default_https_context = ssl._create_unverified_context wget.download(url, compressed_file) + # switch ssl context back to default + ssl._create_default_https_context = default_https_context with zipfile.ZipFile(compressed_file, "r") as zip_ref: zip_ref.extractall(out_dir) diff --git a/graphein/ml/datasets/torch_geometric_dataset.py b/graphein/ml/datasets/torch_geometric_dataset.py index 3665918f..5de6f9eb 100644 --- a/graphein/ml/datasets/torch_geometric_dataset.py +++ b/graphein/ml/datasets/torch_geometric_dataset.py @@ -41,6 +41,7 @@ def __init__( self, root: str, name: str, + pdb_paths: Optional[List[str]] = None, pdb_codes: Optional[List[str]] = None, uniprot_ids: Optional[List[str]] = None, graph_label_map: Optional[Dict[str, torch.Tensor]] = None, @@ -72,6 +73,8 @@ def __init__( :type root: str :param name: Name of the dataset. Will be saved to ``data_$name.pt``. :type name: str + :param pdb_paths: List of full path of pdb files to load. Defaults to ``None``. + :type pdb_paths: Optional[List[str]], optional :param pdb_codes: List of PDB codes to download and parse from the PDB. Defaults to None. 
:type pdb_codes: Optional[List[str]], optional @@ -135,6 +138,23 @@ def __init__( else None ) + self.pdb_paths = pdb_paths + if self.pdb_paths is None: + if self.pdb_codes and self.uniprot_ids: + self.structures = self.pdb_codes + self.uniprot_ids + elif self.pdb_codes: + self.structures = pdb_codes + elif self.uniprot_ids: + self.structures = uniprot_ids + # Use local saved pdb_files instead of download or move them to self.root/raw dir + else: + if isinstance(self.pdb_paths, list): + self.structures = [ + os.path.splitext(os.path.split(pdb_path)[-1])[0] + for pdb_path in self.pdb_paths + ] + self.pdb_path, _ = os.path.split(self.pdb_paths[0]) + if self.pdb_codes and self.uniprot_ids: self.structures = self.pdb_codes + self.uniprot_ids elif self.pdb_codes: @@ -157,6 +177,7 @@ def __init__( self.graph_transformation_funcs = graph_transformation_funcs self.pdb_transform = pdb_transform self.num_cores = num_cores + self.af_version = af_version super().__init__( root, transform=transform, @@ -176,6 +197,13 @@ def processed_file_names(self) -> List[str]: """Name of the processed file.""" return [f"data_{self.name}.pt"] + @property + def raw_dir(self) -> str: + if self.pdb_paths is not None: + return self.pdb_path # replace raw dir with user local pdb_path + else: + return os.path.join(self.root, "raw") + def download(self): """Download the PDB files from RCSB or Alphafold.""" self.config.pdb_dir = Path(self.raw_dir) @@ -298,7 +326,8 @@ def process(self): class ProteinGraphDataset(Dataset): def __init__( self, - root, + root: str, + pdb_paths: Optional[List[str]] = None, pdb_codes: Optional[List[str]] = None, uniprot_ids: Optional[List[str]] = None, # graph_label_map: Optional[Dict[str, int]] = None, @@ -327,6 +356,8 @@ def __init__( :param root: Root directory where the dataset should be saved. :type root: str + :param pdb_paths: List of full path of pdb files to load. Defaults to ``None``. 
+ :type pdb_paths: Optional[List[str]], optional :param pdb_codes: List of PDB codes to download and parse from the PDB. Defaults to ``None``. :type pdb_codes: Optional[List[str]], optional @@ -388,14 +419,22 @@ def __init__( if uniprot_ids is not None else None ) - - if self.pdb_codes and self.uniprot_ids: - self.structures = self.pdb_codes + self.uniprot_ids - elif self.pdb_codes: - self.structures = pdb_codes - elif self.uniprot_ids: - self.structures = uniprot_ids - self.af_version = af_version + self.pdb_paths = pdb_paths + if self.pdb_paths is None: + if self.pdb_codes and self.uniprot_ids: + self.structures = self.pdb_codes + self.uniprot_ids + elif self.pdb_codes: + self.structures = pdb_codes + elif self.uniprot_ids: + self.structures = uniprot_ids + # Use local saved pdb_files instead of download or move them to self.root/raw dir + else: + if isinstance(self.pdb_paths, list): + self.structures = [ + os.path.splitext(os.path.split(pdb_path)[-1])[0] + for pdb_path in self.pdb_paths + ] + self.pdb_path, _ = os.path.split(self.pdb_paths[0]) # Labels & Chains @@ -424,6 +463,7 @@ def __init__( self.num_cores = num_cores self.pdb_transform = pdb_transform self.graph_transformation_funcs = graph_transformation_funcs + self.af_version = af_version super().__init__( root, transform=transform, @@ -450,6 +490,13 @@ def processed_file_names(self) -> List[str]: else: return [f"{pdb}.pt" for pdb in self.structures] + @property + def raw_dir(self) -> str: + if self.pdb_paths is not None: + return self.pdb_path # replace raw dir with user local pdb_path + else: + return os.path.join(self.root, "raw") + def validate_input(self): if self.graph_label_map is not None: assert len(self.structures) == len( @@ -554,6 +601,7 @@ def divide_chunks(l: List[str], n: int = 2) -> Generator: # Create graph objects file_names = [f"{self.raw_dir}/{pdb}.pdb" for pdb in pdbs] + graphs = construct_graphs_mp( pdb_path_it=file_names, config=self.config, diff --git 
a/notebooks/dataloader_tutorial.ipynb b/notebooks/dataloader_tutorial.ipynb index 57d68cd4..3b5dcfd8 100644 --- a/notebooks/dataloader_tutorial.ipynb +++ b/notebooks/dataloader_tutorial.ipynb @@ -54,6 +54,8 @@ " # Root directory where the dataset should be saved.\n", " name: str, \n", " # Name of the dataset. Will be saved to ``data_$name.pt``.\n", + " pdb_paths: Optional[List[str]] = None,\n", + " # List of full paths to the PDB files to load.\n", " pdb_codes: Optional[List[str]] = None, \n", " # List of PDB codes to download and parse from the PDB.\n", " uniprot_ids: Optional[List[str]] = None, \n", @@ -90,7 +92,7 @@ "#### Directory Structure\n", "Creating a ``ProteinGraphDataset`` will create two directories under ``root``:\n", "\n", - "* ``root/raw`` - Contains raw PDB files\n", + "* ``root/raw`` - Contains the downloaded raw PDB files\n", "* ``root/processed`` - Contains processed graphs (in ``pytorch_geometric.data.Data`` format) saved as ``$PDB.pt / $UNIPROT_ID.pt``" ] }, @@ -156,6 +158,75 @@ " break" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Load from local path\n", + "\n", + "\n", + "Creating a ``ProteinGraphDataset`` from a list of full paths to local PDB files:\n", + "\n", + "* ``root/raw`` - Will be empty since no PDB files are downloaded\n", + "* ``root/processed`` - Contains processed graphs (in ``pytorch_geometric.data.Data`` format) saved as ``$PDB.pt / $UNIPROT_ID.pt``" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['../tests/protein/test_data/1lds.pdb', '../tests/protein/test_data/4hhb.pdb', '../tests/protein/test_data/alphafold_structure.pdb']\n" + ] + } + ], + "source": [ + "# import sys\n", + "# sys.path.append('../') # add system path for python\n", + "\n", + "import os \n", + "from graphein.protein.config import ProteinGraphConfig\n", + "from graphein.ml import ProteinGraphDataset, 
ProteinGraphListDataset\n", + "import torch \n", + "\n", + "local_dir = \"../tests/protein/test_data/\"\n", + "pdb_paths = [os.path.join(local_dir, pdb_path) for pdb_path in os.listdir(local_dir) if pdb_path.endswith(\".pdb\")]\n", + "print(pdb_paths)\n", + "\n", + "# let's load local dataset from local_dir!\n", + "ds = ProteinGraphDataset(\n", + " root = \"../graphein/ml/datasets/test\",\n", + " pdb_paths = pdb_paths,\n", + " graphein_config=ProteinGraphConfig(),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DataBatch(edge_index=[2, 666], node_id=[2], coords=[2], name=[2], dist_mat=[2], num_nodes=671, batch=[671], ptr=[3])\n" + ] + } + ], + "source": [ + "# Create a dataloader from dataset and inspect a batch\n", + "from torch_geometric.loader import DataLoader\n", + "dl = DataLoader(ds, batch_size=2, shuffle=True, drop_last=True)\n", + "for i in dl:\n", + " print(i)\n", + " break" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -171,6 +242,8 @@ " # Root directory where the dataset should be saved.\n", " name: str, \n", " # Name of the dataset. 
Will be saved to ``data_$name.pt``.\n", + " pdb_paths: Optional[List[str]] = None,\n", + " # List of full paths to the PDB files to load.\n", " pdb_codes: Optional[List[str]] = None, \n", " # List of PDB codes to download and parse from the PDB.\n", " uniprot_ids: Optional[List[str]] = None, \n", @@ -292,6 +365,124 @@ " break" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Load from local path\n", + "\n", + "\n", + "Creating an ``InMemoryProteinGraphDataset`` from a list of full paths to local PDB files:\n", + "\n", + "* ``root/raw`` - Will be empty since no PDB files are downloaded\n", + "* ``root/processed`` - Contains processed datasets saved as ``data_{name}.pt``\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['../tests/protein/test_data/1lds.pdb', '../tests/protein/test_data/4hhb.pdb', '../tests/protein/test_data/alphafold_structure.pdb']\n", + "Constructing Graphs...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing...\n" + ] + }, + { + "data": { + "application/json": { + "ascii": false, + "bar_format": null, + "colour": null, + "elapsed": 0.2526402473449707, + "initial": 0, + "n": 0, + "ncols": null, + "nrows": null, + "postfix": null, + "prefix": "", + "rate": null, + "total": 3, + "unit": "it", + "unit_divisor": 1000, + "unit_scale": false + }, + "application/vnd.jupyter.widget-view+json": { + "model_id": "d5ed353098664f6f803fa502264df986", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3 [00:00