From 4fa4d2ca87b736d04a05609d1512845393cfb5e2 Mon Sep 17 00:00:00 2001 From: Anton Bushuiev Date: Wed, 23 Aug 2023 17:39:27 +0200 Subject: [PATCH 01/10] Fix nx -> pyg conversion corner case when graph has no edges --- graphein/ml/conversion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/graphein/ml/conversion.py b/graphein/ml/conversion.py index 7d7e3edb0..58dcf4160 100644 --- a/graphein/ml/conversion.py +++ b/graphein/ml/conversion.py @@ -289,7 +289,8 @@ def convert_nx_to_pyg(self, G: nx.Graph) -> Data: data[key].append(value) # Add edge features - edge_feature_names = list(G.edges(data=True))[0][2].keys() + edge_list = list(G.edges(data=True)) + edge_feature_names = edge_list[0][2].keys() if edge_list else [] edge_feature_names = list( filter( lambda x: x in self.columns and x != "kind", edge_feature_names From cac8b93a55a85f317145a1b6741727c0df839ee2 Mon Sep 17 00:00:00 2001 From: Anton Bushuiev Date: Wed, 23 Aug 2023 17:43:57 +0200 Subject: [PATCH 02/10] Update changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5286fa214..d240a78e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,9 @@ * [Bugfix] - [#305](https://github.com/a-r-j/graphein/pull/305) Fixes the construction of geometric features when beta-carbons or side chains are missing in non-glycine residues (for example in `H:CYS:104` in 3SE8). * [Bugfix] - [#305](https://github.com/a-r-j/graphein/pull/305) Fixes data types of geometric feature vectors: `object` -> `float`. * [Bugfix] - [#301](https://github.com/a-r-j/graphein/pull/301) Fixes the conversion of undirected NetworkX graph to directed PyG data. +* [Bugfix] - [#334](https://github.com/a-r-j/graphein/pull/334) Fixes the corner case of the NetworkX -> PyG conversion when input graph has no edges. + +https://github.com/a-r-j/graphein/pull/334 #### Bugfixes * Adds missing `stage` parameter to `graphein.ml.datasets.foldcomp_data.FoldCompDataModule.setup()`. 
[#310](https://github.com/a-r-j/graphein/pull/310) From 472518e93b89f6cc9311b0f00bc3bcb2a17c931a Mon Sep 17 00:00:00 2001 From: Anton Bushuiev Date: Sat, 26 Aug 2023 14:13:04 +0200 Subject: [PATCH 03/10] Add todos --- graphein/protein/features/nodes/geometry.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/graphein/protein/features/nodes/geometry.py b/graphein/protein/features/nodes/geometry.py index ea2e6f3f3..b3876285b 100644 --- a/graphein/protein/features/nodes/geometry.py +++ b/graphein/protein/features/nodes/geometry.py @@ -179,10 +179,12 @@ def add_sequence_neighbour_vector( ) continue # Asserts residues are on the same chain + # TODO It seems redunant cond_1 = ( residue[1]["chain_id"] == chain_residues[i + 1][1]["chain_id"] ) # Asserts residue numbers are adjacent + # TODO What about insertions? cond_2 = ( abs( residue[1]["residue_number"] @@ -192,6 +194,7 @@ def add_sequence_neighbour_vector( ) # If this checks out, we compute the vector + # TODO What if not? -> vec is unknown if (cond_1) and (cond_2): vec = chain_residues[i + 1][1]["coords"] - residue[1]["coords"] From 1343e3c546f232679ea28f0128bd05915fada845 Mon Sep 17 00:00:00 2001 From: Anton Bushuiev Date: Sat, 26 Aug 2023 14:17:22 +0200 Subject: [PATCH 04/10] Remove redundant check --- graphein/protein/features/nodes/geometry.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/graphein/protein/features/nodes/geometry.py b/graphein/protein/features/nodes/geometry.py index b3876285b..fcf6ab22e 100644 --- a/graphein/protein/features/nodes/geometry.py +++ b/graphein/protein/features/nodes/geometry.py @@ -178,14 +178,10 @@ def add_sequence_neighbour_vector( [0.0, 0.0, 0.0] ) continue - # Asserts residues are on the same chain - # TODO It seems redunant - cond_1 = ( - residue[1]["chain_id"] == chain_residues[i + 1][1]["chain_id"] - ) + # Asserts residue numbers are adjacent # TODO What about insertions? 
- cond_2 = ( + cond = ( abs( residue[1]["residue_number"] - chain_residues[i + 1][1]["residue_number"] @@ -195,7 +191,7 @@ def add_sequence_neighbour_vector( # If this checks out, we compute the vector # TODO What if not? -> vec is unknown - if (cond_1) and (cond_2): + if cond: vec = chain_residues[i + 1][1]["coords"] - residue[1]["coords"] if reverse: From 29ca587c07d86a5eb2eb93fcc7e69c3f2cd15f98 Mon Sep 17 00:00:00 2001 From: Anton Bushuiev Date: Sat, 26 Aug 2023 14:26:03 +0200 Subject: [PATCH 05/10] Fix propagation of same `vec` on non-adjacent nodes --- graphein/protein/features/nodes/geometry.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/graphein/protein/features/nodes/geometry.py b/graphein/protein/features/nodes/geometry.py index fcf6ab22e..3a6d2260b 100644 --- a/graphein/protein/features/nodes/geometry.py +++ b/graphein/protein/features/nodes/geometry.py @@ -190,7 +190,6 @@ def add_sequence_neighbour_vector( ) # If this checks out, we compute the vector - # TODO What if not? 
-> vec is unknown if cond: vec = chain_residues[i + 1][1]["coords"] - residue[1]["coords"] @@ -198,6 +197,8 @@ def add_sequence_neighbour_vector( vec = -vec if scale: vec = vec / np.linalg.norm(vec) + else: + vec = np.array([0.0, 0.0, 0.0]) residue[1][f"sequence_neighbour_vector_{suffix}"] = vec From 1f5826ff6c99f0fed3887a2ce46f27d0d6833efa Mon Sep 17 00:00:00 2001 From: Anton Bushuiev Date: Sat, 26 Aug 2023 14:41:23 +0200 Subject: [PATCH 06/10] Fix adjacency check for insertion codes --- graphein/protein/features/nodes/geometry.py | 30 ++++++++++++++++----- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/graphein/protein/features/nodes/geometry.py b/graphein/protein/features/nodes/geometry.py index 3a6d2260b..10e68b781 100644 --- a/graphein/protein/features/nodes/geometry.py +++ b/graphein/protein/features/nodes/geometry.py @@ -179,18 +179,36 @@ def add_sequence_neighbour_vector( ) continue - # Asserts residue numbers are adjacent - # TODO What about insertions? - cond = ( + # Get insertion codes + ins_current = ( + residue[0].split(':')[3] + if residue[0].count(':') > 2 + else '' + ) + ins_next = ( + chain_residues[i + 1][0].split(':')[3] + if chain_residues[i + 1][0].count(':') > 2 + else '' + ) + + # Asserts residues are adjacent + cond_adjacent = ( abs( residue[1]["residue_number"] - chain_residues[i + 1][1]["residue_number"] ) == 1 + or ( + not ins_current and ins_next == 'A' + ) + or ( + ins_current and ins_next + and chr(ord(ins_current) + 1) == ins_next + ) ) - # If this checks out, we compute the vector - if cond: + # If this checks out, we compute the non-zero vector + if cond_adjacent: vec = chain_residues[i + 1][1]["coords"] - residue[1]["coords"] if reverse: @@ -198,7 +216,7 @@ def add_sequence_neighbour_vector( if scale: vec = vec / np.linalg.norm(vec) else: - vec = np.array([0.0, 0.0, 0.0]) + vec = np.array([0.0, 0.0, 0.0]) residue[1][f"sequence_neighbour_vector_{suffix}"] = vec From 9e638c8e170020a8c5e3728d1b9e26aa0942c527 Mon Sep 17 
00:00:00 2001 From: Anton Bushuiev Date: Sat, 26 Aug 2023 15:10:14 +0200 Subject: [PATCH 07/10] Fix adjacency check for insertion codes for backward order --- graphein/protein/features/nodes/geometry.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/graphein/protein/features/nodes/geometry.py b/graphein/protein/features/nodes/geometry.py index 10e68b781..e65cf644b 100644 --- a/graphein/protein/features/nodes/geometry.py +++ b/graphein/protein/features/nodes/geometry.py @@ -190,19 +190,25 @@ def add_sequence_neighbour_vector( if chain_residues[i + 1][0].count(':') > 2 else '' ) + if not n_to_c: + ins_current, ins_next = ins_next, ins_current + + # Get sequence distance + dist = abs( + residue[1]["residue_number"] + - chain_residues[i + 1][1]["residue_number"] + ) # Asserts residues are adjacent cond_adjacent = ( - abs( - residue[1]["residue_number"] - - chain_residues[i + 1][1]["residue_number"] - ) - == 1 + dist == 1 or ( - not ins_current and ins_next == 'A' + dist == 0 + and not ins_current and ins_next == 'A' ) or ( - ins_current and ins_next + dist == 0 + and ins_current and ins_next and chr(ord(ins_current) + 1) == ins_next ) ) From 3b7fac837d15871b1d06bf405babc45d3523f630 Mon Sep 17 00:00:00 2001 From: Anton Bushuiev Date: Sat, 26 Aug 2023 15:10:32 +0200 Subject: [PATCH 08/10] Test `add_sequence_neighbour_vector` --- tests/protein/nodes/features/test_geometry.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/protein/nodes/features/test_geometry.py b/tests/protein/nodes/features/test_geometry.py index 0d08d1fca..a525c6c40 100644 --- a/tests/protein/nodes/features/test_geometry.py +++ b/tests/protein/nodes/features/test_geometry.py @@ -8,6 +8,7 @@ import operator from functools import partial +import pytest import numpy as np from loguru import logger @@ -16,6 +17,7 @@ add_beta_carbon_vector, add_sidechain_vector, add_virtual_beta_carbon_vector, + add_sequence_neighbour_vector ) from 
graphein.protein.graphs import construct_graph @@ -195,3 +197,22 @@ def test_add_virtual_beta_carbon_vector(): g = construct_graph(config=config, pdb_code="7w9w") for n, d in g.nodes(data=True): assert d["virtual_c_beta_vector"].shape == (3,) + + +@pytest.mark.parametrize("n_to_c", [True, False]) +def test_add_sequence_neighbour_vector(n_to_c): + config = ProteinGraphConfig(edge_construction_functions=[]) + g = construct_graph(pdb_code="1igt", config=config) + add_sequence_neighbour_vector(g, n_to_c=n_to_c) + + key = "sequence_neighbour_vector_" + ("n_to_c" if n_to_c else "c_to_n") + for n, d in g.nodes(data=True): + # Check that the node has the correct attributes + assert key in d.keys() + # Check the vector is of the correct dimensionality + assert d[key].shape == (3,) + + # check A insertions have non-zero backward vectors + print(n, n_to_c, d[key]) + if n.endswith(":A") and not n_to_c: + assert np.any(np.not_equal(d[key], [0.0, 0.0, 0.0])) From ba6c1cb2fc72d7e2803e2c950907dcba8d052618 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 26 Aug 2023 13:18:12 +0000 Subject: [PATCH 09/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- graphein/protein/features/nodes/geometry.py | 22 ++++++++----------- tests/protein/nodes/features/test_geometry.py | 4 ++-- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/graphein/protein/features/nodes/geometry.py b/graphein/protein/features/nodes/geometry.py index e65cf644b..b24c0880a 100644 --- a/graphein/protein/features/nodes/geometry.py +++ b/graphein/protein/features/nodes/geometry.py @@ -181,14 +181,12 @@ def add_sequence_neighbour_vector( # Get insertion codes ins_current = ( - residue[0].split(':')[3] - if residue[0].count(':') > 2 - else '' + residue[0].split(":")[3] if residue[0].count(":") > 2 else "" ) ins_next = ( - chain_residues[i + 1][0].split(':')[3] - if chain_residues[i + 
1][0].count(':') > 2 - else '' + chain_residues[i + 1][0].split(":")[3] + if chain_residues[i + 1][0].count(":") > 2 + else "" ) if not n_to_c: ins_current, ins_next = ins_next, ins_current @@ -198,17 +196,15 @@ def add_sequence_neighbour_vector( residue[1]["residue_number"] - chain_residues[i + 1][1]["residue_number"] ) - + # Asserts residues are adjacent cond_adjacent = ( dist == 1 + or (dist == 0 and not ins_current and ins_next == "A") or ( dist == 0 - and not ins_current and ins_next == 'A' - ) - or ( - dist == 0 - and ins_current and ins_next + and ins_current + and ins_next and chr(ord(ins_current) + 1) == ins_next ) ) @@ -222,7 +218,7 @@ def add_sequence_neighbour_vector( if scale: vec = vec / np.linalg.norm(vec) else: - vec = np.array([0.0, 0.0, 0.0]) + vec = np.array([0.0, 0.0, 0.0]) residue[1][f"sequence_neighbour_vector_{suffix}"] = vec diff --git a/tests/protein/nodes/features/test_geometry.py b/tests/protein/nodes/features/test_geometry.py index a525c6c40..b23eb4314 100644 --- a/tests/protein/nodes/features/test_geometry.py +++ b/tests/protein/nodes/features/test_geometry.py @@ -8,16 +8,16 @@ import operator from functools import partial -import pytest import numpy as np +import pytest from loguru import logger from graphein.protein.config import ProteinGraphConfig from graphein.protein.features.nodes.geometry import ( add_beta_carbon_vector, + add_sequence_neighbour_vector, add_sidechain_vector, add_virtual_beta_carbon_vector, - add_sequence_neighbour_vector ) from graphein.protein.graphs import construct_graph From 5601e14433f819f3b6838e193a4127d78621f91e Mon Sep 17 00:00:00 2001 From: Anton Bushuiev <67932762+anton-bushuiev@users.noreply.github.com> Date: Mon, 2 Oct 2023 23:21:05 +0200 Subject: [PATCH 10/10] Update CHANGELOG.md --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ffcc07476..14576829e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +### Unreleased + +* Fixes 
`add_sequence_neighbour_vector` to assign a zero vector when a residue has no adjacent sequence neighbour, and extends the adjacency check to handle insertion codes ([#336](https://github.com/a-r-j/graphein/pull/336)). + ### 1.7.3 - 30/08/2023 * Fixes edge case in FoldComp database download if target directory has same name as database ([#339](https://github.com/a-r-j/graphein/pull/339))