Embedding spin multiplicity and charge based on SpookyNet implementation #608

Merged
Changes from 23 commits

Commits (25)
fb188e1
initial commit
epens94 Feb 7, 2024
384f778
commit includes all files for
epens94 Feb 7, 2024
09fb039
electron embed for painn
epens94 Feb 12, 2024
bb27705
add several evaluation scripts but later refactor and remove if neces…
epens94 Feb 13, 2024
c1cfd98
cleanup
epens94 Feb 13, 2024
6c524c9
clean up and comments added
epens94 Feb 19, 2024
4266da0
add docstring to electron configuration py
epens94 Mar 6, 2024
b433573
clean up gitignore
epens94 Mar 6, 2024
4d4cc5c
fixing docstring in electronic embedding
epens94 Mar 6, 2024
f8494fd
adding further description to electron configuration
epens94 Mar 6, 2024
2d23890
add docstring to electronic embedding fix unclear naming
epens94 Mar 6, 2024
cd06b83
revert Z back to 100
epens94 Mar 6, 2024
800c3b0
fix docstring nuclear embedding
epens94 Mar 6, 2024
aff25bf
fix naming in nuclear embedding
epens94 Mar 6, 2024
f4ca4ee
move ssp to activations module and add docstring
epens94 Mar 6, 2024
c465ce4
change order to be equal in args in nn embedding
epens94 Mar 6, 2024
2156399
clear naming of vars and remove redundant code
epens94 Mar 6, 2024
c86b404
move all embedding classes into one module and delete not needed modules
epens94 Mar 6, 2024
1ebad7a
fix of init
epens94 Mar 6, 2024
f99a432
activation ssp trainable implement, pass nuclear embedding directly
epens94 Mar 6, 2024
3a399fa
bugfix nuclear embedding
epens94 Mar 6, 2024
64b5d2e
missed one replace string activation function
epens94 Mar 7, 2024
9517bd2
missed one replace string activation function in elec embedding
epens94 Mar 7, 2024
c503f6b
fix docstring, problem with NaN in activation fn, write docstring mor…
epens94 Mar 7, 2024
68dcf26
add electronic embedding to so3 net and bugfix painn and schnet rep
epens94 Mar 12, 2024
2 changes: 1 addition & 1 deletion .gitignore
@@ -125,4 +125,4 @@ interfaces/lammps/examples/*/*.dat
interfaces/lammps/examples/*/deployed_model

# batchwise optimizer examples
examples/howtos/howto_batchwise_relaxations_outputs/*
examples/howtos/howto_batchwise_relaxations_outputs/*
1 change: 1 addition & 0 deletions src/schnetpack/nn/__init__.py
@@ -12,3 +12,4 @@
from schnetpack.nn.scatter import *
from schnetpack.nn.radial import *
from schnetpack.nn.utils import *
from schnetpack.nn.embedding import *
62 changes: 61 additions & 1 deletion src/schnetpack/nn/activations.py
@@ -3,7 +3,7 @@

from torch.nn import functional

__all__ = ["shifted_softplus", "softplus_inverse"]
__all__ = ["shifted_softplus", "softplus_inverse", "ShiftedSoftplus"]


def shifted_softplus(x: torch.Tensor):
@@ -33,3 +33,63 @@ def softplus_inverse(x: torch.Tensor):
torch.Tensor: softplus inverse of input.
"""
return x + (torch.log(-torch.expm1(-x)))


class ShiftedSoftplus(torch.nn.Module):
"""
Shifted softplus activation function with learnable feature-wise parameters:
f(x) = alpha/beta * (softplus(beta*x) - log(2))
softplus(x) = log(exp(x) + 1)
For beta -> 0 : f(x) -> 0.5*alpha*x
For beta -> inf: f(x) -> max(0, alpha*x)

    With learnable parameters alpha and beta, the shifted softplus function can
    become equivalent to ReLU (if alpha equals 1 and beta approaches infinity) or to
    the identity function (if alpha equals 2 and beta equals 0).

Arguments:
num_features (int):
Dimensions of feature space.
initial_alpha (float):
Initial "scale" alpha of the softplus function.
initial_beta (float):
Initial "temperature" beta of the softplus function.
"""

def __init__(
self,
num_features: int,
initial_alpha: float = 1.0,
initial_beta: float = 1.0,
trainable: bool = False) -> None:

""" Initializes the ShiftedSoftplus class. """
super(ShiftedSoftplus, self).__init__()
initial_alpha = torch.tensor(initial_alpha)
initial_beta = torch.tensor(initial_beta)

        if trainable:
            # fill the feature-wise parameters with the initial values;
            # an uninitialized torch.Tensor(num_features) may contain arbitrary values (even NaN)
            self.alpha = torch.nn.Parameter(initial_alpha * torch.ones(num_features))
            self.beta = torch.nn.Parameter(initial_beta * torch.ones(num_features))
        else:
            self.register_buffer("alpha", initial_alpha)
            self.register_buffer("beta", initial_beta)

def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Evaluate activation function given the input features x.
num_features: Dimensions of feature space.

Arguments:
x (FloatTensor [:, num_features]):
Input features.

Returns:
y (FloatTensor [:, num_features]):
Activated features.
"""
return self.alpha * torch.where(
self.beta != 0,
(torch.nn.functional.softplus(self.beta * x) - math.log(2)) / self.beta,
0.5 * x,
)
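
A minimal usage sketch (not part of this diff; it assumes the module path src/schnetpack/nn/activations.py shown above is importable as schnetpack.nn.activations):

import torch
from schnetpack.nn.activations import ShiftedSoftplus

x = torch.linspace(-2.0, 2.0, steps=5)

# default: fixed scalar alpha = beta = 1, registered as buffers
ssp = ShiftedSoftplus(num_features=5)
print(ssp(x))

# learnable feature-wise parameters, one alpha/beta pair per feature
ssp_trainable = ShiftedSoftplus(num_features=5, trainable=True)
print(sorted(name for name, _ in ssp_trainable.named_parameters()))  # ['alpha', 'beta']

# limiting case beta = 0: f(x) = 0.5 * alpha * x, i.e. the identity for alpha = 2
identity_like = ShiftedSoftplus(num_features=5, initial_alpha=2.0, initial_beta=0.0)
assert torch.allclose(identity_like(x), x)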
146 changes: 146 additions & 0 deletions src/schnetpack/nn/blocks.py
@@ -4,6 +4,7 @@
import torch.nn as nn
import torch.nn.functional as F
import schnetpack.nn as snn
from schnetpack.nn.activations import shifted_softplus

__all__ = ["build_mlp", "build_gated_equivariant_mlp"]

@@ -153,3 +154,148 @@ def build_gated_equivariant_mlp(
# put all layers together to make the network
out_net = nn.Sequential(*layers)
return out_net


class Residual(nn.Module):
"""
Pre-activation residual block inspired by He, Kaiming, et al. "Identity
mappings in deep residual networks.".

Arguments:
num_features (int):
Dimensions of feature space.
        activation (Callable or nn.Module):
            Activation function, e.g. a shifted softplus module.
"""

def __init__(
self,
num_features: int,
activation: Union[Callable, nn.Module] = None,
bias: bool = True,
zero_init: bool = True,
) -> None:
""" Initializes the Residual class. """
super(Residual, self).__init__()
# initialize attributes

        self.activation1 = activation
        self.linear1 = nn.Linear(num_features, num_features, bias=bias)
        self.activation2 = activation
self.linear2 = nn.Linear(num_features, num_features, bias=bias)
self.reset_parameters(bias, zero_init)

def reset_parameters(self, bias: bool = True, zero_init: bool = True) -> None:
""" Initialize parameters to compute an identity mapping. """
nn.init.orthogonal_(self.linear1.weight)
if zero_init:
nn.init.zeros_(self.linear2.weight)
else:
nn.init.orthogonal_(self.linear2.weight)
if bias:
nn.init.zeros_(self.linear1.bias)
nn.init.zeros_(self.linear2.bias)

def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Apply residual block to input atomic features.
N: Number of atoms.
num_features: Dimensions of feature space.

Arguments:
x (FloatTensor [N, num_features]):
Input feature representations of atoms.

Returns:
y (FloatTensor [N, num_features]):
Output feature representations of atoms.
"""
y = self.activation1(x)
y = self.linear1(y)
y = self.activation2(y)
y = self.linear2(y)
return x + y
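
A quick sanity check (a sketch outside this diff, using the classes introduced above): with zero_init=True the second linear layer starts at zero, so a freshly constructed block realizes the identity mapping promised by reset_parameters.

import torch
from schnetpack.nn.activations import ShiftedSoftplus
from schnetpack.nn.blocks import Residual

block = Residual(num_features=8, activation=ShiftedSoftplus(8), zero_init=True)
x = torch.randn(4, 8)
# linear2 has zero weights and bias, so the residual branch adds nothing at initialization
assert torch.allclose(block(x), x)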


class ResidualStack(nn.Module):
"""
    Stack of num_residual pre-activation residual blocks evaluated in sequence.

    Arguments:
        num_features (int):
            Dimensions of feature space.
        num_residual (int):
            Number of residual blocks to be stacked in sequence.
        activation (Callable or nn.Module):
            Activation function, e.g. a shifted softplus module.
"""

def __init__(
self,
num_features: int,
num_residual: int,
activation: Union[Callable, nn.Module],
bias: bool = True,
zero_init: bool = True,
) -> None:
""" Initializes the ResidualStack class. """
super(ResidualStack, self).__init__()
self.stack = nn.ModuleList(
[
Residual(num_features, activation, bias, zero_init)
for i in range(num_residual)
]
)

def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Applies all residual blocks to input features in sequence.
N: Number of inputs.
num_features: Dimensions of feature space.

Arguments:
x (FloatTensor [N, num_features]):
Input feature representations.

Returns:
y (FloatTensor [N, num_features]):
Output feature representations.
"""
for residual in self.stack:
x = residual(x)
return x


class ResidualMLP(nn.Module):
    """
    Residual stack followed by an activation and a final linear layer.

    If used with the learnable shifted softplus activation function, the
    activation callable needs to be instantiated with the same num_features.
    """

def __init__(
self,
num_features: int,
num_residual: int,
activation: Union[Callable, nn.Module],
bias: bool = True,
zero_init: bool = False,
) -> None:
super(ResidualMLP, self).__init__()
self.residual = ResidualStack(
num_features, num_residual, activation=activation, bias=bias, zero_init=True
)

self.linear = nn.Linear(num_features, num_features, bias=bias)
self.activation = activation
self.reset_parameters(bias, zero_init)

def reset_parameters(self, bias: bool = True, zero_init: bool = False) -> None:
if zero_init:
nn.init.zeros_(self.linear.weight)
else:
nn.init.orthogonal_(self.linear.weight)
if bias:
nn.init.zeros_(self.linear.bias)

def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.linear(self.activation(self.residual(x)))
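
As the ResidualMLP docstring notes, the learnable shifted softplus carries feature-wise parameters, so the activation callable has to be constructed with the same num_features as the MLP. A hedged usage sketch (sizes are arbitrary):

import torch
from schnetpack.nn.activations import ShiftedSoftplus
from schnetpack.nn.blocks import ResidualMLP

num_features = 16
activation = ShiftedSoftplus(num_features, trainable=True)  # feature-wise alpha/beta
mlp = ResidualMLP(num_features, num_residual=2, activation=activation)

x = torch.randn(10, num_features)
print(mlp(x).shape)  # torch.Size([10, 16])

Note that, as written, the same activation instance (and hence its learnable parameters) is shared by every residual block and the final activation; a factory or per-block copies would be needed for independent parameters.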
124 changes: 124 additions & 0 deletions src/schnetpack/nn/electronic_embeeding.py
@@ -0,0 +1,124 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from typing import Callable, Optional, Union

from schnetpack.nn.activations import shifted_softplus
from schnetpack.nn.blocks import ResidualMLP

class ElectronicEmbedding(nn.Module):
"""
    Single-head self-attention-like block for updating atomic features through
    nonlocal interactions with the electrons.
    [Review comment (Collaborator): could you add 2 more sentences what happens in that class?]
    [Reply (Author): done in 2d23890]

    The embeddings map the total molecular charge or the molecular spin to a feature vector.
    Since these properties are not localized on a specific atom, they have to be delocalized
    over the whole molecule. The delocalization is achieved with a self-attention-like mechanism.


Arguments:
        num_features (int):
            Dimensions of feature space, i.e. the number of features used to describe
            atomic environments; determines the size of each embedding vector.
        num_residual (int):
            Number of residual blocks applied to the atomic features.
        activation (Callable or nn.Module):
            Activation function, e.g. a shifted softplus module.
        is_charged (bool):
            True: the embedding describes the total molecular charge and separate
            weights are used for positive and negative charges.
            False: the embedding describes the spin values and no separate
            weights are used.
"""

def __init__(
self,
num_features: int,
num_residual: int,
        activation: Union[Callable, nn.Module] = shifted_softplus,
is_charged: bool = False,
) -> None:
""" Initializes the ElectronicEmbedding class. """
super(ElectronicEmbedding, self).__init__()
self.is_charged = is_charged
self.linear_q = nn.Linear(num_features, num_features)
if is_charged: # charges are duplicated to use separate weights for +/-
self.linear_k = nn.Linear(2, num_features, bias=False)
self.linear_v = nn.Linear(2, num_features, bias=False)
else:
self.linear_k = nn.Linear(1, num_features, bias=False)
self.linear_v = nn.Linear(1, num_features, bias=False)
self.resblock = ResidualMLP(
num_features,
num_residual,
activation=activation,
zero_init=True,
bias=False,
)
self.reset_parameters()

def reset_parameters(self) -> None:
""" Initialize parameters. """
nn.init.orthogonal_(self.linear_k.weight)
nn.init.orthogonal_(self.linear_v.weight)
nn.init.orthogonal_(self.linear_q.weight)
nn.init.zeros_(self.linear_q.bias)

def forward(
self,
atomic_features: torch.Tensor,
electronic_feature: torch.Tensor,
num_batch: int,
batch_seg: torch.Tensor,
eps: float = 1e-8,
) -> torch.Tensor:
"""
Evaluate interaction block.

atomic_features (FloatTensor [N, num_features]):
Atomic feature vectors.
        electronic_feature (FloatTensor [num_batch]):
            total charge or spin value per molecular graph
        num_batch (int):
            number of molecular graphs in the batch
        batch_seg (LongTensor [N]):
            segment ids (aka _idx_m) used to assign each atom to its molecule in the batch
eps (float):
small number to avoid division by zero
"""

# queries (Batchsize x N_atoms, n_atom_basis)
q = self.linear_q(atomic_features)

# to account for negative and positive charge
if self.is_charged:
e = F.relu(torch.stack([electronic_feature, -electronic_feature], dim=-1))
# +/- spin is the same => abs
else:
e = torch.abs(electronic_feature).unsqueeze(-1)
enorm = torch.maximum(e, torch.ones_like(e))

# keys (Batchsize x N_atoms, n_atom_basis), the batch_seg ensures that the key is the same for all atoms belonging to the same graph
k = self.linear_k(e / enorm)[batch_seg]

# values (Batchsize x N_atoms, n_atom_basis) the batch_seg ensures that the value is the same for all atoms belonging to the same graph
v = self.linear_v(e)[batch_seg]

# unnormalized, scaled attention weights, obtained by dot product of queries and keys (are logits)
# scaling by square root of attention dimension
weights = torch.sum(k * q, dim=-1) / k.shape[-1] ** 0.5

        # non-negative attention weights: softplus is applied to the logits instead of the
        # exponential of a standard softmax; the per-graph normalization below completes the distribution
a = nn.functional.softplus(weights)

# normalization factor for every molecular graph, by adding up attention weights of every atom in the graph
anorm = a.new_zeros(num_batch).index_add_(0, batch_seg, a)

# make tensor filled with anorm value at the position of the corresponding molecular graph,
# indexing faster on CPU, gather faster on GPU
if a.device.type == "cpu":
anorm = anorm[batch_seg]
else:
anorm = torch.gather(anorm, 0, batch_seg)

        # weight the values with the normalized attention and pass the result through the
        # residual MLP; eps is added for numerical stability (avoids division by zero)
return self.resblock((a / (anorm + eps)).unsqueeze(-1) * v)
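
A hypothetical two-molecule batch to illustrate the calling convention (module and file names as in this diff; charges, segment ids, and sizes are made up for illustration):

import torch
from schnetpack.nn.activations import ShiftedSoftplus
from schnetpack.nn.electronic_embeeding import ElectronicEmbedding

num_features = 16
embedding = ElectronicEmbedding(
    num_features,
    num_residual=1,
    activation=ShiftedSoftplus(num_features, trainable=True),
    is_charged=True,
)

# molecule 0 has 3 atoms and total charge +1, molecule 1 has 2 atoms and total charge -1
atomic_features = torch.randn(5, num_features)
total_charge = torch.tensor([1.0, -1.0])   # one value per molecular graph
batch_seg = torch.tensor([0, 0, 0, 1, 1])  # molecule index (idx_m) of every atom

delta = embedding(atomic_features, total_charge, num_batch=2, batch_seg=batch_seg)
print(delta.shape)  # torch.Size([5, 16]): one update vector per atom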