Molecular input (#234)

This PR adds molecular GPs from Gauche.

---------

Co-authored-by: Simon <simon.sung@evonik.com>
Co-authored-by: Johannes P. Dürholt <johannespeter.duerholt@evonik.com>
experimental-design · Aug 2, 2023 · e313881 · e313881
1 parent 0215f59
commit e313881
Show file tree

Hide file tree

Showing 37 changed files with 4,312 additions and 161 deletions.
diff --git a/bofire/benchmarks/multi.py b/bofire/benchmarks/multi.py
@@ -479,7 +479,7 @@ def __init__(
         data_model = SingleTaskGPSurrogate(
             inputs=Inputs(features=inputs),
             outputs=Outputs(features=[outputs[0]]),
-            input_preprocessing_specs=input_preprocessing_specs,
+            input_preprocessing_specs=input_preprocessing_specs,  # type: ignore
         )
         ground_truth_yield = surrogates.map(data_model)
 

diff --git a/bofire/data_models/api.py b/bofire/data_models/api.py
@@ -17,6 +17,10 @@
 try:
     # in case of the minimal installation these import are not available
     from bofire.data_models.kernels.api import AnyKernel, Kernel
+    from bofire.data_models.molfeatures.api import (  # noqa: F401
+        AnyMolFeatures,
+        MolFeatures,
+    )
     from bofire.data_models.objectives.api import AnyObjective, Objective
     from bofire.data_models.outlier_detection.api import (
         AnyOutlierDetection,
@@ -49,6 +53,7 @@
         AnyObjective,
         AnyPrior,
         AnyStrategy,
+        AnyMolFeatures,
         Domain,
     ]
 except ImportError:

diff --git a/bofire/data_models/domain/features.py b/bofire/data_models/domain/features.py
@@ -22,9 +22,11 @@
     ContinuousOutput,
     DiscreteInput,
     Input,
+    MolecularInput,
     Output,
     TInputTransformSpecs,
 )
+from bofire.data_models.molfeatures.api import MolFeatures
 from bofire.data_models.objectives.api import AbstractObjective, Objective
 
 FeatureSequence = Union[List[AnyFeature], Tuple[AnyFeature]]
@@ -348,6 +350,18 @@ def _get_transform_info(
                     [f"{feat.key}{_CAT_SEP}{d}" for d in feat.descriptors]
                 )
                 counter += len(feat.descriptors)
+            elif isinstance(specs[feat.key], MolFeatures):
+                assert isinstance(feat, MolecularInput)
+                descriptor_names = specs[
+                    feat.key
+                ].get_descriptor_names()  # type: ignore
+                features2idx[feat.key] = tuple(
+                    (np.array(range(len(descriptor_names))) + counter).tolist()
+                )
+                features2names[feat.key] = tuple(
+                    [f"{feat.key}{_CAT_SEP}{d}" for d in descriptor_names]
+                )
+                counter += len(descriptor_names)
         return features2idx, features2names
 
     def transform(
@@ -383,6 +397,9 @@ def transform(
             elif specs[feat.key] == CategoricalEncodingEnum.DESCRIPTOR:
                 assert isinstance(feat, CategoricalDescriptorInput)
                 transformed.append(feat.to_descriptor_encoding(s))
+            elif isinstance(specs[feat.key], MolFeatures):
+                assert isinstance(feat, MolecularInput)
+                transformed.append(feat.to_descriptor_encoding(specs[feat.key], s))  # type: ignore
         return pd.concat(transformed, axis=1)
 
     def inverse_transform(
@@ -420,6 +437,7 @@ def inverse_transform(
             elif specs[feat.key] == CategoricalEncodingEnum.DESCRIPTOR:
                 assert isinstance(feat, CategoricalDescriptorInput)
                 transformed.append(feat.from_descriptor_encoding(experiments))
+
         return pd.concat(transformed, axis=1)
 
     def _validate_transform_specs(self, specs: TInputTransformSpecs):
@@ -429,24 +447,47 @@ def _validate_transform_specs(self, specs: TInputTransformSpecs):
             specs (TInputTransformSpecs): Transform specs to be validated.
         """
         # first check that the keys in the specs dict are correct also correct feature keys
-        if len(set(specs.keys()) - set(self.get_keys(CategoricalInput))) > 0:
+        if (
+            len(
+                set(specs.keys())
+                - set(self.get_keys(CategoricalInput))
+                - set(self.get_keys(MolecularInput))
+            )
+            > 0
+        ):
             raise ValueError("Unknown features specified in transform specs.")
-        # next check that all values are of type CategoricalEncodingEnum
+        # next check that all values are of type CategoricalEncodingEnum or MolFeatures
         if not (
-            all(isinstance(enc, CategoricalEncodingEnum) for enc in specs.values())
+            all(
+                isinstance(enc, (CategoricalEncodingEnum, MolFeatures))
+                for enc in specs.values()
+            )
         ):
             raise ValueError("Unknown transform specified.")
-        # next check that only Categoricalwithdescriptor have the value DESCRIPTOR
-        descriptor_keys = [
-            key
-            for key, value in specs.items()
-            if value == CategoricalEncodingEnum.DESCRIPTOR
-        ]
+        # next check that only CategoricalDescriptorInput can have the value DESCRIPTOR
+        descriptor_keys = []
+        for key, value in specs.items():
+            if value == CategoricalEncodingEnum.DESCRIPTOR:
+                descriptor_keys.append(key)
         if (
             len(set(descriptor_keys) - set(self.get_keys(CategoricalDescriptorInput)))
             > 0
         ):
             raise ValueError("Wrong features types assigned to DESCRIPTOR transform.")
+        # next check if MolFeatures have been assigned to feature types other than MolecularInput
+        molfeature_keys = []
+        for key, value in specs.items():
+            if isinstance(value, MolFeatures):
+                molfeature_keys.append(key)
+        if len(set(molfeature_keys) - set(self.get_keys(MolecularInput))) > 0:
+            raise ValueError("Wrong features types assigned to MolFeatures transforms.")
+        # next check that all MolecularInput have MolFeatures transforms
+        for feat in self.get(includes=[MolecularInput]):
+            mol_encoding = specs.get(feat.key)
+            if mol_encoding is None:
+                raise ValueError("No transform assigned to MolecularInput.")
+            elif not isinstance(mol_encoding, MolFeatures):
+                raise ValueError("Incorrect transform assigned to MolecularInput.")
         return specs
 
     def get_bounds(

diff --git a/bofire/data_models/features/feature.py b/bofire/data_models/features/feature.py
@@ -7,6 +7,7 @@
 
 from bofire.data_models.base import BaseModel
 from bofire.data_models.enum import CategoricalEncodingEnum
+from bofire.data_models.molfeatures.api import AnyMolFeatures
 from bofire.data_models.surrogates.scaler import ScalerEnum
 
 TTransform = Union[CategoricalEncodingEnum, ScalerEnum]
@@ -141,7 +142,7 @@ def is_categorical(s: pd.Series, categories: List[str]):
     return sum(s.isin(categories)) == len(s)
 
 
-TInputTransformSpecs = Dict[str, CategoricalEncodingEnum]
+TInputTransformSpecs = Dict[str, Union[CategoricalEncodingEnum, AnyMolFeatures]]
 
 
 TDescriptors = Annotated[List[str], Field(min_items=1)]
@@ -158,5 +159,4 @@ def is_categorical(s: pd.Series, categories: List[str]):
 
 TDiscreteVals = Annotated[List[float], Field(min_items=1)]
 
-
 _CAT_SEP = "_"
diff --git a/bofire/data_models/features/molecular.py b/bofire/data_models/features/molecular.py
@@ -2,14 +2,10 @@
 
 import pandas as pd
 
-from bofire.data_models.features.categorical import _CAT_SEP, TTransform
+from bofire.data_models.features.categorical import _CAT_SEP
 from bofire.data_models.features.feature import Input
-from bofire.utils.cheminformatics import (
-    smiles2bag_of_characters,
-    smiles2fingerprints,
-    smiles2fragments,
-    smiles2mol,
-)
+from bofire.data_models.molfeatures.api import AnyMolFeatures
+from bofire.utils.cheminformatics import smiles2mol
 
 
 class MolecularInput(Input):
@@ -21,55 +17,54 @@ def validate_experimental(
     ) -> pd.Series:
         for smi in values:
             smiles2mol(smi)
+
         return values
 
     def validate_candidental(self, values: pd.Series) -> pd.Series:
         for smi in values:
             smiles2mol(smi)
         return values
 
-    def fixed_value(self, transform_type: Optional[TTransform] = None) -> None:
-        return None
-
     def is_fixed(self) -> bool:
         return False
 
-    # TODO: model descriptors as pydantic class
-    def to_fingerprints(
-        self, values: pd.Series, bond_radius: int = 5, n_bits: int = 2048
-    ) -> pd.DataFrame:
-        # validate it
-        data = smiles2fingerprints(
-            values.to_list(), bond_radius=bond_radius, n_bits=n_bits
-        )
-        return pd.DataFrame(
-            data=data,
-            columns=[f"{self.key}{_CAT_SEP}{i}" for i in range(data.shape[1])],
-        )
-
-    def to_bag_of_characters(
-        self, values: pd.Series, max_ngram: int = 5
-    ) -> pd.DataFrame:
-        # todo: add selfies later
-        data = smiles2bag_of_characters(values.to_list(), max_ngram=max_ngram)
-        return pd.DataFrame(
-            data=data,
-            columns=[f"{self.key}{_CAT_SEP}{i}" for i in range(data.shape[1])],
-        )
-
-    def to_fragments(self, values: pd.Series):
-        data = smiles2fragments(values.to_list())
-        return pd.DataFrame(
-            data=data,
-            columns=[f"{self.key}{_CAT_SEP}{i}" for i in range(data.shape[1])],
-        )
+    def fixed_value(self, transform_type: Optional[AnyMolFeatures] = None) -> None:
+        return None
 
     def sample(self, n: int) -> pd.Series:
-        raise ValueError("Sampling not supported for `MolecularInput`.")
+        raise ValueError("Sampling not supported for `MolecularInput`")
 
     def get_bounds(
-        self, transform_type: TTransform, values: pd.Series
+        self, transform_type: AnyMolFeatures, values: pd.Series
     ) -> Tuple[List[float], List[float]]:
-        # TODO: this is only needed for optimization for which we need also
-        # MolecularCategorical, this will be added later.
-        raise NotImplementedError("`get_bounds` not yet implemented.")
+        if values is None:
+            raise NotImplementedError(
+                "`values` is currently required for `MolecularInput`"
+            )
+        else:
+            data = self.to_descriptor_encoding(transform_type, values)
+
+        lower = data.min(axis=0).values.tolist()
+        upper = data.max(axis=0).values.tolist()
+
+        return lower, upper
+
+    def to_descriptor_encoding(
+        self, transform_type: AnyMolFeatures, values: pd.Series
+    ) -> pd.DataFrame:
+        """Converts values to descriptor encoding.
+
+        Args:
+            values (pd.Series): Values to transform.
+
+        Returns:
+            pd.DataFrame: Descriptor encoded dataframe.
+        """
+        descriptor_values = transform_type.get_descriptor_values(values)
+
+        descriptor_values.columns = [
+            f"{self.key}{_CAT_SEP}{d}" for d in transform_type.get_descriptor_names()
+        ]
+        descriptor_values.index = values.index
+
+        return descriptor_values
diff --git a/bofire/data_models/kernels/aggregation.py b/bofire/data_models/kernels/aggregation.py
@@ -3,6 +3,7 @@
 from bofire.data_models.kernels.categorical import HammondDistanceKernel
 from bofire.data_models.kernels.continuous import LinearKernel, MaternKernel, RBFKernel
 from bofire.data_models.kernels.kernel import Kernel
+from bofire.data_models.kernels.molecular import TanimotoKernel
 from bofire.data_models.priors.api import AnyPrior
 
 
@@ -14,6 +15,7 @@ class AdditiveKernel(Kernel):
             MaternKernel,
             LinearKernel,
             HammondDistanceKernel,
+            TanimotoKernel,
             "AdditiveKernel",
             "MultiplicativeKernel",
             "ScaleKernel",
@@ -31,6 +33,7 @@ class MultiplicativeKernel(Kernel):
             LinearKernel,
             HammondDistanceKernel,
             AdditiveKernel,
+            TanimotoKernel,
             "MultiplicativeKernel",
             "ScaleKernel",
         ]
@@ -46,6 +49,7 @@ class ScaleKernel(Kernel):
         HammondDistanceKernel,
         AdditiveKernel,
         MultiplicativeKernel,
+        TanimotoKernel,
         "ScaleKernel",
     ]
     outputscale_prior: Optional[AnyPrior] = None

diff --git a/bofire/data_models/kernels/api.py b/bofire/data_models/kernels/api.py
@@ -16,12 +16,9 @@
     RBFKernel,
 )
 from bofire.data_models.kernels.kernel import Kernel
+from bofire.data_models.kernels.molecular import MolecularKernel, TanimotoKernel
 
-AbstractKernel = Union[
-    Kernel,
-    CategoricalKernel,
-    ContinuousKernel,
-]
+AbstractKernel = Union[Kernel, CategoricalKernel, ContinuousKernel, MolecularKernel]
 
 AnyContinuousKernel = Union[
     MaternKernel,
@@ -31,6 +28,8 @@
 
 AnyCategoricalKernal = HammondDistanceKernel
 
+AnyMolecularKernel = TanimotoKernel
+
 AnyKernel = Union[
     AdditiveKernel,
     MultiplicativeKernel,
@@ -39,4 +38,5 @@
     LinearKernel,
     MaternKernel,
     RBFKernel,
+    TanimotoKernel,
 ]
diff --git a/bofire/data_models/kernels/molecular.py b/bofire/data_models/kernels/molecular.py
@@ -0,0 +1,12 @@
+from typing import Literal
+
+from bofire.data_models.kernels.kernel import Kernel
+
+
+class MolecularKernel(Kernel):
+    pass
+
+
+class TanimotoKernel(MolecularKernel):
+    type: Literal["TanimotoKernel"] = "TanimotoKernel"
+    ard: bool = True
diff --git a/bofire/data_models/molfeatures/__init__.py b/bofire/data_models/molfeatures/__init__.py
diff --git a/bofire/data_models/molfeatures/api.py b/bofire/data_models/molfeatures/api.py
@@ -0,0 +1,19 @@
+from typing import Union
+
+from bofire.data_models.molfeatures.molfeatures import (  # BagOfCharacters
+    Fingerprints,
+    FingerprintsFragments,
+    Fragments,
+    MolFeatures,
+    MordredDescriptors,
+)
+
+AbstractMolFeatures = MolFeatures
+
+AnyMolFeatures = Union[
+    Fingerprints,
+    Fragments,
+    FingerprintsFragments,
+    # BagOfCharacters,
+    MordredDescriptors,
+]