experimental-design · jduerholt · Aug 2, 2023 · May 23, 2023 · May 24, 2023 · Jun 30, 2023
diff --git a/bofire/data_models/constraints/nonlinear.py b/bofire/data_models/constraints/nonlinear.py
@@ -52,7 +52,6 @@ def __call__(self, experiments: pd.DataFrame) -> pd.Series:
         return experiments.eval(self.expression)
 
     def jacobian(self, experiments: pd.DataFrame) -> pd.DataFrame:
-
         if self.jacobian_expression is not None:
             res = experiments.eval(self.jacobian_expression)
             for i, col in enumerate(res):

diff --git a/bofire/data_models/domain/features.py b/bofire/data_models/domain/features.py
@@ -22,9 +22,11 @@
     ContinuousOutput,
     DiscreteInput,
     Input,
+    MolecularInput,
     Output,
     TInputTransformSpecs,
 )
+from bofire.data_models.molfeatures.api import MolFeatures
 from bofire.data_models.objectives.api import AbstractObjective, Objective
 
 FeatureSequence = Union[List[AnyFeature], Tuple[AnyFeature]]
@@ -348,6 +350,16 @@ def _get_transform_info(
                     [f"{feat.key}{_CAT_SEP}{d}" for d in feat.descriptors]
                 )
                 counter += len(feat.descriptors)
+            elif isinstance(specs[feat.key], MolFeatures):
+                assert isinstance(feat, MolecularInput)
+                descriptor_names = specs[feat.key].get_descriptor_names()
+                features2idx[feat.key] = tuple(
+                    (np.array(range(len(descriptor_names))) + counter).tolist()
+                )
+                features2names[feat.key] = tuple(
+                    [f"{feat.key}{_CAT_SEP}{d}" for d in descriptor_names]
+                )
+                counter += len(descriptor_names)
         return features2idx, features2names
 
     def transform(
@@ -383,6 +395,9 @@ def transform(
             elif specs[feat.key] == CategoricalEncodingEnum.DESCRIPTOR:
                 assert isinstance(feat, CategoricalDescriptorInput)
                 transformed.append(feat.to_descriptor_encoding(s))
+            elif isinstance(specs[feat.key], MolFeatures):
+                assert isinstance(feat, MolecularInput)
+                transformed.append(feat.to_descriptor_encoding(specs[feat.key], s))
         return pd.concat(transformed, axis=1)
 
     def inverse_transform(
@@ -420,6 +435,7 @@ def inverse_transform(
             elif specs[feat.key] == CategoricalEncodingEnum.DESCRIPTOR:
                 assert isinstance(feat, CategoricalDescriptorInput)
                 transformed.append(feat.from_descriptor_encoding(experiments))
+
         return pd.concat(transformed, axis=1)
 
     def _validate_transform_specs(self, specs: TInputTransformSpecs):
@@ -429,21 +445,36 @@ def _validate_transform_specs(self, specs: TInputTransformSpecs):
             specs (TInputTransformSpecs): Transform specs to be validated.
         """
         # first check that the keys in the specs dict are correct also correct feature keys
-        if len(set(specs.keys()) - set(self.get_keys(CategoricalInput))) > 0:
+        if (
+            len(
+                set(specs.keys())
+                - set(self.get_keys(CategoricalInput))
+                - set(self.get_keys(MolecularInput))
+            )
+            > 0
+        ):
             raise ValueError("Unknown features specified in transform specs.")
-        # next check that all values are of type CategoricalEncodingEnum
+        # next check that all values are of type CategoricalEncodingEnum or MolFeatures
         if not (
-            all(isinstance(enc, CategoricalEncodingEnum) for enc in specs.values())
+            all(
+                isinstance(enc, (CategoricalEncodingEnum, MolFeatures))
+                for enc in specs.values()
+            )
         ):
             raise ValueError("Unknown transform specified.")
-        # next check that only Categoricalwithdescriptor have the value DESCRIPTOR
-        descriptor_keys = [
-            key
-            for key, value in specs.items()
-            if value == CategoricalEncodingEnum.DESCRIPTOR
-        ]
+        # next check that DESCRIPTOR / MolFeatures transforms are only assigned to CategoricalDescriptorInput or MolecularInput features
+        descriptor_keys = []
+        for key, value in specs.items():
+            if value == CategoricalEncodingEnum.DESCRIPTOR or (
+                isinstance(value, MolFeatures)
+            ):
+                descriptor_keys.append(key)
         if (
-            len(set(descriptor_keys) - set(self.get_keys(CategoricalDescriptorInput)))
+            len(
+                set(descriptor_keys)
+                - set(self.get_keys(CategoricalDescriptorInput))
+                - set(self.get_keys(MolecularInput))
+            )
             > 0
         ):
             raise ValueError("Wrong features types assigned to DESCRIPTOR transform.")

diff --git a/bofire/data_models/enum.py b/bofire/data_models/enum.py
@@ -1,6 +1,5 @@
 from enum import Enum
 
-
 class SamplingMethodEnum(Enum):
     UNIFORM = "UNIFORM"
     SOBOL = "SOBOL"

diff --git a/bofire/data_models/features/feature.py b/bofire/data_models/features/feature.py
@@ -6,6 +6,7 @@
 
 from bofire.data_models.base import BaseModel
 from bofire.data_models.enum import CategoricalEncodingEnum
+from bofire.data_models.molfeatures.api import AnyMolFeatures
 from bofire.data_models.surrogates.scaler import ScalerEnum
 
 TTransform = Union[CategoricalEncodingEnum, ScalerEnum]
@@ -140,7 +141,7 @@ def is_categorical(s: pd.Series, categories: List[str]):
     return sum(s.isin(categories)) == len(s)
 
 
-TInputTransformSpecs = Dict[str, CategoricalEncodingEnum]
+TInputTransformSpecs = Dict[str, Union[CategoricalEncodingEnum, AnyMolFeatures]]
 
 
 TDescriptors = Annotated[List[str], Field(min_items=1)]
@@ -157,5 +158,6 @@ def is_categorical(s: pd.Series, categories: List[str]):
 
 TDiscreteVals = Annotated[List[float], Field(min_items=1)]
 
+TMolecularVals = List[List[Union[float, int]]]
 
 _CAT_SEP = "_"
diff --git a/bofire/data_models/features/molecular.py b/bofire/data_models/features/molecular.py
@@ -1,75 +1,102 @@
 from typing import ClassVar, List, Literal, Optional, Tuple
 
 import pandas as pd
+from pydantic import Field
 
-from bofire.data_models.features.categorical import _CAT_SEP, TTransform
-from bofire.data_models.features.feature import Input
-from bofire.utils.cheminformatics import (
-    smiles2bag_of_characters,
-    smiles2fingerprints,
-    smiles2fragments,
-    smiles2mol,
-)
+from bofire.data_models.features.categorical import _CAT_SEP
+from bofire.data_models.features.feature import Input, TMolecularVals
+from bofire.data_models.molfeatures.api import AnyMolFeatures
+from bofire.utils.cheminformatics import smiles2mol
 
 
 class MolecularInput(Input):
+    """Input feature for molecules; values are validated with `smiles2mol`.
+
+    Attributes:
+        descriptor_values: Descriptor rows produced by the most recent call
+            to `to_descriptor_encoding`; empty until the first call.
+    """
+
     type: Literal["MolecularInput"] = "MolecularInput"
+    descriptor_values: TMolecularVals = Field(default_factory=list)
     order: ClassVar[int] = 6
 
     def validate_experimental(
         self, values: pd.Series, strict: bool = False
     ) -> pd.Series:
         for smi in values:
             smiles2mol(smi)
+
         return values
 
     def validate_candidental(self, values: pd.Series) -> pd.Series:
         for smi in values:
             smiles2mol(smi)
         return values
 
-    def fixed_value(self, transform_type: Optional[TTransform] = None) -> None:
-        return None
-
     def is_fixed(self) -> bool:
         return False
 
-    # TODO: model descriptors as pydantic class
-    def to_fingerprints(
-        self, values: pd.Series, bond_radius: int = 5, n_bits: int = 2048
-    ) -> pd.DataFrame:
-        # validate it
-        data = smiles2fingerprints(
-            values.to_list(), bond_radius=bond_radius, n_bits=n_bits
-        )
-        return pd.DataFrame(
-            data=data,
-            columns=[f"{self.key}{_CAT_SEP}{i}" for i in range(data.shape[1])],
-        )
-
-    def to_bag_of_characters(
-        self, values: pd.Series, max_ngram: int = 5
-    ) -> pd.DataFrame:
-        # todo: add selfies later
-        data = smiles2bag_of_characters(values.to_list(), max_ngram=max_ngram)
-        return pd.DataFrame(
-            data=data,
-            columns=[f"{self.key}{_CAT_SEP}{i}" for i in range(data.shape[1])],
-        )
-
-    def to_fragments(self, values: pd.Series):
-        data = smiles2fragments(values.to_list())
-        return pd.DataFrame(
-            data=data,
-            columns=[f"{self.key}{_CAT_SEP}{i}" for i in range(data.shape[1])],
-        )
+    def fixed_value(self, transform_type: Optional[AnyMolFeatures] = None) -> None:
+        """Return None; `is_fixed` is always False for this feature."""
+        return None
 
     def sample(self, n: int) -> pd.Series:
-        raise ValueError("Sampling not supported for `MolecularInput`.")
+        raise ValueError("Sampling not supported for `MolecularInput`")
 
     def get_bounds(
-        self, transform_type: TTransform, values: pd.Series
+        self, transform_type: AnyMolFeatures, values: pd.Series
     ) -> Tuple[List[float], List[float]]:
-        # TODO: this is only needed for optimization for which we need also
-        # MolecularCategorical, this will be added later.
-        raise NotImplementedError("`get_bounds` not yet implemented.")
+        """Return the column-wise bounds of the descriptor-encoded values.
+
+        Args:
+            transform_type (AnyMolFeatures): Featurizer used to encode `values`.
+            values (pd.Series): Molecules to encode; must not be None.
+
+        Raises:
+            NotImplementedError: If `values` is None.
+
+        Returns:
+            Tuple[List[float], List[float]]: Minimum and maximum of every
+                descriptor column, in that order.
+        """
+        # Reject a missing `values` before it is handed to the featurizer.
+        if values is None:
+            raise NotImplementedError(
+                "`values` is currently required for `MolecularInput`"
+            )
+
+        data = self.to_descriptor_encoding(transform_type, values)
+        lower = data.min(axis=0).values.tolist()
+        upper = data.max(axis=0).values.tolist()
+
+        return lower, upper
+
+    def to_descriptor_encoding(
+        self, transform_type: AnyMolFeatures, values: pd.Series
+    ) -> pd.DataFrame:
+        """Converts values to descriptor encoding.
+
+        Args:
+            transform_type (AnyMolFeatures): Featurizer providing the
+                descriptor names and values.
+            values (pd.Series): Values to transform.
+
+        Returns:
+            pd.DataFrame: Descriptor encoded dataframe.
+        """
+        # Always featurize the `values` passed in. Reusing the matrix of an
+        # earlier call would pair other molecules' descriptors with this
+        # index, or fail outright when the lengths differ.
+        self.descriptor_values = transform_type.get_descriptor_values(
+            values
+        ).values.tolist()
+
+        return pd.DataFrame(
+            data=self.descriptor_values,
+            columns=[
+                f"{self.key}{_CAT_SEP}{d}"
+                for d in transform_type.get_descriptor_names()
+            ],
+            index=values.index,
+        )
diff --git a/bofire/data_models/kernels/aggregation.py b/bofire/data_models/kernels/aggregation.py
@@ -3,6 +3,7 @@
 from bofire.data_models.kernels.categorical import HammondDistanceKernel
 from bofire.data_models.kernels.continuous import LinearKernel, MaternKernel, RBFKernel
 from bofire.data_models.kernels.kernel import Kernel
+from bofire.data_models.kernels.molecular import TanimotoKernel
 from bofire.data_models.priors.api import AnyPrior
 
 
@@ -14,6 +15,7 @@ class AdditiveKernel(Kernel):
             MaternKernel,
             LinearKernel,
             HammondDistanceKernel,
+            TanimotoKernel,
             "AdditiveKernel",
             "MultiplicativeKernel",
             "ScaleKernel",
@@ -31,6 +33,7 @@ class MultiplicativeKernel(Kernel):
             LinearKernel,
             HammondDistanceKernel,
             AdditiveKernel,
+            TanimotoKernel,
             "MultiplicativeKernel",
             "ScaleKernel",
         ]
@@ -46,6 +49,7 @@ class ScaleKernel(Kernel):
         HammondDistanceKernel,
         AdditiveKernel,
         MultiplicativeKernel,
+        TanimotoKernel,
         "ScaleKernel",
     ]
     outputscale_prior: Optional[AnyPrior] = None

diff --git a/bofire/data_models/kernels/api.py b/bofire/data_models/kernels/api.py
@@ -16,12 +16,9 @@
     RBFKernel,
 )
 from bofire.data_models.kernels.kernel import Kernel
+from bofire.data_models.kernels.molecular import MolecularKernel, TanimotoKernel
 
-AbstractKernel = Union[
-    Kernel,
-    CategoricalKernel,
-    ContinuousKernel,
-]
+AbstractKernel = Union[Kernel, CategoricalKernel, ContinuousKernel, MolecularKernel]
 
 AnyContinuousKernel = Union[
     MaternKernel,
@@ -31,6 +28,8 @@
 
 AnyCategoricalKernal = HammondDistanceKernel
 
+AnyMolecularKernel = TanimotoKernel
+
 AnyKernel = Union[
     AdditiveKernel,
     MultiplicativeKernel,
@@ -39,4 +38,5 @@
     LinearKernel,
     MaternKernel,
     RBFKernel,
+    TanimotoKernel,
 ]
diff --git a/bofire/data_models/kernels/molecular.py b/bofire/data_models/kernels/molecular.py
@@ -0,0 +1,12 @@
+from typing import Literal
+
+from bofire.data_models.kernels.kernel import Kernel
+
+
+class MolecularKernel(Kernel):
+    """Marker base class for molecular kernels; declares no fields itself."""
+
+
+class TanimotoKernel(MolecularKernel):
+    type: Literal["TanimotoKernel"] = "TanimotoKernel"  # fixed tag; the only value this field accepts
+    ard: bool = True  # NOTE(review): presumably toggles ARD in the kernel built from this model — confirm
diff --git a/bofire/data_models/molfeatures/__init__.py b/bofire/data_models/molfeatures/__init__.py
diff --git a/bofire/data_models/molfeatures/api.py b/bofire/data_models/molfeatures/api.py
@@ -0,0 +1,19 @@
+from typing import Union
+
+from bofire.data_models.molfeatures.molfeatures import (  # BagOfCharacters
+    Fingerprints,
+    FingerprintsFragments,
+    Fragments,
+    MolFeatures,
+    MordredDescriptors,
+)
+
+AbstractMolFeatures = MolFeatures  # alias of the imported MolFeatures class
+
+AnyMolFeatures = Union[
+    Fingerprints,
+    Fragments,
+    FingerprintsFragments,
+    # BagOfCharacters,  # TODO(review): disabled here and in the import above; re-enable or remove
+    MordredDescriptors,
+]