Skip to content

Commit

Permalink
Molecular input (#234)
Browse files Browse the repository at this point in the history
This PR adds molecular GPs from Gauche
---------

Co-authored-by: Simon <simon.sung@evonik.com>
Co-authored-by: Johannes P. Dürholt <johannespeter.duerholt@evonik.com>
  • Loading branch information
3 people authored Aug 2, 2023
1 parent 0215f59 commit e313881
Show file tree
Hide file tree
Showing 37 changed files with 4,312 additions and 161 deletions.
2 changes: 1 addition & 1 deletion bofire/benchmarks/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,7 +479,7 @@ def __init__(
data_model = SingleTaskGPSurrogate(
inputs=Inputs(features=inputs),
outputs=Outputs(features=[outputs[0]]),
input_preprocessing_specs=input_preprocessing_specs,
input_preprocessing_specs=input_preprocessing_specs, # type: ignore
)
ground_truth_yield = surrogates.map(data_model)

Expand Down
5 changes: 5 additions & 0 deletions bofire/data_models/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
try:
# in case of the minimal installation these imports are not available
from bofire.data_models.kernels.api import AnyKernel, Kernel
from bofire.data_models.molfeatures.api import ( # noqa: F401
AnyMolFeatures,
MolFeatures,
)
from bofire.data_models.objectives.api import AnyObjective, Objective
from bofire.data_models.outlier_detection.api import (
AnyOutlierDetection,
Expand Down Expand Up @@ -49,6 +53,7 @@
AnyObjective,
AnyPrior,
AnyStrategy,
AnyMolFeatures,
Domain,
]
except ImportError:
Expand Down
59 changes: 50 additions & 9 deletions bofire/data_models/domain/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@
ContinuousOutput,
DiscreteInput,
Input,
MolecularInput,
Output,
TInputTransformSpecs,
)
from bofire.data_models.molfeatures.api import MolFeatures
from bofire.data_models.objectives.api import AbstractObjective, Objective

FeatureSequence = Union[List[AnyFeature], Tuple[AnyFeature]]
Expand Down Expand Up @@ -348,6 +350,18 @@ def _get_transform_info(
[f"{feat.key}{_CAT_SEP}{d}" for d in feat.descriptors]
)
counter += len(feat.descriptors)
elif isinstance(specs[feat.key], MolFeatures):
assert isinstance(feat, MolecularInput)
descriptor_names = specs[
feat.key
].get_descriptor_names() # type: ignore
features2idx[feat.key] = tuple(
(np.array(range(len(descriptor_names))) + counter).tolist()
)
features2names[feat.key] = tuple(
[f"{feat.key}{_CAT_SEP}{d}" for d in descriptor_names]
)
counter += len(descriptor_names)
return features2idx, features2names

def transform(
Expand Down Expand Up @@ -383,6 +397,9 @@ def transform(
elif specs[feat.key] == CategoricalEncodingEnum.DESCRIPTOR:
assert isinstance(feat, CategoricalDescriptorInput)
transformed.append(feat.to_descriptor_encoding(s))
elif isinstance(specs[feat.key], MolFeatures):
assert isinstance(feat, MolecularInput)
transformed.append(feat.to_descriptor_encoding(specs[feat.key], s)) # type: ignore
return pd.concat(transformed, axis=1)

def inverse_transform(
Expand Down Expand Up @@ -420,6 +437,7 @@ def inverse_transform(
elif specs[feat.key] == CategoricalEncodingEnum.DESCRIPTOR:
assert isinstance(feat, CategoricalDescriptorInput)
transformed.append(feat.from_descriptor_encoding(experiments))

return pd.concat(transformed, axis=1)

def _validate_transform_specs(self, specs: TInputTransformSpecs):
Expand All @@ -429,24 +447,47 @@ def _validate_transform_specs(self, specs: TInputTransformSpecs):
specs (TInputTransformSpecs): Transform specs to be validated.
"""
# first check that the keys in the specs dict are also valid feature keys
if len(set(specs.keys()) - set(self.get_keys(CategoricalInput))) > 0:
if (
len(
set(specs.keys())
- set(self.get_keys(CategoricalInput))
- set(self.get_keys(MolecularInput))
)
> 0
):
raise ValueError("Unknown features specified in transform specs.")
# next check that all values are of type CategoricalEncodingEnum
# next check that all values are of type CategoricalEncodingEnum or MolFeatures
if not (
all(isinstance(enc, CategoricalEncodingEnum) for enc in specs.values())
all(
isinstance(enc, (CategoricalEncodingEnum, MolFeatures))
for enc in specs.values()
)
):
raise ValueError("Unknown transform specified.")
# next check that only Categoricalwithdescriptor have the value DESCRIPTOR
descriptor_keys = [
key
for key, value in specs.items()
if value == CategoricalEncodingEnum.DESCRIPTOR
]
# next check that only CategoricalDescriptorInput can have the value DESCRIPTOR
descriptor_keys = []
for key, value in specs.items():
if value == CategoricalEncodingEnum.DESCRIPTOR:
descriptor_keys.append(key)
if (
len(set(descriptor_keys) - set(self.get_keys(CategoricalDescriptorInput)))
> 0
):
raise ValueError("Wrong features types assigned to DESCRIPTOR transform.")
# next check if MolFeatures have been assigned to feature types other than MolecularInput
molfeature_keys = []
for key, value in specs.items():
if isinstance(value, MolFeatures):
molfeature_keys.append(key)
if len(set(molfeature_keys) - set(self.get_keys(MolecularInput))) > 0:
raise ValueError("Wrong features types assigned to MolFeatures transforms.")
# next check that all MolecularInput have MolFeatures transforms
for feat in self.get(includes=[MolecularInput]):
mol_encoding = specs.get(feat.key)
if mol_encoding is None:
raise ValueError("No transform assigned to MolecularInput.")
elif not isinstance(mol_encoding, MolFeatures):
raise ValueError("Incorrect transform assigned to MolecularInput.")
return specs

def get_bounds(
Expand Down
4 changes: 2 additions & 2 deletions bofire/data_models/features/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from bofire.data_models.base import BaseModel
from bofire.data_models.enum import CategoricalEncodingEnum
from bofire.data_models.molfeatures.api import AnyMolFeatures
from bofire.data_models.surrogates.scaler import ScalerEnum

TTransform = Union[CategoricalEncodingEnum, ScalerEnum]
Expand Down Expand Up @@ -141,7 +142,7 @@ def is_categorical(s: pd.Series, categories: List[str]):
return sum(s.isin(categories)) == len(s)


TInputTransformSpecs = Dict[str, CategoricalEncodingEnum]
TInputTransformSpecs = Dict[str, Union[CategoricalEncodingEnum, AnyMolFeatures]]


TDescriptors = Annotated[List[str], Field(min_items=1)]
Expand All @@ -158,5 +159,4 @@ def is_categorical(s: pd.Series, categories: List[str]):

TDiscreteVals = Annotated[List[float], Field(min_items=1)]


_CAT_SEP = "_"
83 changes: 39 additions & 44 deletions bofire/data_models/features/molecular.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,10 @@

import pandas as pd

from bofire.data_models.features.categorical import _CAT_SEP, TTransform
from bofire.data_models.features.categorical import _CAT_SEP
from bofire.data_models.features.feature import Input
from bofire.utils.cheminformatics import (
smiles2bag_of_characters,
smiles2fingerprints,
smiles2fragments,
smiles2mol,
)
from bofire.data_models.molfeatures.api import AnyMolFeatures
from bofire.utils.cheminformatics import smiles2mol


class MolecularInput(Input):
Expand All @@ -21,55 +17,54 @@ def validate_experimental(
) -> pd.Series:
for smi in values:
smiles2mol(smi)

return values

def validate_candidental(self, values: pd.Series) -> pd.Series:
    """Run every candidate value through `smiles2mol` and hand the input back.

    Args:
        values (pd.Series): Candidate values to check.

    Returns:
        pd.Series: The same series that was passed in, unmodified.
    """
    # The result of `smiles2mol` is discarded; it is only called per entry.
    # NOTE(review): presumably it raises for an invalid SMILES string, which
    # is what makes this a validation step - confirm in
    # `bofire.utils.cheminformatics`.
    for candidate in values:
        smiles2mol(candidate)
    return values

def fixed_value(self, transform_type: Optional[TTransform] = None) -> None:
return None

def is_fixed(self) -> bool:
    """Report whether this feature is fixed.

    Returns:
        bool: Always `False` for this feature.
    """
    return False

# TODO: model descriptors as pydantic class
def to_fingerprints(
self, values: pd.Series, bond_radius: int = 5, n_bits: int = 2048
) -> pd.DataFrame:
# validate it
data = smiles2fingerprints(
values.to_list(), bond_radius=bond_radius, n_bits=n_bits
)
return pd.DataFrame(
data=data,
columns=[f"{self.key}{_CAT_SEP}{i}" for i in range(data.shape[1])],
)

def to_bag_of_characters(
self, values: pd.Series, max_ngram: int = 5
) -> pd.DataFrame:
# todo: add selfies later
data = smiles2bag_of_characters(values.to_list(), max_ngram=max_ngram)
return pd.DataFrame(
data=data,
columns=[f"{self.key}{_CAT_SEP}{i}" for i in range(data.shape[1])],
)

def to_fragments(self, values: pd.Series):
data = smiles2fragments(values.to_list())
return pd.DataFrame(
data=data,
columns=[f"{self.key}{_CAT_SEP}{i}" for i in range(data.shape[1])],
)
def fixed_value(self, transform_type: Optional[AnyMolFeatures] = None) -> None:
return None

def sample(self, n: int) -> pd.Series:
raise ValueError("Sampling not supported for `MolecularInput`.")
raise ValueError("Sampling not supported for `MolecularInput`")

def get_bounds(
self, transform_type: TTransform, values: pd.Series
self, transform_type: AnyMolFeatures, values: pd.Series
) -> Tuple[List[float], List[float]]:
# TODO: this is only needed for optimization for which we need also
# MolecularCategorical, this will be added later.
raise NotImplementedError("`get_bounds` not yet implemented.")
if values is None:
raise NotImplementedError(
"`values` is currently required for `MolecularInput`"
)
else:
data = self.to_descriptor_encoding(transform_type, values)

lower = data.min(axis=0).values.tolist()
upper = data.max(axis=0).values.tolist()

return lower, upper

def to_descriptor_encoding(
    self, transform_type: AnyMolFeatures, values: pd.Series
) -> pd.DataFrame:
    """Encode `values` with the descriptors provided by `transform_type`.

    Args:
        transform_type (AnyMolFeatures): Molecular features object that
            supplies the descriptor values (`get_descriptor_values`) and the
            descriptor names (`get_descriptor_names`).
        values (pd.Series): Values to transform.

    Returns:
        pd.DataFrame: Descriptor encoded dataframe. Its columns are named
            `<feature key><_CAT_SEP><descriptor name>` and its index is the
            index of `values`.
    """
    encoded = transform_type.get_descriptor_values(values)
    names = transform_type.get_descriptor_names()

    # Prefix each descriptor name with the feature key so that the columns
    # of different features can be told apart.
    encoded.columns = [f"{self.key}{_CAT_SEP}{name}" for name in names]
    # Re-align the rows with the incoming series.
    encoded.index = values.index

    return encoded
4 changes: 4 additions & 0 deletions bofire/data_models/kernels/aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from bofire.data_models.kernels.categorical import HammondDistanceKernel
from bofire.data_models.kernels.continuous import LinearKernel, MaternKernel, RBFKernel
from bofire.data_models.kernels.kernel import Kernel
from bofire.data_models.kernels.molecular import TanimotoKernel
from bofire.data_models.priors.api import AnyPrior


Expand All @@ -14,6 +15,7 @@ class AdditiveKernel(Kernel):
MaternKernel,
LinearKernel,
HammondDistanceKernel,
TanimotoKernel,
"AdditiveKernel",
"MultiplicativeKernel",
"ScaleKernel",
Expand All @@ -31,6 +33,7 @@ class MultiplicativeKernel(Kernel):
LinearKernel,
HammondDistanceKernel,
AdditiveKernel,
TanimotoKernel,
"MultiplicativeKernel",
"ScaleKernel",
]
Expand All @@ -46,6 +49,7 @@ class ScaleKernel(Kernel):
HammondDistanceKernel,
AdditiveKernel,
MultiplicativeKernel,
TanimotoKernel,
"ScaleKernel",
]
outputscale_prior: Optional[AnyPrior] = None
Expand Down
10 changes: 5 additions & 5 deletions bofire/data_models/kernels/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,9 @@
RBFKernel,
)
from bofire.data_models.kernels.kernel import Kernel
from bofire.data_models.kernels.molecular import MolecularKernel, TanimotoKernel

AbstractKernel = Union[
Kernel,
CategoricalKernel,
ContinuousKernel,
]
AbstractKernel = Union[Kernel, CategoricalKernel, ContinuousKernel, MolecularKernel]

AnyContinuousKernel = Union[
MaternKernel,
Expand All @@ -31,6 +28,8 @@

AnyCategoricalKernal = HammondDistanceKernel

AnyMolecularKernel = TanimotoKernel

AnyKernel = Union[
AdditiveKernel,
MultiplicativeKernel,
Expand All @@ -39,4 +38,5 @@
LinearKernel,
MaternKernel,
RBFKernel,
TanimotoKernel,
]
12 changes: 12 additions & 0 deletions bofire/data_models/kernels/molecular.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from typing import Literal

from bofire.data_models.kernels.kernel import Kernel


class MolecularKernel(Kernel):
    """Common parent for the molecular kernel data models of this module.

    Adds no fields of its own on top of `Kernel`.
    """

    pass


class TanimotoKernel(MolecularKernel):
    """Data model holding the settings of a Tanimoto kernel.

    Only configuration fields are declared here; no kernel computation is
    part of this class.
    """

    # Type tag, fixed to the class name.
    type: Literal["TanimotoKernel"] = "TanimotoKernel"
    # Enabled by default.
    # NOTE(review): presumably "automatic relevance determination" - confirm
    # against the code that consumes this data model.
    ard: bool = True
Empty file.
19 changes: 19 additions & 0 deletions bofire/data_models/molfeatures/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from typing import Union

from bofire.data_models.molfeatures.molfeatures import ( # BagOfCharacters
Fingerprints,
FingerprintsFragments,
Fragments,
MolFeatures,
MordredDescriptors,
)

# Alias of `MolFeatures`.
AbstractMolFeatures = MolFeatures

# Union of the concrete molecular feature classes imported above.
AnyMolFeatures = Union[
    Fingerprints,
    Fragments,
    FingerprintsFragments,
    # BagOfCharacters,
    MordredDescriptors,
]
Loading

0 comments on commit e313881

Please sign in to comment.