Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Molecular input #234

Merged
merged 27 commits into from
Aug 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bofire/benchmarks/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,7 +479,7 @@ def __init__(
data_model = SingleTaskGPSurrogate(
inputs=Inputs(features=inputs),
outputs=Outputs(features=[outputs[0]]),
input_preprocessing_specs=input_preprocessing_specs,
input_preprocessing_specs=input_preprocessing_specs, # type: ignore
)
ground_truth_yield = surrogates.map(data_model)

Expand Down
5 changes: 5 additions & 0 deletions bofire/data_models/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
try:
# in case of the minimal installation these imports are not available
from bofire.data_models.kernels.api import AnyKernel, Kernel
from bofire.data_models.molfeatures.api import ( # noqa: F401
AnyMolFeatures,
MolFeatures,
)
from bofire.data_models.objectives.api import AnyObjective, Objective
from bofire.data_models.outlier_detection.api import (
AnyOutlierDetection,
Expand Down Expand Up @@ -49,6 +53,7 @@
AnyObjective,
AnyPrior,
AnyStrategy,
AnyMolFeatures,
Domain,
]
except ImportError:
Expand Down
59 changes: 50 additions & 9 deletions bofire/data_models/domain/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@
ContinuousOutput,
DiscreteInput,
Input,
MolecularInput,
Output,
TInputTransformSpecs,
)
from bofire.data_models.molfeatures.api import MolFeatures
from bofire.data_models.objectives.api import AbstractObjective, Objective

FeatureSequence = Union[List[AnyFeature], Tuple[AnyFeature]]
Expand Down Expand Up @@ -348,6 +350,18 @@ def _get_transform_info(
[f"{feat.key}{_CAT_SEP}{d}" for d in feat.descriptors]
)
counter += len(feat.descriptors)
elif isinstance(specs[feat.key], MolFeatures):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also include this in the tests?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added MolecularInput to test_inputs_get_transform_info

assert isinstance(feat, MolecularInput)
descriptor_names = specs[
feat.key
].get_descriptor_names() # type: ignore
features2idx[feat.key] = tuple(
(np.array(range(len(descriptor_names))) + counter).tolist()
)
features2names[feat.key] = tuple(
[f"{feat.key}{_CAT_SEP}{d}" for d in descriptor_names]
)
counter += len(descriptor_names)
return features2idx, features2names

def transform(
Expand Down Expand Up @@ -383,6 +397,9 @@ def transform(
elif specs[feat.key] == CategoricalEncodingEnum.DESCRIPTOR:
assert isinstance(feat, CategoricalDescriptorInput)
transformed.append(feat.to_descriptor_encoding(s))
elif isinstance(specs[feat.key], MolFeatures):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also include this in the tests?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Testing for this can be found in test_inputs_transform_molecular. This only tests the transform in the forward direction. This is kept separate from test_inputs_transform for now because the inverse transform for molecular inputs is not implemented yet.

assert isinstance(feat, MolecularInput)
transformed.append(feat.to_descriptor_encoding(specs[feat.key], s)) # type: ignore
return pd.concat(transformed, axis=1)

def inverse_transform(
Expand Down Expand Up @@ -420,6 +437,7 @@ def inverse_transform(
elif specs[feat.key] == CategoricalEncodingEnum.DESCRIPTOR:
assert isinstance(feat, CategoricalDescriptorInput)
transformed.append(feat.from_descriptor_encoding(experiments))

return pd.concat(transformed, axis=1)

def _validate_transform_specs(self, specs: TInputTransformSpecs):
Expand All @@ -429,24 +447,47 @@ def _validate_transform_specs(self, specs: TInputTransformSpecs):
specs (TInputTransformSpecs): Transform specs to be validated.
"""
# first check that the keys in the specs dict are also valid feature keys
if len(set(specs.keys()) - set(self.get_keys(CategoricalInput))) > 0:
if (
len(
set(specs.keys())
- set(self.get_keys(CategoricalInput))
- set(self.get_keys(MolecularInput))
)
> 0
):
raise ValueError("Unknown features specified in transform specs.")
# next check that all values are of type CategoricalEncodingEnum
# next check that all values are of type CategoricalEncodingEnum or MolFeatures
if not (
all(isinstance(enc, CategoricalEncodingEnum) for enc in specs.values())
all(
isinstance(enc, (CategoricalEncodingEnum, MolFeatures))
for enc in specs.values()
)
):
raise ValueError("Unknown transform specified.")
# next check that only Categoricalwithdescriptor have the value DESCRIPTOR
descriptor_keys = [
key
for key, value in specs.items()
if value == CategoricalEncodingEnum.DESCRIPTOR
]
# next check that only CategoricalDescriptorInput can have the value DESCRIPTOR
descriptor_keys = []
for key, value in specs.items():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure: does this also raise an error if one assigns a molfeatures transform to a CategoricalDescriptorInput?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you then also write tests for the additions in this method?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Normally the validation of input_preprocessing_specs would have caught what you describe, I think. Nonetheless, `_validate_transform_specs` has been improved to make sure that it catches these types of issues too. It has been changed from how I had it before, so that CategoricalEncodingEnum.DESCRIPTOR and MolFeatures are now checked separately; this avoids a bug that can occur when a user makes different mistakes in the transform types of multiple categorical inputs. Furthermore, MolecularInputs now require a MolFeatures in the transform specs. Hopefully that's fine with you too.

if value == CategoricalEncodingEnum.DESCRIPTOR:
descriptor_keys.append(key)
if (
len(set(descriptor_keys) - set(self.get_keys(CategoricalDescriptorInput)))
> 0
):
raise ValueError("Wrong features types assigned to DESCRIPTOR transform.")
# next check if MolFeatures have been assigned to feature types other than MolecularInput
molfeature_keys = []
for key, value in specs.items():
if isinstance(value, MolFeatures):
molfeature_keys.append(key)
if len(set(molfeature_keys) - set(self.get_keys(MolecularInput))) > 0:
raise ValueError("Wrong features types assigned to MolFeatures transforms.")
# next check that all MolecularInput have MolFeatures transforms
for feat in self.get(includes=[MolecularInput]):
mol_encoding = specs.get(feat.key)
if mol_encoding is None:
raise ValueError("No transform assigned to MolecularInput.")
elif not isinstance(mol_encoding, MolFeatures):
raise ValueError("Incorrect transform assigned to MolecularInput.")
return specs

def get_bounds(
Expand Down
4 changes: 2 additions & 2 deletions bofire/data_models/features/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from bofire.data_models.base import BaseModel
from bofire.data_models.enum import CategoricalEncodingEnum
from bofire.data_models.molfeatures.api import AnyMolFeatures
from bofire.data_models.surrogates.scaler import ScalerEnum

TTransform = Union[CategoricalEncodingEnum, ScalerEnum]
Expand Down Expand Up @@ -141,7 +142,7 @@ def is_categorical(s: pd.Series, categories: List[str]):
return sum(s.isin(categories)) == len(s)


TInputTransformSpecs = Dict[str, CategoricalEncodingEnum]
TInputTransformSpecs = Dict[str, Union[CategoricalEncodingEnum, AnyMolFeatures]]


TDescriptors = Annotated[List[str], Field(min_items=1)]
Expand All @@ -158,5 +159,4 @@ def is_categorical(s: pd.Series, categories: List[str]):

TDiscreteVals = Annotated[List[float], Field(min_items=1)]


_CAT_SEP = "_"
83 changes: 39 additions & 44 deletions bofire/data_models/features/molecular.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,10 @@

import pandas as pd

from bofire.data_models.features.categorical import _CAT_SEP, TTransform
from bofire.data_models.features.categorical import _CAT_SEP
from bofire.data_models.features.feature import Input
from bofire.utils.cheminformatics import (
smiles2bag_of_characters,
smiles2fingerprints,
smiles2fragments,
smiles2mol,
)
from bofire.data_models.molfeatures.api import AnyMolFeatures
from bofire.utils.cheminformatics import smiles2mol


class MolecularInput(Input):
Expand All @@ -21,55 +17,54 @@ def validate_experimental(
) -> pd.Series:
for smi in values:
smiles2mol(smi)

return values

def validate_candidental(self, values: pd.Series) -> pd.Series:
    """Validate candidate values by passing each one through `smiles2mol`.

    Args:
        values (pd.Series): Candidate values to check; presumably SMILES
            strings (TODO confirm against callers).

    Returns:
        pd.Series: The input values, returned unchanged.
    """
    for smi in values:
        # The result is discarded; presumably `smiles2mol` raises on an
        # unparsable value, which is what makes this a validation -- confirm.
        smiles2mol(smi)
    return values

def fixed_value(self, transform_type: Optional[TTransform] = None) -> None:
return None

def is_fixed(self) -> bool:
return False

# TODO: model descriptors as pydantic class
def to_fingerprints(
self, values: pd.Series, bond_radius: int = 5, n_bits: int = 2048
) -> pd.DataFrame:
# validate it
data = smiles2fingerprints(
values.to_list(), bond_radius=bond_radius, n_bits=n_bits
)
return pd.DataFrame(
data=data,
columns=[f"{self.key}{_CAT_SEP}{i}" for i in range(data.shape[1])],
)

def to_bag_of_characters(
self, values: pd.Series, max_ngram: int = 5
) -> pd.DataFrame:
# todo: add selfies later
data = smiles2bag_of_characters(values.to_list(), max_ngram=max_ngram)
return pd.DataFrame(
data=data,
columns=[f"{self.key}{_CAT_SEP}{i}" for i in range(data.shape[1])],
)

def to_fragments(self, values: pd.Series):
data = smiles2fragments(values.to_list())
return pd.DataFrame(
data=data,
columns=[f"{self.key}{_CAT_SEP}{i}" for i in range(data.shape[1])],
)
def fixed_value(self, transform_type: Optional[AnyMolFeatures] = None) -> None:
return None

def sample(self, n: int) -> pd.Series:
raise ValueError("Sampling not supported for `MolecularInput`.")
raise ValueError("Sampling not supported for `MolecularInput`")

def get_bounds(
self, transform_type: TTransform, values: pd.Series
self, transform_type: AnyMolFeatures, values: pd.Series
) -> Tuple[List[float], List[float]]:
# TODO: this is only needed for optimization for which we need also
# MolecularCategorical, this will be added later.
raise NotImplementedError("`get_bounds` not yet implemented.")
if values is None:
raise NotImplementedError(
"`values` is currently required for `MolecularInput`"
)
else:
data = self.to_descriptor_encoding(transform_type, values)

lower = data.min(axis=0).values.tolist()
upper = data.max(axis=0).values.tolist()

return lower, upper

simonsung06 marked this conversation as resolved.
Show resolved Hide resolved
def to_descriptor_encoding(
    self, transform_type: AnyMolFeatures, values: pd.Series
) -> pd.DataFrame:
    """Converts values to descriptor encoding.

    Args:
        transform_type (AnyMolFeatures): Molecular featurizer; its
            `get_descriptor_values` and `get_descriptor_names` methods
            supply the encoded data and the column names.
        values (pd.Series): Values to transform.

    Returns:
        pd.DataFrame: Descriptor encoded dataframe. Columns are named
            `<feature key><_CAT_SEP><descriptor name>` and the index is
            taken from `values`.
    """
    descriptor_values = transform_type.get_descriptor_values(values)

    # Prefix every descriptor name with this feature's key so the columns
    # can be attributed to this feature.
    descriptor_values.columns = [
        f"{self.key}{_CAT_SEP}{d}" for d in transform_type.get_descriptor_names()
    ]
    # Re-align rows with the input series.
    # NOTE(review): columns and index are assigned on the frame returned by
    # the featurizer itself, not on a copy -- confirm that is intended.
    descriptor_values.index = values.index

    return descriptor_values
4 changes: 4 additions & 0 deletions bofire/data_models/kernels/aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from bofire.data_models.kernels.categorical import HammondDistanceKernel
from bofire.data_models.kernels.continuous import LinearKernel, MaternKernel, RBFKernel
from bofire.data_models.kernels.kernel import Kernel
from bofire.data_models.kernels.molecular import TanimotoKernel
from bofire.data_models.priors.api import AnyPrior


Expand All @@ -14,6 +15,7 @@ class AdditiveKernel(Kernel):
MaternKernel,
LinearKernel,
HammondDistanceKernel,
TanimotoKernel,
"AdditiveKernel",
"MultiplicativeKernel",
"ScaleKernel",
Expand All @@ -31,6 +33,7 @@ class MultiplicativeKernel(Kernel):
LinearKernel,
HammondDistanceKernel,
AdditiveKernel,
TanimotoKernel,
"MultiplicativeKernel",
"ScaleKernel",
]
Expand All @@ -46,6 +49,7 @@ class ScaleKernel(Kernel):
HammondDistanceKernel,
AdditiveKernel,
MultiplicativeKernel,
TanimotoKernel,
"ScaleKernel",
]
outputscale_prior: Optional[AnyPrior] = None
Expand Down
10 changes: 5 additions & 5 deletions bofire/data_models/kernels/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,9 @@
RBFKernel,
)
from bofire.data_models.kernels.kernel import Kernel
from bofire.data_models.kernels.molecular import MolecularKernel, TanimotoKernel

AbstractKernel = Union[
Kernel,
CategoricalKernel,
ContinuousKernel,
]
AbstractKernel = Union[Kernel, CategoricalKernel, ContinuousKernel, MolecularKernel]

AnyContinuousKernel = Union[
MaternKernel,
Expand All @@ -31,6 +28,8 @@

AnyCategoricalKernal = HammondDistanceKernel

AnyMolecularKernel = TanimotoKernel

AnyKernel = Union[
AdditiveKernel,
MultiplicativeKernel,
Expand All @@ -39,4 +38,5 @@
LinearKernel,
MaternKernel,
RBFKernel,
TanimotoKernel,
]
12 changes: 12 additions & 0 deletions bofire/data_models/kernels/molecular.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from typing import Literal

from bofire.data_models.kernels.kernel import Kernel


# Adds no fields or behavior on top of `Kernel`. Presumably a common base /
# marker type for kernels acting on molecular inputs (`TanimotoKernel` in
# this module derives from it) -- confirm.
class MolecularKernel(Kernel):
    pass


# Data model describing a Tanimoto kernel configuration.
class TanimotoKernel(MolecularKernel):
    # Tag fixed to the class name; presumably used to tell kernel models
    # apart when they appear in a union such as `AnyKernel` -- confirm.
    type: Literal["TanimotoKernel"] = "TanimotoKernel"
    # Enabled by default. Presumably toggles automatic relevance
    # determination in the kernel built from this model -- TODO confirm.
    ard: bool = True
Empty file.
19 changes: 19 additions & 0 deletions bofire/data_models/molfeatures/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from typing import Union

from bofire.data_models.molfeatures.molfeatures import ( # BagOfCharacters
Fingerprints,
FingerprintsFragments,
Fragments,
MolFeatures,
MordredDescriptors,
)

AbstractMolFeatures = MolFeatures

AnyMolFeatures = Union[
Fingerprints,
Fragments,
FingerprintsFragments,
# BagOfCharacters,
MordredDescriptors,
]
Loading
Loading