-
Notifications
You must be signed in to change notification settings - Fork 22
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Molecular input #234
Molecular input #234
Changes from 10 commits
fff267b
8f40ee7
cab7db2
050a526
4121c59
29356d2
6df508b
dfe64f2
208af95
225ee76
e252160
2114de5
6c6d8cd
bc62b87
fca6a94
06f4844
fd12a81
7d49979
189fcae
781f0dd
22441fa
857be19
1254648
ed83a3c
a139854
49013c3
c734620
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,9 +22,11 @@ | |
ContinuousOutput, | ||
DiscreteInput, | ||
Input, | ||
MolecularInput, | ||
Output, | ||
TInputTransformSpecs, | ||
) | ||
from bofire.data_models.molfeatures.api import MolFeatures | ||
from bofire.data_models.objectives.api import AbstractObjective, Objective | ||
|
||
FeatureSequence = Union[List[AnyFeature], Tuple[AnyFeature]] | ||
|
@@ -348,6 +350,16 @@ def _get_transform_info( | |
[f"{feat.key}{_CAT_SEP}{d}" for d in feat.descriptors] | ||
) | ||
counter += len(feat.descriptors) | ||
elif isinstance(specs[feat.key], MolFeatures): | ||
assert isinstance(feat, MolecularInput) | ||
descriptor_names = specs[feat.key].get_descriptor_names() | ||
features2idx[feat.key] = tuple( | ||
(np.array(range(len(descriptor_names))) + counter).tolist() | ||
) | ||
features2names[feat.key] = tuple( | ||
[f"{feat.key}{_CAT_SEP}{d}" for d in descriptor_names] | ||
) | ||
counter += len(descriptor_names) | ||
return features2idx, features2names | ||
|
||
def transform( | ||
|
@@ -383,6 +395,9 @@ def transform( | |
elif specs[feat.key] == CategoricalEncodingEnum.DESCRIPTOR: | ||
assert isinstance(feat, CategoricalDescriptorInput) | ||
transformed.append(feat.to_descriptor_encoding(s)) | ||
elif isinstance(specs[feat.key], MolFeatures): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you also include this in the tests? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Testing for this can be found in |
||
assert isinstance(feat, MolecularInput) | ||
transformed.append(feat.to_descriptor_encoding(specs[feat.key], s)) | ||
return pd.concat(transformed, axis=1) | ||
|
||
def inverse_transform( | ||
|
@@ -420,6 +435,7 @@ def inverse_transform( | |
elif specs[feat.key] == CategoricalEncodingEnum.DESCRIPTOR: | ||
assert isinstance(feat, CategoricalDescriptorInput) | ||
transformed.append(feat.from_descriptor_encoding(experiments)) | ||
|
||
return pd.concat(transformed, axis=1) | ||
|
||
def _validate_transform_specs(self, specs: TInputTransformSpecs): | ||
|
@@ -429,21 +445,36 @@ def _validate_transform_specs(self, specs: TInputTransformSpecs): | |
specs (TInputTransformSpecs): Transform specs to be validated. | ||
""" | ||
# first check that the keys in the specs dict are correct also correct feature keys | ||
if len(set(specs.keys()) - set(self.get_keys(CategoricalInput))) > 0: | ||
if ( | ||
len( | ||
set(specs.keys()) | ||
- set(self.get_keys(CategoricalInput)) | ||
- set(self.get_keys(MolecularInput)) | ||
) | ||
> 0 | ||
): | ||
raise ValueError("Unknown features specified in transform specs.") | ||
# next check that all values are of type CategoricalEncodingEnum | ||
# next check that all values are of type CategoricalEncodingEnum or MolFeatures | ||
if not ( | ||
all(isinstance(enc, CategoricalEncodingEnum) for enc in specs.values()) | ||
all( | ||
isinstance(enc, (CategoricalEncodingEnum, MolFeatures)) | ||
for enc in specs.values() | ||
) | ||
): | ||
raise ValueError("Unknown transform specified.") | ||
# next check that only Categoricalwithdescriptor have the value DESCRIPTOR | ||
descriptor_keys = [ | ||
key | ||
for key, value in specs.items() | ||
if value == CategoricalEncodingEnum.DESCRIPTOR | ||
] | ||
# next check that only Categoricalwithdescriptor have the value DESCRIPTOR or are of type MolFeatures | ||
descriptor_keys = [] | ||
for key, value in specs.items(): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am not sure: does this also raise an error if one assigns a MolFeatures transform to a CategoricalDescriptorInput? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you then also write tests for the additions in this method? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Normally the validation of input_processing_specs would have caught what you describe, I think. But nonetheless, `_validate_transform_specs` has been improved to make sure that it catches these types of issues too. It has been changed from how I had it before, so that the checks for CategoricalEncodingEnum.DESCRIPTOR and MolFeatures are separate; this avoids a bug that can occur in case of user errors, e.g. when there are multiple categorical inputs with various mistakes in their transform types. Furthermore, MolecularInputs require a MolFeatures in the transform specs. Hopefully that's fine with you too. |
||
if value == CategoricalEncodingEnum.DESCRIPTOR or ( | ||
isinstance(value, MolFeatures) | ||
): | ||
descriptor_keys.append(key) | ||
if ( | ||
len(set(descriptor_keys) - set(self.get_keys(CategoricalDescriptorInput))) | ||
len( | ||
set(descriptor_keys) | ||
- set(self.get_keys(CategoricalDescriptorInput)) | ||
- set(self.get_keys(MolecularInput)) | ||
) | ||
> 0 | ||
): | ||
raise ValueError("Wrong features types assigned to DESCRIPTOR transform.") | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,5 @@ | ||
from enum import Enum | ||
|
||
|
||
class SamplingMethodEnum(Enum): | ||
UNIFORM = "UNIFORM" | ||
SOBOL = "SOBOL" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,75 +1,82 @@ | ||
from typing import ClassVar, List, Literal, Optional, Tuple | ||
|
||
import pandas as pd | ||
from pydantic import Field | ||
|
||
from bofire.data_models.features.categorical import _CAT_SEP, TTransform | ||
from bofire.data_models.features.feature import Input | ||
from bofire.utils.cheminformatics import ( | ||
smiles2bag_of_characters, | ||
smiles2fingerprints, | ||
smiles2fragments, | ||
smiles2mol, | ||
) | ||
from bofire.data_models.features.categorical import _CAT_SEP | ||
from bofire.data_models.features.feature import Input, TMolecularVals | ||
from bofire.data_models.molfeatures.api import AnyMolFeatures | ||
from bofire.utils.cheminformatics import smiles2mol | ||
|
||
|
||
class MolecularInput(Input): | ||
type: Literal["MolecularInput"] = "MolecularInput" | ||
descriptor_values: TMolecularVals = Field(default_factory=list) | ||
jduerholt marked this conversation as resolved.
Show resolved
Hide resolved
|
||
order: ClassVar[int] = 6 | ||
|
||
def validate_experimental( | ||
self, values: pd.Series, strict: bool = False | ||
) -> pd.Series: | ||
for smi in values: | ||
smiles2mol(smi) | ||
|
||
return values | ||
|
||
def validate_candidental(self, values: pd.Series) -> pd.Series: | ||
for smi in values: | ||
smiles2mol(smi) | ||
return values | ||
|
||
def fixed_value(self, transform_type: Optional[TTransform] = None) -> None: | ||
return None | ||
|
||
def is_fixed(self) -> bool: | ||
return False | ||
|
||
# TODO: model descriptors as pydantic class | ||
def to_fingerprints( | ||
self, values: pd.Series, bond_radius: int = 5, n_bits: int = 2048 | ||
) -> pd.DataFrame: | ||
# validate it | ||
data = smiles2fingerprints( | ||
values.to_list(), bond_radius=bond_radius, n_bits=n_bits | ||
) | ||
return pd.DataFrame( | ||
data=data, | ||
columns=[f"{self.key}{_CAT_SEP}{i}" for i in range(data.shape[1])], | ||
) | ||
|
||
def to_bag_of_characters( | ||
self, values: pd.Series, max_ngram: int = 5 | ||
) -> pd.DataFrame: | ||
# todo: add selfies later | ||
data = smiles2bag_of_characters(values.to_list(), max_ngram=max_ngram) | ||
return pd.DataFrame( | ||
data=data, | ||
columns=[f"{self.key}{_CAT_SEP}{i}" for i in range(data.shape[1])], | ||
) | ||
|
||
def to_fragments(self, values: pd.Series): | ||
data = smiles2fragments(values.to_list()) | ||
return pd.DataFrame( | ||
data=data, | ||
columns=[f"{self.key}{_CAT_SEP}{i}" for i in range(data.shape[1])], | ||
) | ||
def fixed_value(self, transform_type: Optional[AnyMolFeatures] = None) -> None: | ||
return None | ||
|
||
def sample(self, n: int) -> pd.Series: | ||
raise ValueError("Sampling not supported for `MolecularInput`.") | ||
raise ValueError("Sampling not supported for `MolecularInput`") | ||
|
||
def get_bounds( | ||
self, transform_type: TTransform, values: pd.Series | ||
self, transform_type: AnyMolFeatures, values: pd.Series | ||
) -> Tuple[List[float], List[float]]: | ||
# TODO: this is only needed for optimization for which we need also | ||
# MolecularCategorical, this will be added later. | ||
raise NotImplementedError("`get_bounds` not yet implemented.") | ||
if len(self.descriptor_values) == 0: | ||
self.descriptor_values = transform_type.get_descriptor_values( | ||
values | ||
).values.tolist() | ||
|
||
if values is None: | ||
raise NotImplementedError( | ||
"`values` is currently required for `MolecularInput`" | ||
) | ||
else: | ||
data = self.to_descriptor_encoding(transform_type, values) | ||
|
||
lower = data.min(axis=0).values.tolist() | ||
upper = data.max(axis=0).values.tolist() | ||
|
||
return lower, upper | ||
|
||
simonsung06 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
def to_descriptor_encoding( | ||
self, transform_type: AnyMolFeatures, values: pd.Series | ||
) -> pd.DataFrame: | ||
"""Converts values to descriptor encoding. | ||
|
||
Args: | ||
values (pd.Series): Values to transform. | ||
|
||
Returns: | ||
pd.DataFrame: Descriptor encoded dataframe. | ||
""" | ||
if len(self.descriptor_values) == 0: | ||
self.descriptor_values = transform_type.get_descriptor_values( | ||
values | ||
).values.tolist() | ||
|
||
return pd.DataFrame( | ||
data=self.descriptor_values, | ||
columns=[ | ||
f"{self.key}{_CAT_SEP}{d}" | ||
for d in transform_type.get_descriptor_names() | ||
], | ||
index=values.index, | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
from typing import Literal | ||
|
||
from bofire.data_models.kernels.kernel import Kernel | ||
|
||
|
||
class MolecularKernel(Kernel): | ||
pass | ||
|
||
|
||
class TanimotoKernel(MolecularKernel): | ||
type: Literal["TanimotoKernel"] = "TanimotoKernel" | ||
ard: bool = True |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
from typing import Union | ||
|
||
from bofire.data_models.molfeatures.molfeatures import ( # BagOfCharacters | ||
Fingerprints, | ||
FingerprintsFragments, | ||
Fragments, | ||
MolFeatures, | ||
MordredDescriptors, | ||
) | ||
|
||
AbstractMolFeatures = MolFeatures | ||
|
||
AnyMolFeatures = Union[ | ||
Fingerprints, | ||
Fragments, | ||
FingerprintsFragments, | ||
# BagOfCharacters, | ||
MordredDescriptors, | ||
] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you also include this in the tests?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added MolecularInput to test_inputs_get_transform_info