diff --git a/bofire/benchmarks/multi.py b/bofire/benchmarks/multi.py index fb8316054..ca08b4569 100644 --- a/bofire/benchmarks/multi.py +++ b/bofire/benchmarks/multi.py @@ -479,7 +479,7 @@ def __init__( data_model = SingleTaskGPSurrogate( inputs=Inputs(features=inputs), outputs=Outputs(features=[outputs[0]]), - input_preprocessing_specs=input_preprocessing_specs, + input_preprocessing_specs=input_preprocessing_specs, # type: ignore ) ground_truth_yield = surrogates.map(data_model) diff --git a/bofire/data_models/api.py b/bofire/data_models/api.py index ee54b9358..e01662049 100644 --- a/bofire/data_models/api.py +++ b/bofire/data_models/api.py @@ -17,6 +17,10 @@ try: # in case of the minimal installation these import are not available from bofire.data_models.kernels.api import AnyKernel, Kernel + from bofire.data_models.molfeatures.api import ( # noqa: F401 + AnyMolFeatures, + MolFeatures, + ) from bofire.data_models.objectives.api import AnyObjective, Objective from bofire.data_models.outlier_detection.api import ( AnyOutlierDetection, @@ -49,6 +53,7 @@ AnyObjective, AnyPrior, AnyStrategy, + AnyMolFeatures, Domain, ] except ImportError: diff --git a/bofire/data_models/domain/features.py b/bofire/data_models/domain/features.py index e1d9f8788..e9d0d44f5 100644 --- a/bofire/data_models/domain/features.py +++ b/bofire/data_models/domain/features.py @@ -22,9 +22,11 @@ ContinuousOutput, DiscreteInput, Input, + MolecularInput, Output, TInputTransformSpecs, ) +from bofire.data_models.molfeatures.api import MolFeatures from bofire.data_models.objectives.api import AbstractObjective, Objective FeatureSequence = Union[List[AnyFeature], Tuple[AnyFeature]] @@ -348,6 +350,18 @@ def _get_transform_info( [f"{feat.key}{_CAT_SEP}{d}" for d in feat.descriptors] ) counter += len(feat.descriptors) + elif isinstance(specs[feat.key], MolFeatures): + assert isinstance(feat, MolecularInput) + descriptor_names = specs[ + feat.key + ].get_descriptor_names() # type: ignore + 
features2idx[feat.key] = tuple( + (np.array(range(len(descriptor_names))) + counter).tolist() + ) + features2names[feat.key] = tuple( + [f"{feat.key}{_CAT_SEP}{d}" for d in descriptor_names] + ) + counter += len(descriptor_names) return features2idx, features2names def transform( @@ -383,6 +397,9 @@ def transform( elif specs[feat.key] == CategoricalEncodingEnum.DESCRIPTOR: assert isinstance(feat, CategoricalDescriptorInput) transformed.append(feat.to_descriptor_encoding(s)) + elif isinstance(specs[feat.key], MolFeatures): + assert isinstance(feat, MolecularInput) + transformed.append(feat.to_descriptor_encoding(specs[feat.key], s)) # type: ignore return pd.concat(transformed, axis=1) def inverse_transform( @@ -420,6 +437,7 @@ def inverse_transform( elif specs[feat.key] == CategoricalEncodingEnum.DESCRIPTOR: assert isinstance(feat, CategoricalDescriptorInput) transformed.append(feat.from_descriptor_encoding(experiments)) + return pd.concat(transformed, axis=1) def _validate_transform_specs(self, specs: TInputTransformSpecs): @@ -429,24 +447,47 @@ def _validate_transform_specs(self, specs: TInputTransformSpecs): specs (TInputTransformSpecs): Transform specs to be validated. 
""" # first check that the keys in the specs dict are correct also correct feature keys - if len(set(specs.keys()) - set(self.get_keys(CategoricalInput))) > 0: + if ( + len( + set(specs.keys()) + - set(self.get_keys(CategoricalInput)) + - set(self.get_keys(MolecularInput)) + ) + > 0 + ): raise ValueError("Unknown features specified in transform specs.") - # next check that all values are of type CategoricalEncodingEnum + # next check that all values are of type CategoricalEncodingEnum or MolFeatures if not ( - all(isinstance(enc, CategoricalEncodingEnum) for enc in specs.values()) + all( + isinstance(enc, (CategoricalEncodingEnum, MolFeatures)) + for enc in specs.values() + ) ): raise ValueError("Unknown transform specified.") - # next check that only Categoricalwithdescriptor have the value DESCRIPTOR - descriptor_keys = [ - key - for key, value in specs.items() - if value == CategoricalEncodingEnum.DESCRIPTOR - ] + # next check that only CategoricalDescriptorInput can have the value DESCRIPTOR + descriptor_keys = [] + for key, value in specs.items(): + if value == CategoricalEncodingEnum.DESCRIPTOR: + descriptor_keys.append(key) if ( len(set(descriptor_keys) - set(self.get_keys(CategoricalDescriptorInput))) > 0 ): raise ValueError("Wrong features types assigned to DESCRIPTOR transform.") + # next check if MolFeatures have been assigned to feature types other than MolecularInput + molfeature_keys = [] + for key, value in specs.items(): + if isinstance(value, MolFeatures): + molfeature_keys.append(key) + if len(set(molfeature_keys) - set(self.get_keys(MolecularInput))) > 0: + raise ValueError("Wrong features types assigned to MolFeatures transforms.") + # next check that all MolecularInput have MolFeatures transforms + for feat in self.get(includes=[MolecularInput]): + mol_encoding = specs.get(feat.key) + if mol_encoding is None: + raise ValueError("No transform assigned to MolecularInput.") + elif not isinstance(mol_encoding, MolFeatures): + raise 
ValueError("Incorrect transform assigned to MolecularInput.") return specs def get_bounds( diff --git a/bofire/data_models/features/feature.py b/bofire/data_models/features/feature.py index 7d9d0140a..1d9c3f309 100644 --- a/bofire/data_models/features/feature.py +++ b/bofire/data_models/features/feature.py @@ -7,6 +7,7 @@ from bofire.data_models.base import BaseModel from bofire.data_models.enum import CategoricalEncodingEnum +from bofire.data_models.molfeatures.api import AnyMolFeatures from bofire.data_models.surrogates.scaler import ScalerEnum TTransform = Union[CategoricalEncodingEnum, ScalerEnum] @@ -141,7 +142,7 @@ def is_categorical(s: pd.Series, categories: List[str]): return sum(s.isin(categories)) == len(s) -TInputTransformSpecs = Dict[str, CategoricalEncodingEnum] +TInputTransformSpecs = Dict[str, Union[CategoricalEncodingEnum, AnyMolFeatures]] TDescriptors = Annotated[List[str], Field(min_items=1)] @@ -158,5 +159,4 @@ def is_categorical(s: pd.Series, categories: List[str]): TDiscreteVals = Annotated[List[float], Field(min_items=1)] - _CAT_SEP = "_" diff --git a/bofire/data_models/features/molecular.py b/bofire/data_models/features/molecular.py index 818143f71..3100fdc07 100644 --- a/bofire/data_models/features/molecular.py +++ b/bofire/data_models/features/molecular.py @@ -2,14 +2,10 @@ import pandas as pd -from bofire.data_models.features.categorical import _CAT_SEP, TTransform +from bofire.data_models.features.categorical import _CAT_SEP from bofire.data_models.features.feature import Input -from bofire.utils.cheminformatics import ( - smiles2bag_of_characters, - smiles2fingerprints, - smiles2fragments, - smiles2mol, -) +from bofire.data_models.molfeatures.api import AnyMolFeatures +from bofire.utils.cheminformatics import smiles2mol class MolecularInput(Input): @@ -21,6 +17,7 @@ def validate_experimental( ) -> pd.Series: for smi in values: smiles2mol(smi) + return values def validate_candidental(self, values: pd.Series) -> pd.Series: @@ -28,48 +25,46 
@@ def validate_candidental(self, values: pd.Series) -> pd.Series: smiles2mol(smi) return values - def fixed_value(self, transform_type: Optional[TTransform] = None) -> None: - return None - def is_fixed(self) -> bool: return False - # TODO: model descriptors as pydantic class - def to_fingerprints( - self, values: pd.Series, bond_radius: int = 5, n_bits: int = 2048 - ) -> pd.DataFrame: - # validate it - data = smiles2fingerprints( - values.to_list(), bond_radius=bond_radius, n_bits=n_bits - ) - return pd.DataFrame( - data=data, - columns=[f"{self.key}{_CAT_SEP}{i}" for i in range(data.shape[1])], - ) - - def to_bag_of_characters( - self, values: pd.Series, max_ngram: int = 5 - ) -> pd.DataFrame: - # todo: add selfies later - data = smiles2bag_of_characters(values.to_list(), max_ngram=max_ngram) - return pd.DataFrame( - data=data, - columns=[f"{self.key}{_CAT_SEP}{i}" for i in range(data.shape[1])], - ) - - def to_fragments(self, values: pd.Series): - data = smiles2fragments(values.to_list()) - return pd.DataFrame( - data=data, - columns=[f"{self.key}{_CAT_SEP}{i}" for i in range(data.shape[1])], - ) + def fixed_value(self, transform_type: Optional[AnyMolFeatures] = None) -> None: + return None def sample(self, n: int) -> pd.Series: - raise ValueError("Sampling not supported for `MolecularInput`.") + raise ValueError("Sampling not supported for `MolecularInput`") def get_bounds( - self, transform_type: TTransform, values: pd.Series + self, transform_type: AnyMolFeatures, values: pd.Series ) -> Tuple[List[float], List[float]]: - # TODO: this is only needed for optimization for which we need also - # MolecularCategorical, this will be added later. 
- raise NotImplementedError("`get_bounds` not yet implemented.") + if values is None: + raise NotImplementedError( + "`values` is currently required for `MolecularInput`" + ) + else: + data = self.to_descriptor_encoding(transform_type, values) + + lower = data.min(axis=0).values.tolist() + upper = data.max(axis=0).values.tolist() + + return lower, upper + + def to_descriptor_encoding( + self, transform_type: AnyMolFeatures, values: pd.Series + ) -> pd.DataFrame: + """Converts values to descriptor encoding. + + Args: + values (pd.Series): Values to transform. + + Returns: + pd.DataFrame: Descriptor encoded dataframe. + """ + descriptor_values = transform_type.get_descriptor_values(values) + + descriptor_values.columns = [ + f"{self.key}{_CAT_SEP}{d}" for d in transform_type.get_descriptor_names() + ] + descriptor_values.index = values.index + + return descriptor_values diff --git a/bofire/data_models/kernels/aggregation.py b/bofire/data_models/kernels/aggregation.py index 925c358b7..21c4bef16 100644 --- a/bofire/data_models/kernels/aggregation.py +++ b/bofire/data_models/kernels/aggregation.py @@ -3,6 +3,7 @@ from bofire.data_models.kernels.categorical import HammondDistanceKernel from bofire.data_models.kernels.continuous import LinearKernel, MaternKernel, RBFKernel from bofire.data_models.kernels.kernel import Kernel +from bofire.data_models.kernels.molecular import TanimotoKernel from bofire.data_models.priors.api import AnyPrior @@ -14,6 +15,7 @@ class AdditiveKernel(Kernel): MaternKernel, LinearKernel, HammondDistanceKernel, + TanimotoKernel, "AdditiveKernel", "MultiplicativeKernel", "ScaleKernel", @@ -31,6 +33,7 @@ class MultiplicativeKernel(Kernel): LinearKernel, HammondDistanceKernel, AdditiveKernel, + TanimotoKernel, "MultiplicativeKernel", "ScaleKernel", ] @@ -46,6 +49,7 @@ class ScaleKernel(Kernel): HammondDistanceKernel, AdditiveKernel, MultiplicativeKernel, + TanimotoKernel, "ScaleKernel", ] outputscale_prior: Optional[AnyPrior] = None diff --git 
a/bofire/data_models/kernels/api.py b/bofire/data_models/kernels/api.py index 32643ff17..4f9eaaa63 100644 --- a/bofire/data_models/kernels/api.py +++ b/bofire/data_models/kernels/api.py @@ -16,12 +16,9 @@ RBFKernel, ) from bofire.data_models.kernels.kernel import Kernel +from bofire.data_models.kernels.molecular import MolecularKernel, TanimotoKernel -AbstractKernel = Union[ - Kernel, - CategoricalKernel, - ContinuousKernel, -] +AbstractKernel = Union[Kernel, CategoricalKernel, ContinuousKernel, MolecularKernel] AnyContinuousKernel = Union[ MaternKernel, @@ -31,6 +28,8 @@ AnyCategoricalKernal = HammondDistanceKernel +AnyMolecularKernel = TanimotoKernel + AnyKernel = Union[ AdditiveKernel, MultiplicativeKernel, @@ -39,4 +38,5 @@ LinearKernel, MaternKernel, RBFKernel, + TanimotoKernel, ] diff --git a/bofire/data_models/kernels/molecular.py b/bofire/data_models/kernels/molecular.py new file mode 100644 index 000000000..522986f2b --- /dev/null +++ b/bofire/data_models/kernels/molecular.py @@ -0,0 +1,12 @@ +from typing import Literal + +from bofire.data_models.kernels.kernel import Kernel + + +class MolecularKernel(Kernel): + pass + + +class TanimotoKernel(MolecularKernel): + type: Literal["TanimotoKernel"] = "TanimotoKernel" + ard: bool = True diff --git a/bofire/data_models/molfeatures/__init__.py b/bofire/data_models/molfeatures/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/bofire/data_models/molfeatures/api.py b/bofire/data_models/molfeatures/api.py new file mode 100644 index 000000000..2693580b3 --- /dev/null +++ b/bofire/data_models/molfeatures/api.py @@ -0,0 +1,19 @@ +from typing import Union + +from bofire.data_models.molfeatures.molfeatures import ( # BagOfCharacters + Fingerprints, + FingerprintsFragments, + Fragments, + MolFeatures, + MordredDescriptors, +) + +AbstractMolFeatures = MolFeatures + +AnyMolFeatures = Union[ + Fingerprints, + Fragments, + FingerprintsFragments, + # BagOfCharacters, + MordredDescriptors, +] diff --git 
a/bofire/data_models/molfeatures/molfeatures.py b/bofire/data_models/molfeatures/molfeatures.py new file mode 100644 index 000000000..8b6ec236c --- /dev/null +++ b/bofire/data_models/molfeatures/molfeatures.py @@ -0,0 +1,149 @@ +from typing import List, Literal, Optional + +import pandas as pd +from pydantic import Field, validator +from typing_extensions import Annotated + +import bofire.data_models.molfeatures.names as names +from bofire.data_models.base import BaseModel +from bofire.utils.cheminformatics import ( # smiles2bag_of_characters, + smiles2fingerprints, + smiles2fragments, + smiles2fragments_fingerprints, + smiles2mordred, +) + + +class MolFeatures(BaseModel): + """Base class for all molecular features""" + + type: str + + +class Fingerprints(MolFeatures): + type: Literal["Fingerprints"] = "Fingerprints" + bond_radius: int = 5 + n_bits: int = 2048 + + def get_descriptor_names(self) -> List[str]: + return [f"fingerprint_{i}" for i in range(self.n_bits)] + + def get_descriptor_values(self, values: pd.Series) -> pd.DataFrame: + return pd.DataFrame( + data=smiles2fingerprints( + values.to_list(), bond_radius=self.bond_radius, n_bits=self.n_bits + ).astype(float), + columns=self.get_descriptor_names(), + index=values.index, + ) + + +class Fragments(MolFeatures): + type: Literal["Fragments"] = "Fragments" + fragments: Optional[List[str]] = None + + @validator( + "fragments", + ) + def validate_fragments(cls, fragments): + """Validates that fragments are unique and known RDKit fragment names + + Args: + fragments (List[str]): List of fragment names + + Raises: + ValueError: when fragments are non-unique or unknown to RDKit + + Returns: + List[str]: List of the fragments + """ + if fragments is not None: + if len(fragments) != len(set(fragments)): + raise ValueError("Fragments must be unique") + + if not all(user_fragment in names.fragments for user_fragment in fragments): + raise ValueError( + "Not all provided fragments were found in the RDKit list" + ) + + return fragments + + def 
get_descriptor_names(self) -> List[str]: + return self.fragments if self.fragments is not None else names.fragments + + def get_descriptor_values(self, values: pd.Series) -> pd.DataFrame: + return pd.DataFrame( + data=smiles2fragments(values.to_list(), self.get_descriptor_names()), + columns=self.get_descriptor_names(), + index=values.index, + ) + + +class FingerprintsFragments(Fingerprints, Fragments): + type: Literal["FingerprintsFragments"] = "FingerprintsFragments" + + def get_descriptor_names(self) -> List[str]: + fingerprints_list = [f"fingerprint_{i}" for i in range(self.n_bits)] + fragments_list = ( + self.fragments if self.fragments is not None else names.fragments + ) + + fingerprints_fragment_list = fingerprints_list + fragments_list + + return fingerprints_fragment_list + + def get_descriptor_values(self, values: pd.Series) -> pd.DataFrame: + fragments_list = ( + self.fragments if self.fragments is not None else names.fragments + ) + + return pd.DataFrame( + data=smiles2fragments_fingerprints( + values.to_list(), + bond_radius=self.bond_radius, + n_bits=self.n_bits, + fragments_list=fragments_list, + ), + columns=self.get_descriptor_names(), + index=values.index, + ) + + +class MordredDescriptors(MolFeatures): + type: Literal["MordredDescriptors"] = "MordredDescriptors" + descriptors: Annotated[List[str], Field(min_items=1)] + + @validator( + "descriptors", + ) + def validate_descriptors(cls, descriptors): + """Validates that descriptors are unique and known Mordred descriptor names + + Args: + descriptors (List[str]): List of descriptor names + + Raises: + ValueError: when descriptors are non-unique or unknown to Mordred + + Returns: + List[str]: List of the descriptors + """ + if len(descriptors) != len(set(descriptors)): + raise ValueError("descriptors must be unique") + + if not all(desc in names.mordred for desc in descriptors): + raise ValueError( + "Not all provided descriptors were found in the Mordred list" + ) + + return descriptors + + def get_descriptor_names(self) -> List[str]: 
+ return self.descriptors + + def get_descriptor_values(self, values: pd.Series) -> pd.DataFrame: + return pd.DataFrame( + data=smiles2mordred(values.to_list(), self.descriptors), + columns=self.descriptors, + index=values.index, + ) diff --git a/bofire/data_models/molfeatures/names.py b/bofire/data_models/molfeatures/names.py new file mode 100644 index 000000000..bed800a35 --- /dev/null +++ b/bofire/data_models/molfeatures/names.py @@ -0,0 +1,1918 @@ +# names of the rdkit fragments +fragments = [ + "fr_Al_COO", + "fr_Al_OH", + "fr_Al_OH_noTert", + "fr_ArN", + "fr_Ar_COO", + "fr_Ar_N", + "fr_Ar_NH", + "fr_Ar_OH", + "fr_COO", + "fr_COO2", + "fr_C_O", + "fr_C_O_noCOO", + "fr_C_S", + "fr_HOCCN", + "fr_Imine", + "fr_NH0", + "fr_NH1", + "fr_NH2", + "fr_N_O", + "fr_Ndealkylation1", + "fr_Ndealkylation2", + "fr_Nhpyrrole", + "fr_SH", + "fr_aldehyde", + "fr_alkyl_carbamate", + "fr_alkyl_halide", + "fr_allylic_oxid", + "fr_amide", + "fr_amidine", + "fr_aniline", + "fr_aryl_methyl", + "fr_azide", + "fr_azo", + "fr_barbitur", + "fr_benzene", + "fr_benzodiazepine", + "fr_bicyclic", + "fr_diazo", + "fr_dihydropyridine", + "fr_epoxide", + "fr_ester", + "fr_ether", + "fr_furan", + "fr_guanido", + "fr_halogen", + "fr_hdrzine", + "fr_hdrzone", + "fr_imidazole", + "fr_imide", + "fr_isocyan", + "fr_isothiocyan", + "fr_ketone", + "fr_ketone_Topliss", + "fr_lactam", + "fr_lactone", + "fr_methoxy", + "fr_morpholine", + "fr_nitrile", + "fr_nitro", + "fr_nitro_arom", + "fr_nitro_arom_nonortho", + "fr_nitroso", + "fr_oxazole", + "fr_oxime", + "fr_para_hydroxylation", + "fr_phenol", + "fr_phenol_noOrthoHbond", + "fr_phos_acid", + "fr_phos_ester", + "fr_piperdine", + "fr_piperzine", + "fr_priamide", + "fr_prisulfonamd", + "fr_pyridine", + "fr_quatN", + "fr_sulfide", + "fr_sulfonamd", + "fr_sulfone", + "fr_term_acetylene", + "fr_tetrazole", + "fr_thiazole", + "fr_thiocyan", + "fr_thiophene", + "fr_unbrch_alkane", + "fr_urea", +] + +# names of the mordred descriptors +mordred = [ + "ABC", + 
"ABCGG", + "nAcid", + "nBase", + "SpAbs_A", + "SpMax_A", + "SpDiam_A", + "SpAD_A", + "SpMAD_A", + "LogEE_A", + "VE1_A", + "VE2_A", + "VE3_A", + "VR1_A", + "VR2_A", + "VR3_A", + "nAromAtom", + "nAromBond", + "nAtom", + "nHeavyAtom", + "nSpiro", + "nBridgehead", + "nHetero", + "nH", + "nB", + "nC", + "nN", + "nO", + "nS", + "nP", + "nF", + "nCl", + "nBr", + "nI", + "nX", + "ATS0dv", + "ATS1dv", + "ATS2dv", + "ATS3dv", + "ATS4dv", + "ATS5dv", + "ATS6dv", + "ATS7dv", + "ATS8dv", + "ATS0d", + "ATS1d", + "ATS2d", + "ATS3d", + "ATS4d", + "ATS5d", + "ATS6d", + "ATS7d", + "ATS8d", + "ATS0s", + "ATS1s", + "ATS2s", + "ATS3s", + "ATS4s", + "ATS5s", + "ATS6s", + "ATS7s", + "ATS8s", + "ATS0Z", + "ATS1Z", + "ATS2Z", + "ATS3Z", + "ATS4Z", + "ATS5Z", + "ATS6Z", + "ATS7Z", + "ATS8Z", + "ATS0m", + "ATS1m", + "ATS2m", + "ATS3m", + "ATS4m", + "ATS5m", + "ATS6m", + "ATS7m", + "ATS8m", + "ATS0v", + "ATS1v", + "ATS2v", + "ATS3v", + "ATS4v", + "ATS5v", + "ATS6v", + "ATS7v", + "ATS8v", + "ATS0se", + "ATS1se", + "ATS2se", + "ATS3se", + "ATS4se", + "ATS5se", + "ATS6se", + "ATS7se", + "ATS8se", + "ATS0pe", + "ATS1pe", + "ATS2pe", + "ATS3pe", + "ATS4pe", + "ATS5pe", + "ATS6pe", + "ATS7pe", + "ATS8pe", + "ATS0are", + "ATS1are", + "ATS2are", + "ATS3are", + "ATS4are", + "ATS5are", + "ATS6are", + "ATS7are", + "ATS8are", + "ATS0p", + "ATS1p", + "ATS2p", + "ATS3p", + "ATS4p", + "ATS5p", + "ATS6p", + "ATS7p", + "ATS8p", + "ATS0i", + "ATS1i", + "ATS2i", + "ATS3i", + "ATS4i", + "ATS5i", + "ATS6i", + "ATS7i", + "ATS8i", + "AATS0dv", + "AATS1dv", + "AATS2dv", + "AATS3dv", + "AATS4dv", + "AATS5dv", + "AATS6dv", + "AATS7dv", + "AATS8dv", + "AATS0d", + "AATS1d", + "AATS2d", + "AATS3d", + "AATS4d", + "AATS5d", + "AATS6d", + "AATS7d", + "AATS8d", + "AATS0s", + "AATS1s", + "AATS2s", + "AATS3s", + "AATS4s", + "AATS5s", + "AATS6s", + "AATS7s", + "AATS8s", + "AATS0Z", + "AATS1Z", + "AATS2Z", + "AATS3Z", + "AATS4Z", + "AATS5Z", + "AATS6Z", + "AATS7Z", + "AATS8Z", + "AATS0m", + "AATS1m", + "AATS2m", + "AATS3m", + 
"AATS4m", + "AATS5m", + "AATS6m", + "AATS7m", + "AATS8m", + "AATS0v", + "AATS1v", + "AATS2v", + "AATS3v", + "AATS4v", + "AATS5v", + "AATS6v", + "AATS7v", + "AATS8v", + "AATS0se", + "AATS1se", + "AATS2se", + "AATS3se", + "AATS4se", + "AATS5se", + "AATS6se", + "AATS7se", + "AATS8se", + "AATS0pe", + "AATS1pe", + "AATS2pe", + "AATS3pe", + "AATS4pe", + "AATS5pe", + "AATS6pe", + "AATS7pe", + "AATS8pe", + "AATS0are", + "AATS1are", + "AATS2are", + "AATS3are", + "AATS4are", + "AATS5are", + "AATS6are", + "AATS7are", + "AATS8are", + "AATS0p", + "AATS1p", + "AATS2p", + "AATS3p", + "AATS4p", + "AATS5p", + "AATS6p", + "AATS7p", + "AATS8p", + "AATS0i", + "AATS1i", + "AATS2i", + "AATS3i", + "AATS4i", + "AATS5i", + "AATS6i", + "AATS7i", + "AATS8i", + "ATSC0c", + "ATSC1c", + "ATSC2c", + "ATSC3c", + "ATSC4c", + "ATSC5c", + "ATSC6c", + "ATSC7c", + "ATSC8c", + "ATSC0dv", + "ATSC1dv", + "ATSC2dv", + "ATSC3dv", + "ATSC4dv", + "ATSC5dv", + "ATSC6dv", + "ATSC7dv", + "ATSC8dv", + "ATSC0d", + "ATSC1d", + "ATSC2d", + "ATSC3d", + "ATSC4d", + "ATSC5d", + "ATSC6d", + "ATSC7d", + "ATSC8d", + "ATSC0s", + "ATSC1s", + "ATSC2s", + "ATSC3s", + "ATSC4s", + "ATSC5s", + "ATSC6s", + "ATSC7s", + "ATSC8s", + "ATSC0Z", + "ATSC1Z", + "ATSC2Z", + "ATSC3Z", + "ATSC4Z", + "ATSC5Z", + "ATSC6Z", + "ATSC7Z", + "ATSC8Z", + "ATSC0m", + "ATSC1m", + "ATSC2m", + "ATSC3m", + "ATSC4m", + "ATSC5m", + "ATSC6m", + "ATSC7m", + "ATSC8m", + "ATSC0v", + "ATSC1v", + "ATSC2v", + "ATSC3v", + "ATSC4v", + "ATSC5v", + "ATSC6v", + "ATSC7v", + "ATSC8v", + "ATSC0se", + "ATSC1se", + "ATSC2se", + "ATSC3se", + "ATSC4se", + "ATSC5se", + "ATSC6se", + "ATSC7se", + "ATSC8se", + "ATSC0pe", + "ATSC1pe", + "ATSC2pe", + "ATSC3pe", + "ATSC4pe", + "ATSC5pe", + "ATSC6pe", + "ATSC7pe", + "ATSC8pe", + "ATSC0are", + "ATSC1are", + "ATSC2are", + "ATSC3are", + "ATSC4are", + "ATSC5are", + "ATSC6are", + "ATSC7are", + "ATSC8are", + "ATSC0p", + "ATSC1p", + "ATSC2p", + "ATSC3p", + "ATSC4p", + "ATSC5p", + "ATSC6p", + "ATSC7p", + "ATSC8p", + "ATSC0i", + "ATSC1i", 
+ "ATSC2i", + "ATSC3i", + "ATSC4i", + "ATSC5i", + "ATSC6i", + "ATSC7i", + "ATSC8i", + "AATSC0c", + "AATSC1c", + "AATSC2c", + "AATSC3c", + "AATSC4c", + "AATSC5c", + "AATSC6c", + "AATSC7c", + "AATSC8c", + "AATSC0dv", + "AATSC1dv", + "AATSC2dv", + "AATSC3dv", + "AATSC4dv", + "AATSC5dv", + "AATSC6dv", + "AATSC7dv", + "AATSC8dv", + "AATSC0d", + "AATSC1d", + "AATSC2d", + "AATSC3d", + "AATSC4d", + "AATSC5d", + "AATSC6d", + "AATSC7d", + "AATSC8d", + "AATSC0s", + "AATSC1s", + "AATSC2s", + "AATSC3s", + "AATSC4s", + "AATSC5s", + "AATSC6s", + "AATSC7s", + "AATSC8s", + "AATSC0Z", + "AATSC1Z", + "AATSC2Z", + "AATSC3Z", + "AATSC4Z", + "AATSC5Z", + "AATSC6Z", + "AATSC7Z", + "AATSC8Z", + "AATSC0m", + "AATSC1m", + "AATSC2m", + "AATSC3m", + "AATSC4m", + "AATSC5m", + "AATSC6m", + "AATSC7m", + "AATSC8m", + "AATSC0v", + "AATSC1v", + "AATSC2v", + "AATSC3v", + "AATSC4v", + "AATSC5v", + "AATSC6v", + "AATSC7v", + "AATSC8v", + "AATSC0se", + "AATSC1se", + "AATSC2se", + "AATSC3se", + "AATSC4se", + "AATSC5se", + "AATSC6se", + "AATSC7se", + "AATSC8se", + "AATSC0pe", + "AATSC1pe", + "AATSC2pe", + "AATSC3pe", + "AATSC4pe", + "AATSC5pe", + "AATSC6pe", + "AATSC7pe", + "AATSC8pe", + "AATSC0are", + "AATSC1are", + "AATSC2are", + "AATSC3are", + "AATSC4are", + "AATSC5are", + "AATSC6are", + "AATSC7are", + "AATSC8are", + "AATSC0p", + "AATSC1p", + "AATSC2p", + "AATSC3p", + "AATSC4p", + "AATSC5p", + "AATSC6p", + "AATSC7p", + "AATSC8p", + "AATSC0i", + "AATSC1i", + "AATSC2i", + "AATSC3i", + "AATSC4i", + "AATSC5i", + "AATSC6i", + "AATSC7i", + "AATSC8i", + "MATS1c", + "MATS2c", + "MATS3c", + "MATS4c", + "MATS5c", + "MATS6c", + "MATS7c", + "MATS8c", + "MATS1dv", + "MATS2dv", + "MATS3dv", + "MATS4dv", + "MATS5dv", + "MATS6dv", + "MATS7dv", + "MATS8dv", + "MATS1d", + "MATS2d", + "MATS3d", + "MATS4d", + "MATS5d", + "MATS6d", + "MATS7d", + "MATS8d", + "MATS1s", + "MATS2s", + "MATS3s", + "MATS4s", + "MATS5s", + "MATS6s", + "MATS7s", + "MATS8s", + "MATS1Z", + "MATS2Z", + "MATS3Z", + "MATS4Z", + "MATS5Z", + "MATS6Z", + 
"MATS7Z", + "MATS8Z", + "MATS1m", + "MATS2m", + "MATS3m", + "MATS4m", + "MATS5m", + "MATS6m", + "MATS7m", + "MATS8m", + "MATS1v", + "MATS2v", + "MATS3v", + "MATS4v", + "MATS5v", + "MATS6v", + "MATS7v", + "MATS8v", + "MATS1se", + "MATS2se", + "MATS3se", + "MATS4se", + "MATS5se", + "MATS6se", + "MATS7se", + "MATS8se", + "MATS1pe", + "MATS2pe", + "MATS3pe", + "MATS4pe", + "MATS5pe", + "MATS6pe", + "MATS7pe", + "MATS8pe", + "MATS1are", + "MATS2are", + "MATS3are", + "MATS4are", + "MATS5are", + "MATS6are", + "MATS7are", + "MATS8are", + "MATS1p", + "MATS2p", + "MATS3p", + "MATS4p", + "MATS5p", + "MATS6p", + "MATS7p", + "MATS8p", + "MATS1i", + "MATS2i", + "MATS3i", + "MATS4i", + "MATS5i", + "MATS6i", + "MATS7i", + "MATS8i", + "GATS1c", + "GATS2c", + "GATS3c", + "GATS4c", + "GATS5c", + "GATS6c", + "GATS7c", + "GATS8c", + "GATS1dv", + "GATS2dv", + "GATS3dv", + "GATS4dv", + "GATS5dv", + "GATS6dv", + "GATS7dv", + "GATS8dv", + "GATS1d", + "GATS2d", + "GATS3d", + "GATS4d", + "GATS5d", + "GATS6d", + "GATS7d", + "GATS8d", + "GATS1s", + "GATS2s", + "GATS3s", + "GATS4s", + "GATS5s", + "GATS6s", + "GATS7s", + "GATS8s", + "GATS1Z", + "GATS2Z", + "GATS3Z", + "GATS4Z", + "GATS5Z", + "GATS6Z", + "GATS7Z", + "GATS8Z", + "GATS1m", + "GATS2m", + "GATS3m", + "GATS4m", + "GATS5m", + "GATS6m", + "GATS7m", + "GATS8m", + "GATS1v", + "GATS2v", + "GATS3v", + "GATS4v", + "GATS5v", + "GATS6v", + "GATS7v", + "GATS8v", + "GATS1se", + "GATS2se", + "GATS3se", + "GATS4se", + "GATS5se", + "GATS6se", + "GATS7se", + "GATS8se", + "GATS1pe", + "GATS2pe", + "GATS3pe", + "GATS4pe", + "GATS5pe", + "GATS6pe", + "GATS7pe", + "GATS8pe", + "GATS1are", + "GATS2are", + "GATS3are", + "GATS4are", + "GATS5are", + "GATS6are", + "GATS7are", + "GATS8are", + "GATS1p", + "GATS2p", + "GATS3p", + "GATS4p", + "GATS5p", + "GATS6p", + "GATS7p", + "GATS8p", + "GATS1i", + "GATS2i", + "GATS3i", + "GATS4i", + "GATS5i", + "GATS6i", + "GATS7i", + "GATS8i", + "BCUTc-1h", + "BCUTc-1l", + "BCUTdv-1h", + "BCUTdv-1l", + "BCUTd-1h", + 
"BCUTd-1l", + "BCUTs-1h", + "BCUTs-1l", + "BCUTZ-1h", + "BCUTZ-1l", + "BCUTm-1h", + "BCUTm-1l", + "BCUTv-1h", + "BCUTv-1l", + "BCUTse-1h", + "BCUTse-1l", + "BCUTpe-1h", + "BCUTpe-1l", + "BCUTare-1h", + "BCUTare-1l", + "BCUTp-1h", + "BCUTp-1l", + "BCUTi-1h", + "BCUTi-1l", + "BalabanJ", + "SpAbs_DzZ", + "SpMax_DzZ", + "SpDiam_DzZ", + "SpAD_DzZ", + "SpMAD_DzZ", + "LogEE_DzZ", + "SM1_DzZ", + "VE1_DzZ", + "VE2_DzZ", + "VE3_DzZ", + "VR1_DzZ", + "VR2_DzZ", + "VR3_DzZ", + "SpAbs_Dzm", + "SpMax_Dzm", + "SpDiam_Dzm", + "SpAD_Dzm", + "SpMAD_Dzm", + "LogEE_Dzm", + "SM1_Dzm", + "VE1_Dzm", + "VE2_Dzm", + "VE3_Dzm", + "VR1_Dzm", + "VR2_Dzm", + "VR3_Dzm", + "SpAbs_Dzv", + "SpMax_Dzv", + "SpDiam_Dzv", + "SpAD_Dzv", + "SpMAD_Dzv", + "LogEE_Dzv", + "SM1_Dzv", + "VE1_Dzv", + "VE2_Dzv", + "VE3_Dzv", + "VR1_Dzv", + "VR2_Dzv", + "VR3_Dzv", + "SpAbs_Dzse", + "SpMax_Dzse", + "SpDiam_Dzse", + "SpAD_Dzse", + "SpMAD_Dzse", + "LogEE_Dzse", + "SM1_Dzse", + "VE1_Dzse", + "VE2_Dzse", + "VE3_Dzse", + "VR1_Dzse", + "VR2_Dzse", + "VR3_Dzse", + "SpAbs_Dzpe", + "SpMax_Dzpe", + "SpDiam_Dzpe", + "SpAD_Dzpe", + "SpMAD_Dzpe", + "LogEE_Dzpe", + "SM1_Dzpe", + "VE1_Dzpe", + "VE2_Dzpe", + "VE3_Dzpe", + "VR1_Dzpe", + "VR2_Dzpe", + "VR3_Dzpe", + "SpAbs_Dzare", + "SpMax_Dzare", + "SpDiam_Dzare", + "SpAD_Dzare", + "SpMAD_Dzare", + "LogEE_Dzare", + "SM1_Dzare", + "VE1_Dzare", + "VE2_Dzare", + "VE3_Dzare", + "VR1_Dzare", + "VR2_Dzare", + "VR3_Dzare", + "SpAbs_Dzp", + "SpMax_Dzp", + "SpDiam_Dzp", + "SpAD_Dzp", + "SpMAD_Dzp", + "LogEE_Dzp", + "SM1_Dzp", + "VE1_Dzp", + "VE2_Dzp", + "VE3_Dzp", + "VR1_Dzp", + "VR2_Dzp", + "VR3_Dzp", + "SpAbs_Dzi", + "SpMax_Dzi", + "SpDiam_Dzi", + "SpAD_Dzi", + "SpMAD_Dzi", + "LogEE_Dzi", + "SM1_Dzi", + "VE1_Dzi", + "VE2_Dzi", + "VE3_Dzi", + "VR1_Dzi", + "VR2_Dzi", + "VR3_Dzi", + "BertzCT", + "nBonds", + "nBondsO", + "nBondsS", + "nBondsD", + "nBondsT", + "nBondsA", + "nBondsM", + "nBondsKS", + "nBondsKD", + "PNSA1", + "PNSA2", + "PNSA3", + "PNSA4", + "PNSA5", + "PPSA1", + "PPSA2", + 
"PPSA3", + "PPSA4", + "PPSA5", + "DPSA1", + "DPSA2", + "DPSA3", + "DPSA4", + "DPSA5", + "FNSA1", + "FNSA2", + "FNSA3", + "FNSA4", + "FNSA5", + "FPSA1", + "FPSA2", + "FPSA3", + "FPSA4", + "FPSA5", + "WNSA1", + "WNSA2", + "WNSA3", + "WNSA4", + "WNSA5", + "WPSA1", + "WPSA2", + "WPSA3", + "WPSA4", + "WPSA5", + "RNCG", + "RPCG", + "RNCS", + "RPCS", + "TASA", + "TPSA", + "RASA", + "RPSA", + "C1SP1", + "C2SP1", + "C1SP2", + "C2SP2", + "C3SP2", + "C1SP3", + "C2SP3", + "C3SP3", + "C4SP3", + "HybRatio", + "FCSP3", + "Xch-3d", + "Xch-4d", + "Xch-5d", + "Xch-6d", + "Xch-7d", + "Xch-3dv", + "Xch-4dv", + "Xch-5dv", + "Xch-6dv", + "Xch-7dv", + "Xc-3d", + "Xc-4d", + "Xc-5d", + "Xc-6d", + "Xc-3dv", + "Xc-4dv", + "Xc-5dv", + "Xc-6dv", + "Xpc-4d", + "Xpc-5d", + "Xpc-6d", + "Xpc-4dv", + "Xpc-5dv", + "Xpc-6dv", + "Xp-0d", + "Xp-1d", + "Xp-2d", + "Xp-3d", + "Xp-4d", + "Xp-5d", + "Xp-6d", + "Xp-7d", + "AXp-0d", + "AXp-1d", + "AXp-2d", + "AXp-3d", + "AXp-4d", + "AXp-5d", + "AXp-6d", + "AXp-7d", + "Xp-0dv", + "Xp-1dv", + "Xp-2dv", + "Xp-3dv", + "Xp-4dv", + "Xp-5dv", + "Xp-6dv", + "Xp-7dv", + "AXp-0dv", + "AXp-1dv", + "AXp-2dv", + "AXp-3dv", + "AXp-4dv", + "AXp-5dv", + "AXp-6dv", + "AXp-7dv", + "SZ", + "Sm", + "Sv", + "Sse", + "Spe", + "Sare", + "Sp", + "Si", + "MZ", + "Mm", + "Mv", + "Mse", + "Mpe", + "Mare", + "Mp", + "Mi", + "SpAbs_Dt", + "SpMax_Dt", + "SpDiam_Dt", + "SpAD_Dt", + "SpMAD_Dt", + "LogEE_Dt", + "SM1_Dt", + "VE1_Dt", + "VE2_Dt", + "VE3_Dt", + "VR1_Dt", + "VR2_Dt", + "VR3_Dt", + "DetourIndex", + "SpAbs_D", + "SpMax_D", + "SpDiam_D", + "SpAD_D", + "SpMAD_D", + "LogEE_D", + "VE1_D", + "VE2_D", + "VE3_D", + "VR1_D", + "VR2_D", + "VR3_D", + "NsLi", + "NssBe", + "NssssBe", + "NssBH", + "NsssB", + "NssssB", + "NsCH3", + "NdCH2", + "NssCH2", + "NtCH", + "NdsCH", + "NaaCH", + "NsssCH", + "NddC", + "NtsC", + "NdssC", + "NaasC", + "NaaaC", + "NssssC", + "NsNH3", + "NsNH2", + "NssNH2", + "NdNH", + "NssNH", + "NaaNH", + "NtN", + "NsssNH", + "NdsN", + "NaaN", + "NsssN", + "NddsN", + 
"NaasN", + "NssssN", + "NsOH", + "NdO", + "NssO", + "NaaO", + "NsF", + "NsSiH3", + "NssSiH2", + "NsssSiH", + "NssssSi", + "NsPH2", + "NssPH", + "NsssP", + "NdsssP", + "NsssssP", + "NsSH", + "NdS", + "NssS", + "NaaS", + "NdssS", + "NddssS", + "NsCl", + "NsGeH3", + "NssGeH2", + "NsssGeH", + "NssssGe", + "NsAsH2", + "NssAsH", + "NsssAs", + "NsssdAs", + "NsssssAs", + "NsSeH", + "NdSe", + "NssSe", + "NaaSe", + "NdssSe", + "NddssSe", + "NsBr", + "NsSnH3", + "NssSnH2", + "NsssSnH", + "NssssSn", + "NsI", + "NsPbH3", + "NssPbH2", + "NsssPbH", + "NssssPb", + "SsLi", + "SssBe", + "SssssBe", + "SssBH", + "SsssB", + "SssssB", + "SsCH3", + "SdCH2", + "SssCH2", + "StCH", + "SdsCH", + "SaaCH", + "SsssCH", + "SddC", + "StsC", + "SdssC", + "SaasC", + "SaaaC", + "SssssC", + "SsNH3", + "SsNH2", + "SssNH2", + "SdNH", + "SssNH", + "SaaNH", + "StN", + "SsssNH", + "SdsN", + "SaaN", + "SsssN", + "SddsN", + "SaasN", + "SssssN", + "SsOH", + "SdO", + "SssO", + "SaaO", + "SsF", + "SsSiH3", + "SssSiH2", + "SsssSiH", + "SssssSi", + "SsPH2", + "SssPH", + "SsssP", + "SdsssP", + "SsssssP", + "SsSH", + "SdS", + "SssS", + "SaaS", + "SdssS", + "SddssS", + "SsCl", + "SsGeH3", + "SssGeH2", + "SsssGeH", + "SssssGe", + "SsAsH2", + "SssAsH", + "SsssAs", + "SsssdAs", + "SsssssAs", + "SsSeH", + "SdSe", + "SssSe", + "SaaSe", + "SdssSe", + "SddssSe", + "SsBr", + "SsSnH3", + "SssSnH2", + "SsssSnH", + "SssssSn", + "SsI", + "SsPbH3", + "SssPbH2", + "SsssPbH", + "SssssPb", + "MAXsLi", + "MAXssBe", + "MAXssssBe", + "MAXssBH", + "MAXsssB", + "MAXssssB", + "MAXsCH3", + "MAXdCH2", + "MAXssCH2", + "MAXtCH", + "MAXdsCH", + "MAXaaCH", + "MAXsssCH", + "MAXddC", + "MAXtsC", + "MAXdssC", + "MAXaasC", + "MAXaaaC", + "MAXssssC", + "MAXsNH3", + "MAXsNH2", + "MAXssNH2", + "MAXdNH", + "MAXssNH", + "MAXaaNH", + "MAXtN", + "MAXsssNH", + "MAXdsN", + "MAXaaN", + "MAXsssN", + "MAXddsN", + "MAXaasN", + "MAXssssN", + "MAXsOH", + "MAXdO", + "MAXssO", + "MAXaaO", + "MAXsF", + "MAXsSiH3", + "MAXssSiH2", + "MAXsssSiH", + "MAXssssSi", + 
"MAXsPH2", + "MAXssPH", + "MAXsssP", + "MAXdsssP", + "MAXsssssP", + "MAXsSH", + "MAXdS", + "MAXssS", + "MAXaaS", + "MAXdssS", + "MAXddssS", + "MAXsCl", + "MAXsGeH3", + "MAXssGeH2", + "MAXsssGeH", + "MAXssssGe", + "MAXsAsH2", + "MAXssAsH", + "MAXsssAs", + "MAXsssdAs", + "MAXsssssAs", + "MAXsSeH", + "MAXdSe", + "MAXssSe", + "MAXaaSe", + "MAXdssSe", + "MAXddssSe", + "MAXsBr", + "MAXsSnH3", + "MAXssSnH2", + "MAXsssSnH", + "MAXssssSn", + "MAXsI", + "MAXsPbH3", + "MAXssPbH2", + "MAXsssPbH", + "MAXssssPb", + "MINsLi", + "MINssBe", + "MINssssBe", + "MINssBH", + "MINsssB", + "MINssssB", + "MINsCH3", + "MINdCH2", + "MINssCH2", + "MINtCH", + "MINdsCH", + "MINaaCH", + "MINsssCH", + "MINddC", + "MINtsC", + "MINdssC", + "MINaasC", + "MINaaaC", + "MINssssC", + "MINsNH3", + "MINsNH2", + "MINssNH2", + "MINdNH", + "MINssNH", + "MINaaNH", + "MINtN", + "MINsssNH", + "MINdsN", + "MINaaN", + "MINsssN", + "MINddsN", + "MINaasN", + "MINssssN", + "MINsOH", + "MINdO", + "MINssO", + "MINaaO", + "MINsF", + "MINsSiH3", + "MINssSiH2", + "MINsssSiH", + "MINssssSi", + "MINsPH2", + "MINssPH", + "MINsssP", + "MINdsssP", + "MINsssssP", + "MINsSH", + "MINdS", + "MINssS", + "MINaaS", + "MINdssS", + "MINddssS", + "MINsCl", + "MINsGeH3", + "MINssGeH2", + "MINsssGeH", + "MINssssGe", + "MINsAsH2", + "MINssAsH", + "MINsssAs", + "MINsssdAs", + "MINsssssAs", + "MINsSeH", + "MINdSe", + "MINssSe", + "MINaaSe", + "MINdssSe", + "MINddssSe", + "MINsBr", + "MINsSnH3", + "MINssSnH2", + "MINsssSnH", + "MINssssSn", + "MINsI", + "MINsPbH3", + "MINssPbH2", + "MINsssPbH", + "MINssssPb", + "ECIndex", + "ETA_alpha", + "AETA_alpha", + "ETA_shape_p", + "ETA_shape_y", + "ETA_shape_x", + "ETA_beta", + "AETA_beta", + "ETA_beta_s", + "AETA_beta_s", + "ETA_beta_ns", + "AETA_beta_ns", + "ETA_beta_ns_d", + "AETA_beta_ns_d", + "ETA_eta", + "AETA_eta", + "ETA_eta_L", + "AETA_eta_L", + "ETA_eta_R", + "AETA_eta_R", + "ETA_eta_RL", + "AETA_eta_RL", + "ETA_eta_F", + "AETA_eta_F", + "ETA_eta_FL", + "AETA_eta_FL", + "ETA_eta_B", + 
"AETA_eta_B", + "ETA_eta_BR", + "AETA_eta_BR", + "ETA_dAlpha_A", + "ETA_dAlpha_B", + "ETA_epsilon_1", + "ETA_epsilon_2", + "ETA_epsilon_3", + "ETA_epsilon_4", + "ETA_epsilon_5", + "ETA_dEpsilon_A", + "ETA_dEpsilon_B", + "ETA_dEpsilon_C", + "ETA_dEpsilon_D", + "ETA_dBeta", + "AETA_dBeta", + "ETA_psi_1", + "ETA_dPsi_A", + "ETA_dPsi_B", + "fragCpx", + "fMF", + "GeomDiameter", + "GeomRadius", + "GeomShapeIndex", + "GeomPetitjeanIndex", + "GRAV", + "GRAVH", + "GRAVp", + "GRAVHp", + "nHBAcc", + "nHBDon", + "IC0", + "IC1", + "IC2", + "IC3", + "IC4", + "IC5", + "TIC0", + "TIC1", + "TIC2", + "TIC3", + "TIC4", + "TIC5", + "SIC0", + "SIC1", + "SIC2", + "SIC3", + "SIC4", + "SIC5", + "BIC0", + "BIC1", + "BIC2", + "BIC3", + "BIC4", + "BIC5", + "CIC0", + "CIC1", + "CIC2", + "CIC3", + "CIC4", + "CIC5", + "MIC0", + "MIC1", + "MIC2", + "MIC3", + "MIC4", + "MIC5", + "ZMIC0", + "ZMIC1", + "ZMIC2", + "ZMIC3", + "ZMIC4", + "ZMIC5", + "Kier1", + "Kier2", + "Kier3", + "Lipinski", + "GhoseFilter", + "FilterItLogS", + "VMcGowan", + "Mor01", + "Mor02", + "Mor03", + "Mor04", + "Mor05", + "Mor06", + "Mor07", + "Mor08", + "Mor09", + "Mor10", + "Mor11", + "Mor12", + "Mor13", + "Mor14", + "Mor15", + "Mor16", + "Mor17", + "Mor18", + "Mor19", + "Mor20", + "Mor21", + "Mor22", + "Mor23", + "Mor24", + "Mor25", + "Mor26", + "Mor27", + "Mor28", + "Mor29", + "Mor30", + "Mor31", + "Mor32", + "Mor01m", + "Mor02m", + "Mor03m", + "Mor04m", + "Mor05m", + "Mor06m", + "Mor07m", + "Mor08m", + "Mor09m", + "Mor10m", + "Mor11m", + "Mor12m", + "Mor13m", + "Mor14m", + "Mor15m", + "Mor16m", + "Mor17m", + "Mor18m", + "Mor19m", + "Mor20m", + "Mor21m", + "Mor22m", + "Mor23m", + "Mor24m", + "Mor25m", + "Mor26m", + "Mor27m", + "Mor28m", + "Mor29m", + "Mor30m", + "Mor31m", + "Mor32m", + "Mor01v", + "Mor02v", + "Mor03v", + "Mor04v", + "Mor05v", + "Mor06v", + "Mor07v", + "Mor08v", + "Mor09v", + "Mor10v", + "Mor11v", + "Mor12v", + "Mor13v", + "Mor14v", + "Mor15v", + "Mor16v", + "Mor17v", + "Mor18v", + "Mor19v", + "Mor20v", + 
"Mor21v", + "Mor22v", + "Mor23v", + "Mor24v", + "Mor25v", + "Mor26v", + "Mor27v", + "Mor28v", + "Mor29v", + "Mor30v", + "Mor31v", + "Mor32v", + "Mor01se", + "Mor02se", + "Mor03se", + "Mor04se", + "Mor05se", + "Mor06se", + "Mor07se", + "Mor08se", + "Mor09se", + "Mor10se", + "Mor11se", + "Mor12se", + "Mor13se", + "Mor14se", + "Mor15se", + "Mor16se", + "Mor17se", + "Mor18se", + "Mor19se", + "Mor20se", + "Mor21se", + "Mor22se", + "Mor23se", + "Mor24se", + "Mor25se", + "Mor26se", + "Mor27se", + "Mor28se", + "Mor29se", + "Mor30se", + "Mor31se", + "Mor32se", + "Mor01p", + "Mor02p", + "Mor03p", + "Mor04p", + "Mor05p", + "Mor06p", + "Mor07p", + "Mor08p", + "Mor09p", + "Mor10p", + "Mor11p", + "Mor12p", + "Mor13p", + "Mor14p", + "Mor15p", + "Mor16p", + "Mor17p", + "Mor18p", + "Mor19p", + "Mor20p", + "Mor21p", + "Mor22p", + "Mor23p", + "Mor24p", + "Mor25p", + "Mor26p", + "Mor27p", + "Mor28p", + "Mor29p", + "Mor30p", + "Mor31p", + "Mor32p", + "LabuteASA", + "PEOE_VSA1", + "PEOE_VSA2", + "PEOE_VSA3", + "PEOE_VSA4", + "PEOE_VSA5", + "PEOE_VSA6", + "PEOE_VSA7", + "PEOE_VSA8", + "PEOE_VSA9", + "PEOE_VSA10", + "PEOE_VSA11", + "PEOE_VSA12", + "PEOE_VSA13", + "SMR_VSA1", + "SMR_VSA2", + "SMR_VSA3", + "SMR_VSA4", + "SMR_VSA5", + "SMR_VSA6", + "SMR_VSA7", + "SMR_VSA8", + "SMR_VSA9", + "SlogP_VSA1", + "SlogP_VSA2", + "SlogP_VSA3", + "SlogP_VSA4", + "SlogP_VSA5", + "SlogP_VSA6", + "SlogP_VSA7", + "SlogP_VSA8", + "SlogP_VSA9", + "SlogP_VSA10", + "SlogP_VSA11", + "EState_VSA1", + "EState_VSA2", + "EState_VSA3", + "EState_VSA4", + "EState_VSA5", + "EState_VSA6", + "EState_VSA7", + "EState_VSA8", + "EState_VSA9", + "EState_VSA10", + "VSA_EState1", + "VSA_EState2", + "VSA_EState3", + "VSA_EState4", + "VSA_EState5", + "VSA_EState6", + "VSA_EState7", + "VSA_EState8", + "VSA_EState9", + "MDEC-11", + "MDEC-12", + "MDEC-13", + "MDEC-14", + "MDEC-22", + "MDEC-23", + "MDEC-24", + "MDEC-33", + "MDEC-34", + "MDEC-44", + "MDEO-11", + "MDEO-12", + "MDEO-22", + "MDEN-11", + "MDEN-12", + "MDEN-13", + 
"MDEN-22", + "MDEN-23", + "MDEN-33", + "MID", + "AMID", + "MID_h", + "AMID_h", + "MID_C", + "AMID_C", + "MID_N", + "AMID_N", + "MID_O", + "AMID_O", + "MID_X", + "AMID_X", + "MOMI-X", + "MOMI-Y", + "MOMI-Z", + "PBF", + "MPC2", + "MPC3", + "MPC4", + "MPC5", + "MPC6", + "MPC7", + "MPC8", + "MPC9", + "MPC10", + "TMPC10", + "piPC1", + "piPC2", + "piPC3", + "piPC4", + "piPC5", + "piPC6", + "piPC7", + "piPC8", + "piPC9", + "piPC10", + "TpiPC10", + "apol", + "bpol", + "nRing", + "n3Ring", + "n4Ring", + "n5Ring", + "n6Ring", + "n7Ring", + "n8Ring", + "n9Ring", + "n10Ring", + "n11Ring", + "n12Ring", + "nG12Ring", + "nHRing", + "n3HRing", + "n4HRing", + "n5HRing", + "n6HRing", + "n7HRing", + "n8HRing", + "n9HRing", + "n10HRing", + "n11HRing", + "n12HRing", + "nG12HRing", + "naRing", + "n3aRing", + "n4aRing", + "n5aRing", + "n6aRing", + "n7aRing", + "n8aRing", + "n9aRing", + "n10aRing", + "n11aRing", + "n12aRing", + "nG12aRing", + "naHRing", + "n3aHRing", + "n4aHRing", + "n5aHRing", + "n6aHRing", + "n7aHRing", + "n8aHRing", + "n9aHRing", + "n10aHRing", + "n11aHRing", + "n12aHRing", + "nG12aHRing", + "nARing", + "n3ARing", + "n4ARing", + "n5ARing", + "n6ARing", + "n7ARing", + "n8ARing", + "n9ARing", + "n10ARing", + "n11ARing", + "n12ARing", + "nG12ARing", + "nAHRing", + "n3AHRing", + "n4AHRing", + "n5AHRing", + "n6AHRing", + "n7AHRing", + "n8AHRing", + "n9AHRing", + "n10AHRing", + "n11AHRing", + "n12AHRing", + "nG12AHRing", + "nFRing", + "n4FRing", + "n5FRing", + "n6FRing", + "n7FRing", + "n8FRing", + "n9FRing", + "n10FRing", + "n11FRing", + "n12FRing", + "nG12FRing", + "nFHRing", + "n4FHRing", + "n5FHRing", + "n6FHRing", + "n7FHRing", + "n8FHRing", + "n9FHRing", + "n10FHRing", + "n11FHRing", + "n12FHRing", + "nG12FHRing", + "nFaRing", + "n4FaRing", + "n5FaRing", + "n6FaRing", + "n7FaRing", + "n8FaRing", + "n9FaRing", + "n10FaRing", + "n11FaRing", + "n12FaRing", + "nG12FaRing", + "nFaHRing", + "n4FaHRing", + "n5FaHRing", + "n6FaHRing", + "n7FaHRing", + "n8FaHRing", + 
"n9FaHRing", + "n10FaHRing", + "n11FaHRing", + "n12FaHRing", + "nG12FaHRing", + "nFARing", + "n4FARing", + "n5FARing", + "n6FARing", + "n7FARing", + "n8FARing", + "n9FARing", + "n10FARing", + "n11FARing", + "n12FARing", + "nG12FARing", + "nFAHRing", + "n4FAHRing", + "n5FAHRing", + "n6FAHRing", + "n7FAHRing", + "n8FAHRing", + "n9FAHRing", + "n10FAHRing", + "n11FAHRing", + "n12FAHRing", + "nG12FAHRing", + "nRot", + "RotRatio", + "SLogP", + "SMR", + "TopoPSA(NO)", + "TopoPSA", + "GGI1", + "GGI2", + "GGI3", + "GGI4", + "GGI5", + "GGI6", + "GGI7", + "GGI8", + "GGI9", + "GGI10", + "JGI1", + "JGI2", + "JGI3", + "JGI4", + "JGI5", + "JGI6", + "JGI7", + "JGI8", + "JGI9", + "JGI10", + "JGT10", + "Diameter", + "Radius", + "TopoShapeIndex", + "PetitjeanIndex", + "Vabc", + "VAdjMat", + "MWC01", + "MWC02", + "MWC03", + "MWC04", + "MWC05", + "MWC06", + "MWC07", + "MWC08", + "MWC09", + "MWC10", + "TMWC10", + "SRW02", + "SRW03", + "SRW04", + "SRW05", + "SRW06", + "SRW07", + "SRW08", + "SRW09", + "SRW10", + "TSRW10", + "MW", + "AMW", + "WPath", + "WPol", + "Zagreb1", + "Zagreb2", + "mZagreb1", + "mZagreb2", +] diff --git a/bofire/data_models/surrogates/api.py b/bofire/data_models/surrogates/api.py index e700bd600..e9dc0f16a 100644 --- a/bofire/data_models/surrogates/api.py +++ b/bofire/data_models/surrogates/api.py @@ -21,6 +21,7 @@ SingleTaskGPSurrogate, ) from bofire.data_models.surrogates.surrogate import Surrogate + from bofire.data_models.surrogates.tanimoto_gp import TanimotoGPSurrogate from bofire.data_models.surrogates.xgb import XGBoostSurrogate AbstractSurrogate = Union[Surrogate, BotorchSurrogate, EmpiricalSurrogate] @@ -34,6 +35,7 @@ SaasSingleTaskGPSurrogate, XGBoostSurrogate, LinearSurrogate, + TanimotoGPSurrogate, ] AnyTrainableSurrogate = Union[ @@ -44,6 +46,7 @@ SaasSingleTaskGPSurrogate, XGBoostSurrogate, LinearSurrogate, + TanimotoGPSurrogate, ] except ImportError: # with the minimal installationwe don't have botorch diff --git 
a/bofire/data_models/surrogates/botorch.py b/bofire/data_models/surrogates/botorch.py index db8a905cd..120e91456 100644 --- a/bofire/data_models/surrogates/botorch.py +++ b/bofire/data_models/surrogates/botorch.py @@ -4,8 +4,10 @@ from bofire.data_models.features.api import ( CategoricalDescriptorInput, CategoricalInput, + MolecularInput, NumericalInput, ) +from bofire.data_models.molfeatures.api import Fingerprints, MolFeatures from bofire.data_models.surrogates.surrogate import Surrogate @@ -15,6 +17,7 @@ def validate_input_preprocessing_specs(cls, v, values): inputs = values["inputs"] categorical_keys = inputs.get_keys(CategoricalInput, exact=True) descriptor_keys = inputs.get_keys(CategoricalDescriptorInput, exact=True) + molecular_keys = inputs.get_keys(MolecularInput, exact=True) for key in categorical_keys: if ( v.get(key, CategoricalEncodingEnum.ONE_HOT) @@ -41,4 +44,12 @@ def validate_input_preprocessing_specs(cls, v, values): raise ValueError( "Botorch based models have to use internal transforms to preprocess numerical features." 
) + # TODO: include descriptors into probabilistic reparam via OneHotToDescriptor input transform + for key in molecular_keys: + mol_encoding = v.get(key, Fingerprints()) + if not isinstance(mol_encoding, MolFeatures): + raise ValueError( + "Botorch based models have to use fingerprints, fragments, fingerprints_fragments, or molecular descriptors for molecular inputs" + ) + v[key] = mol_encoding return v diff --git a/bofire/data_models/surrogates/tanimoto_gp.py b/bofire/data_models/surrogates/tanimoto_gp.py new file mode 100644 index 000000000..1a1df1d89 --- /dev/null +++ b/bofire/data_models/surrogates/tanimoto_gp.py @@ -0,0 +1,29 @@ +from typing import Literal + +from pydantic import Field + +from bofire.data_models.kernels.api import AnyKernel, ScaleKernel +from bofire.data_models.kernels.molecular import TanimotoKernel +from bofire.data_models.priors.api import ( + BOTORCH_NOISE_PRIOR, + BOTORCH_SCALE_PRIOR, + AnyPrior, +) +from bofire.data_models.surrogates.botorch import BotorchSurrogate +from bofire.data_models.surrogates.scaler import ScalerEnum +from bofire.data_models.surrogates.trainable import TrainableSurrogate + + +class TanimotoGPSurrogate(BotorchSurrogate, TrainableSurrogate): + type: Literal["TanimotoGPSurrogate"] = "TanimotoGPSurrogate" + + kernel: AnyKernel = Field( + default_factory=lambda: ScaleKernel( + base_kernel=TanimotoKernel( + ard=True, + ), + outputscale_prior=BOTORCH_SCALE_PRIOR(), + ) + ) + noise_prior: AnyPrior = Field(default_factory=lambda: BOTORCH_NOISE_PRIOR()) + scaler: ScalerEnum = ScalerEnum.IDENTITY diff --git a/bofire/kernels/fingerprint_kernels/base_fingerprint_kernel.py b/bofire/kernels/fingerprint_kernels/base_fingerprint_kernel.py new file mode 100644 index 000000000..5cfee3f9e --- /dev/null +++ b/bofire/kernels/fingerprint_kernels/base_fingerprint_kernel.py @@ -0,0 +1,138 @@ +""" +Module for test_kernels that operate on fingerprint representations (bit vectors or count vectors). 
+Author: Ryan-Rhys Griffiths and Austin Tripp 2022 +""" +# This code was copied from GAUCHE: https://github.com/leojklarner/gauche/blob/main/gauche/kernels/fingerprint_kernels/base_fingerprint_kernel.py + +import torch +from gpytorch.kernels import Kernel + + +def default_postprocess_script(x): + return x + + +def batch_tanimoto_sim( + x1: torch.Tensor, x2: torch.Tensor, eps: float = 1e-6 +) -> torch.Tensor: + """ + Tanimoto between two batched tensors, across last 2 dimensions. + eps argument ensures numerical stability if all zero tensors are added. + """ + # Tanimoto distance is proportional to (<x, y>) / (||x||^2 + ||y||^2 - <x, y>) where x and y are bit vectors + assert x1.ndim >= 2 and x2.ndim >= 2 + dot_prod = torch.matmul(x1, torch.transpose(x2, -1, -2)) + # x1_sum = torch.sum(x1**2, dim=-1, keepdims=True) + # x2_sum = torch.sum(x2**2, dim=-1, keepdims=True) + x1_sum = torch.sum(x1**2, dim=-1).unsqueeze(-1) + x2_sum = torch.sum(x2**2, dim=-1).unsqueeze(-1) + return (dot_prod + eps) / ( + eps + x1_sum + torch.transpose(x2_sum, -1, -2) - dot_prod + ) + + +class BitDistance(torch.nn.Module): + """ + Distance module for bit vector kernels. + """ + + def __init__(self, postprocess_script=default_postprocess_script): + super().__init__() + self._postprocess = postprocess_script + + def _sim(self, x1, x2, postprocess, x1_eq_x2=False, metric="tanimoto"): + """ + Computes the similarity between x1 and x2 + + Args: + x1 (Tensor): First set of data where b is a batch dimension. Has shape `n x d` or `b x n x d` + x2 (Tensor): Second set of data where b is a batch dimension. Has shape `m x d` or `b x m x d` + postprocess (bool): Whether to apply a postprocess script + x1_eq_x2 (bool, optional): Is x1 equal to x2. Defaults to False + metric (str): String specifying the similarity metric. One of ['tanimoto']. Defaults to 'tanimoto' + + Raises: + RuntimeError: If tanimoto is not used as the similarity metric.
+ + Returns: + Tensor: corresponding to the similarity matrix between `x1` and `x2` + """ + # Branch for Tanimoto metric + if metric == "tanimoto": + res = batch_tanimoto_sim(x1, x2) + res.clamp_min_(0) # zero out negative values + return self._postprocess(res) if postprocess else res + else: + raise RuntimeError( + "Similarity metric not supported. Available options are 'tanimoto'" + ) + + +class BitKernel(Kernel): + """ + Base class for kernels that operate on bit or count vectors such as ECFP fingerprints or RDKit fragments. + In the typical use case, kernels inheriting from this class will specify a similarity metric such as Tanimoto, + MinMax etc. This kernel does not have an `outputscale` parameter. To add a scaling parameter, decorate this kernel + with a `gpytorch.kernels.ScaleKernel`. This base :class:`BitKernel` class does not include a lengthscale + parameter `Theta`, in contrast to many common kernel functions. + + Args: + metric (str): The similarity metric to use. One of ['tanimoto']. Defaults to '' + """ + + def __init__(self, metric="", **kwargs): + super().__init__(**kwargs) + self.metric = metric + + def forward(self, x1, x2, **params): + return self.covar_dist(x1, x2, **params) + + def covar_dist( + self, + x1, + x2, + last_dim_is_batch=False, + dist_postprocess_func=default_postprocess_script, + postprocess=True, + **params, + ): + """ + This is a helper method for computing the bit vector similarity between + all pairs of points in x1 and x2. + + Args: + x1 (Tensor): First set of data. Has shape `n x d` or `b1 x ... x bk x n x d` + x2 (Tensor): Second set of data. Has shape `m x d` or `b1 x ... x bk x m x d` + last_dim_is_batch (bool, optional): Is the last dimension of the data a batch dimension or not? + Defaults to False + postprocess (bool): Whether to apply a postprocess script + + Returns: + Tensor: corresponding to the distance matrix between `x1` and `x2`.
+ The shape depends on the kernel's mode + * `diag=False` + * `diag=False` and `last_dim_is_batch=True`: (`b x d x n x n`) + * `diag=True` + * `diag=True` and `last_dim_is_batch=True`: (`b x d x n`) + """ + if last_dim_is_batch: + x1 = x1.transpose(-1, -2).unsqueeze(-1) + x2 = x2.transpose(-1, -2).unsqueeze(-1) + + x1_eq_x2 = torch.equal(x1, x2) + + # torch scripts expect tensors + postprocess = torch.tensor(postprocess) + + res = None + + # Cache the Distance object or else JIT will recompile every time + if ( + not self.distance_module + or self.distance_module._postprocess != dist_postprocess_func + ): + self.distance_module = BitDistance(dist_postprocess_func) + + res = self.distance_module._sim(x1, x2, postprocess, x1_eq_x2, self.metric) + + return res diff --git a/bofire/kernels/fingerprint_kernels/tanimoto_kernel.py b/bofire/kernels/fingerprint_kernels/tanimoto_kernel.py new file mode 100644 index 000000000..bca339bd3 --- /dev/null +++ b/bofire/kernels/fingerprint_kernels/tanimoto_kernel.py @@ -0,0 +1,53 @@ +""" +Tanimoto Kernel. Operates on representations including bit vectors e.g. Morgan/ECFP6 fingerprints, and count vectors e.g. +RDKit fragment features. +""" +# This code was copied from GAUCHE: https://github.com/leojklarner/gauche/blob/main/gauche/kernels/fingerprint_kernels/tanimoto_kernel.py + +import torch + +from bofire.kernels.fingerprint_kernels.base_fingerprint_kernel import BitKernel + + +class TanimotoKernel(BitKernel): + r""" + Computes a covariance matrix based on the Tanimoto kernel between inputs `x1` and `x2`: + + Formula: + .. math:: + + \begin{equation*} + k_{\text{Tanimoto}}(\mathbf{x}, \mathbf{x'}) = \frac{\langle\mathbf{x}, + \mathbf{x'}\rangle}{\left\lVert\mathbf{x}\right\rVert^2 + \left\lVert\mathbf{x'}\right\rVert^2 - + \langle\mathbf{x}, \mathbf{x'}\rangle} + \end{equation*} + + This kernel does not have an `outputscale` parameter. To add a scaling parameter, + decorate this kernel with a `gpytorch.kernels.ScaleKernel`.
+ + Example: + >>> x = torch.randint(0, 2, (10, 5)) + >>> # Non-batch: Simple option + >>> covar_module = gpytorch.kernels.ScaleKernel(TanimotoKernel()) + >>> covar = covar_module(x) # Output: LazyTensor of size (10 x 10) + >>> + >>> batch_x = torch.randint(0, 2, (2, 10, 5)) + >>> # Batch: Simple option + >>> covar_module = gpytorch.kernels.ScaleKernel(TanimotoKernel()) + >>> covar = covar_module(batch_x) # Output: LazyTensor of size (2 x 10 x 10) + """ + is_stationary = False + has_lengthscale = False + + def __init__(self, **kwargs): + super(TanimotoKernel, self).__init__(**kwargs) + self.metric = "tanimoto" + + def forward(self, x1, x2, diag=False, **params): + if diag: + assert x1.size() == x2.size() and torch.equal(x1, x2) + return torch.ones( + *x1.shape[:-2], x1.shape[-2], dtype=x1.dtype, device=x1.device + ) + else: + return self.covar_dist(x1, x2, **params) diff --git a/bofire/kernels/mapper.py b/bofire/kernels/mapper.py index da8b458d0..33bd0c9c5 100644 --- a/bofire/kernels/mapper.py +++ b/bofire/kernels/mapper.py @@ -6,6 +6,7 @@ import bofire.data_models.kernels.api as data_models import bofire.priors.api as priors +from bofire.kernels.fingerprint_kernels.tanimoto_kernel import TanimotoKernel def map_RBFKernel( @@ -113,6 +114,19 @@ def map_ScaleKernel( ) +def map_TanimotoKernel( + data_model: data_models.TanimotoKernel, + batch_shape: torch.Size, + ard_num_dims: int, + active_dims: List[int], +) -> TanimotoKernel: + return TanimotoKernel( + batch_shape=batch_shape, + ard_num_dims=len(active_dims) if data_model.ard else None, + active_dims=active_dims, + ) + + KERNEL_MAP = { data_models.RBFKernel: map_RBFKernel, data_models.MaternKernel: map_MaternKernel, @@ -120,6 +134,7 @@ def map_ScaleKernel( data_models.AdditiveKernel: map_AdditiveKernel, data_models.MultiplicativeKernel: map_MultiplicativeKernel, data_models.ScaleKernel: map_ScaleKernel, + data_models.TanimotoKernel: map_TanimotoKernel, } diff --git a/bofire/surrogates/mapper.py 
b/bofire/surrogates/mapper.py index ad25842ba..5be8d4f3e 100644 --- a/bofire/surrogates/mapper.py +++ b/bofire/surrogates/mapper.py @@ -19,6 +19,7 @@ data_models.SaasSingleTaskGPSurrogate: SaasSingleTaskGPSurrogate, data_models.XGBoostSurrogate: XGBoostSurrogate, data_models.LinearSurrogate: SingleTaskGPSurrogate, + data_models.TanimotoGPSurrogate: SingleTaskGPSurrogate, } diff --git a/bofire/surrogates/single_task_gp.py b/bofire/surrogates/single_task_gp.py index 28cef4cae..ecf2d893e 100644 --- a/bofire/surrogates/single_task_gp.py +++ b/bofire/surrogates/single_task_gp.py @@ -12,6 +12,8 @@ import bofire.priors.api as priors from bofire.data_models.domain.api import Inputs from bofire.data_models.enum import CategoricalEncodingEnum, OutputFilteringEnum +from bofire.data_models.features.api import TInputTransformSpecs +from bofire.data_models.molfeatures.api import MolFeatures from bofire.data_models.surrogates.api import SingleTaskGPSurrogate as DataModel from bofire.data_models.surrogates.scaler import ScalerEnum from bofire.surrogates.botorch import BotorchSurrogate @@ -21,18 +23,18 @@ def get_scaler( inputs: Inputs, - input_preprocessing_specs: Dict[str, CategoricalEncodingEnum], + input_preprocessing_specs: TInputTransformSpecs, scaler: ScalerEnum, X: pd.DataFrame, -) -> Union[InputStandardize, Normalize]: +) -> Union[InputStandardize, Normalize, None]: """Returns the instanitated scaler object for a set of input features and input_preprocessing_specs. Args: inputs (Inputs): Input features. - input_preprocessing_specs (Dict[str, CategoricalEncodingEnum]): Dictionary how to treat - the categoricals. + input_preprocessing_specs (TInputTransformSpecs): Dictionary how to treat + the categoricals and/or molecules. scaler (ScalerEnum): Enum indicating the scaler of interest. X (pd.DataFrame): The dataset of interest. 
@@ -50,6 +52,7 @@ def get_scaler( key for key, value in input_preprocessing_specs.items() if value != CategoricalEncodingEnum.DESCRIPTOR + and not isinstance(value, MolFeatures) ] ord_dims = [] diff --git a/bofire/utils/cheminformatics.py b/bofire/utils/cheminformatics.py index da0711434..97d905365 100644 --- a/bofire/utils/cheminformatics.py +++ b/bofire/utils/cheminformatics.py @@ -1,16 +1,25 @@ import warnings -from typing import List +from typing import List, Optional import numpy as np +import pandas as pd try: from rdkit.Chem import AllChem, Descriptors, MolFromSmiles # type: ignore - from sklearn.feature_extraction.text import CountVectorizer + + # from sklearn.feature_extraction.text import CountVectorizer except ImportError: warnings.warn( "rdkit not installed, BoFire's cheminformatics utilities cannot be used." ) +try: + from mordred import Calculator, descriptors +except ImportError: + warnings.warn( + "mordred not installed. Mordred molecular descriptors cannot be used." + ) + # This code is based on GAUCHE: https://github.com/leojklarner/gauche/blob/main/gauche/data_featuriser/featurisation.py @@ -56,7 +65,9 @@ def smiles2fingerprints( return np.asarray(fps) -def smiles2fragments(smiles: List[str]) -> np.ndarray: +def smiles2fragments( + smiles: List[str], fragments_list: Optional[List[str]] = None +) -> np.ndarray: """Transforms smiles to an array of fragments. 
Args: @@ -68,27 +79,73 @@ def smiles2fragments(smiles: List[str]) -> np.ndarray: # descList[115:] contains fragment-based features only # (https://www.rdkit.org/docs/source/rdkit.Chem.Fragments.html) # Update: in the new RDKit version the indices are [124:] - fragments = {d[0]: d[1] for d in Descriptors.descList[124:]} + if fragments_list is None: + fragments = {d[0]: d[1] for d in Descriptors.descList[124:]} + else: + fragments = { + d[0]: d[1] for d in Descriptors.descList[124:] if d[0] in fragments_list + } + frags = np.zeros((len(smiles), len(fragments))) - for i in range(len(smiles)): - mol = smiles2mol(smiles[i]) + for i, smi in enumerate(smiles): + mol = smiles2mol(smi) features = [fragments[d](mol) for d in fragments] frags[i, :] = features return frags -def smiles2bag_of_characters(smiles: List[str], max_ngram: int = 5) -> np.ndarray: - """Transforms list of smiles to bag of characters. +# def smiles2bag_of_characters(smiles: List[str], max_ngram: int = 5) -> np.ndarray: +# """Transforms list of smiles to bag of characters. +# +# Args: +# smiles (List[str]): List of smiles +# max_ngram (int, optional): Maximal ngram value. Defaults to 5. +# +# Returns: +# np.ndarray: Array holding the bag of characters. +# """ +# for smi in smiles: +# smiles2mol(smi) +# cv = CountVectorizer(ngram_range=(1, max_ngram), analyzer="char", lowercase=False) +# return cv.fit_transform(smiles).toarray() + + +def smiles2mordred(smiles: List[str], descriptors_list: List[str]) -> np.ndarray: + """Transforms list of smiles to mordred molecular descriptors. Args: smiles (List[str]): List of smiles - max_ngram (int, optional): Maximal ngram value. Defaults to 5. + descriptors_list (List[str]): List of desired mordred descriptors Returns: - np.ndarray: Array holding the bag of characters. + np.ndarray: Array holding the mordred molecular descriptors.
""" - for smi in smiles: - smiles2mol(smi) - cv = CountVectorizer(ngram_range=(1, max_ngram), analyzer="char", lowercase=False) - return cv.fit_transform(smiles).toarray() + mols = [smiles2mol(smi) for smi in smiles] + + calc = Calculator(descriptors, ignore_3D=True) + calc.descriptors = [d for d in calc.descriptors if str(d) in descriptors_list] + + descriptors_df = calc.pandas(mols) + nan_list = [ + pd.to_numeric(descriptors_df[col], errors="coerce").isnull().values.any() + for col in descriptors_df.columns + ] + if any(nan_list): + raise ValueError( + f"Found NaN values in descriptors {list(descriptors_df.columns[nan_list])}" + ) + + return descriptors_df.astype(float).values + + +def smiles2fragments_fingerprints( + smiles: List[str], + bond_radius: int = 5, + n_bits: int = 2048, + fragments_list: Optional[List[str]] = None, +) -> np.ndarray: + fingerprints = smiles2fingerprints(smiles, bond_radius=bond_radius, n_bits=n_bits) + fragments = smiles2fragments(smiles, fragments_list=fragments_list) + + return np.hstack((fingerprints, fragments)) diff --git a/notebooks/fixed.ipynb b/notebooks/fixed.ipynb new file mode 100644 index 000000000..7de73843e --- /dev/null +++ b/notebooks/fixed.ipynb @@ -0,0 +1,355 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/miniforge/base/envs/bofire/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[1.0000, 0.8000, 0.5000]])\n", + "tensor(2.3000)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/j30607/sandbox/botorch/botorch/models/gpytorch.py:129: UserWarning: The model inputs are of type torch.float32. It is strongly recommended to use double precision in BoTorch, as this improves both precision and stability and can help avoid numerical errors. See https://github.com/pytorch/botorch/discussions/1444\n", + " warnings.warn(_get_single_precision_warning(X.dtype), UserWarning)\n" + ] + } + ], + "source": [ + "import torch\n", + "from botorch.models import SingleTaskGP\n", + "from botorch.fit import fit_gpytorch_mll\n", + "from botorch.utils import standardize\n", + "from gpytorch.mlls import ExactMarginalLogLikelihood\n", + "from botorch.utils.sampling import get_polytope_samples\n", + "\n", + "def constraint(X):\n", + " return torch.sum(X[...,:2], dim=-1) -1.80\n", + "\n", + "train_X = torch.rand(10, 3)\n", + "Y = 1 - torch.norm(train_X[:,:2] - 0.5, dim=-1, keepdim=True)\n", + "Y = Y + 0.1 * torch.randn_like(Y) # add some noise\n", + "train_Y = standardize(Y)\n", + "\n", + "gp = SingleTaskGP(train_X, train_Y)\n", + "mll = ExactMarginalLogLikelihood(gp.likelihood, gp)\n", + "fit_gpytorch_mll(mll)\n", + "\n", + "from botorch.acquisition import UpperConfidenceBound\n", + "\n", + "UCB = UpperConfidenceBound(gp, beta=0.1)\n", + "\n", + "from botorch.optim import optimize_acqf\n", + "\n", + "\n", + "bounds = torch.stack([torch.zeros(3), torch.ones(3)])\n", + "\n", + "batch_initial_conditions = get_polytope_samples(\n", + " n=5,\n", + " bounds=bounds,\n", + " inequality_constraints=[(torch.tensor([0,1]),torch.tensor([1.,1.]),1.8)],\n", + " n_burnin=1000\n", + ").unsqueeze(-2)\n", + "\n", + "#batch_initial_conditions[...,-1] = 
0.51\n", + "\n", + "\n", + "candidate, acq_value = optimize_acqf(\n", + " UCB, \n", + " bounds=bounds, \n", + " q=1, \n", + " num_restarts=5, \n", + " raw_samples=20,\n", + " fixed_features={2:0.5}, \n", + " nonlinear_inequality_constraints = [constraint],\n", + " batch_initial_conditions = batch_initial_conditions\n", + ")\n", + "print(candidate) # tensor([0.4887, 0.5063])\n", + "print(candidate.sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor(1.7622)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#batch_initial_conditions[...,-1] = 0.51\n", + "\n", + "batch_initial_conditions" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.04289999999999994" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "0.2409+0.3020-0.5" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[0.2409, 0.3020, 0.8284],\n", + " [0.9964, 0.2340, 0.2588]],\n", + "\n", + " [[0.8168, 0.1836, 0.4096],\n", + " [0.9400, 0.1572, 0.7149]],\n", + "\n", + " [[0.8562, 0.6386, 0.2433],\n", + " [0.0539, 0.6275, 0.2009]],\n", + "\n", + " [[0.3394, 0.0418, 0.9832],\n", + " [0.0866, 0.8408, 0.6403]],\n", + "\n", + " [[0.5344, 0.4981, 0.0286],\n", + " [0.3925, 0.0441, 0.2758]],\n", + "\n", + " [[0.4395, 0.7974, 0.1505],\n", + " [0.0244, 0.4612, 0.5825]],\n", + "\n", + " [[0.3437, 0.0486, 0.8325],\n", + " [0.1474, 0.1788, 0.0014]],\n", + "\n", + " [[0.9382, 0.4493, 0.9168],\n", + " [0.3761, 0.5967, 0.8065]]])" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + 
"data": { + "text/plain": [ + "tensor([[ 0.0429, 0.7304],\n", + " [ 0.5004, 0.5972],\n", + " [ 0.9947, 0.1813],\n", + " [-0.1188, 0.4273],\n", + " [ 0.5325, -0.0634],\n", + " [ 0.7369, -0.0144],\n", + " [-0.1077, -0.1738],\n", + " [ 0.8875, 0.4728]])" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.sum(test[...,:2], dim=-1)-0.5" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from bofire.data_models.surrogates.api import TanimotoGPSurrogate, AnySurrogate\n", + "from bofire.data_models.features.api import MolecularInput, ContinuousOutput\n", + "from bofire.data_models.domain.api import Inputs, Outputs\n", + "from bofire.data_models.molfeatures.api import Fragments\n", + "\n", + "import pandas as pd\n", + "\n", + "inputs = Inputs(features=[MolecularInput(key=\"x_1\")])\n", + "outputs = Outputs(features=[ContinuousOutput(key=\"y\")])\n", + "experiments = [\n", + " [\"CC(=O)Oc1ccccc1C(=O)O\", 88.0],\n", + " [\"c1ccccc1\", 35.0],\n", + " [\"[CH3][CH2][OH]\", 69.0],\n", + " [\"N[C@](C)(F)C(=O)O\", 20.0],\n", + "]\n", + "experiments = pd.DataFrame(experiments, columns=[\"x_1\", \"y\"])\n", + "\n", + "surrogate_data = TanimotoGPSurrogate(\n", + " inputs=inputs,\n", + " outputs=outputs,\n", + " input_preprocessing_specs={\"x_1\": Fragments()},\n", + " )\n", + "\n", + "\n", + "# \"kernel, specs\",\n", + "# [\n", + "# (\n", + "# ScaleKernel(base_kernel=TanimotoKernel(ard=False)),\n", + "# {\"x_1\": Fingerprints(n_bits=32)},\n", + "# ),\n", + "# (\n", + "# ScaleKernel(base_kernel=TanimotoKernel(ard=True)),\n", + "# {\"x_1\": Fragments()},\n", + "# ),\n", + "# (\n", + "# ScaleKernel(base_kernel=TanimotoKernel(ard=False)),\n", + "# {\"x_1\": FingerprintsFragments(n_bits=32)},\n", + "# ),\n", + "# (\n", + "# ScaleKernel(base_kernel=RBFKernel(ard=True)),\n", + "# {\"x_1\": MordredDescriptors(descriptors=[\"NssCH2\", \"ATSC2d\"])},\n", + "# ),\n", + 
"# ],\n", + "# )\n", + "# def test_TanimotoGP(kernel, specs):\n", + "# inputs = Inputs(features=[MolecularInput(key=\"x_1\")])\n", + "# outputs = Outputs(features=[ContinuousOutput(key=\"y\")])\n", + "# experiments = [\n", + "# [\"CC(=O)Oc1ccccc1C(=O)O\", 88.0],\n", + "# [\"c1ccccc1\", 35.0],\n", + "# [\"[CH3][CH2][OH]\", 69.0],\n", + "# [\"N[C@](C)(F)C(=O)O\", 20.0],\n", + "# ]\n", + "# experiments = pd.DataFrame(experiments, columns=[\"x_1\", \"y\"])\n", + "# experiments[\"valid_y\"] = 1\n", + "# model = TanimotoGPSurrogate(\n", + "# inputs=inputs,\n", + "# outputs=outputs,\n", + "# kernel=kernel,\n", + "# input_preprocessing_specs=specs,\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'x_1': Fingerprints(type='Fingerprints', bond_radius=5, n_bits=2048)}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "surrogate_data.input_preprocessing_specs" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "dump = surrogate_data.dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from pydantic import parse_obj_as" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'x_1': Fragments(type='Fragments', fragments=None)}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dump = surrogate_data.dict()\n", + "parse_obj_as(AnySurrogate,dump).input_preprocessing_specs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bofire", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
"version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/setup.py b/setup.py index 259608950..c532b88e2 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ sklearn_dependency, ], "xgb": ["xgboost>=1.7.5"], - "cheminfo": ["rdkit", sklearn_dependency], + "cheminfo": ["rdkit>=2023.3.2", sklearn_dependency, "mordred"], "tests": [ "mock", "mopti", diff --git a/tests/bofire/conftest.py b/tests/bofire/conftest.py index 09700638f..c05d04ed8 100644 --- a/tests/bofire/conftest.py +++ b/tests/bofire/conftest.py @@ -54,5 +54,9 @@ def condition_spec(request) -> specs.Spec: def outlier_detection_spec(request) -> specs.Spec: return request.param + @fixture(params=specs.molfeatures.valids) + def molfeatures_spec(request) -> specs.Spec: + return request.param + except AttributeError: pass diff --git a/tests/bofire/data_models/serialization/test_deserialization.py b/tests/bofire/data_models/serialization/test_deserialization.py index 2091eab8f..b68056663 100644 --- a/tests/bofire/data_models/serialization/test_deserialization.py +++ b/tests/bofire/data_models/serialization/test_deserialization.py @@ -6,6 +6,7 @@ AnyConstraint, AnyFeature, AnyKernel, + AnyMolFeatures, AnyObjective, AnyOutlierDetection, AnyPrior, @@ -85,3 +86,9 @@ def test_outlier_detection_should_be_deserializable(outlier_detection_spec: Spec obj = outlier_detection_spec.obj() deserialized = parse_obj_as(AnyOutlierDetection, obj.dict()) assert obj == deserialized + + +def test_molfeatures_should_be_deserializable(molfeatures_spec: Spec): + obj = molfeatures_spec.obj() + deserialized = parse_obj_as(AnyMolFeatures, obj.dict()) + assert obj == deserialized diff --git a/tests/bofire/data_models/serialization/test_serialization.py b/tests/bofire/data_models/serialization/test_serialization.py index 
9a0847eb6..48bd73403 100644 --- a/tests/bofire/data_models/serialization/test_serialization.py +++ b/tests/bofire/data_models/serialization/test_serialization.py @@ -70,3 +70,9 @@ def test_outlier_detection_should_be_serializable(outlier_detection_spec: Spec): spec = outlier_detection_spec.typed_spec() obj = outlier_detection_spec.cls(**spec) assert obj.dict() == spec + + +def test_molfeatures_should_be_serializable(molfeatures_spec: Spec): + spec = molfeatures_spec.typed_spec() + obj = molfeatures_spec.cls(**spec) + assert obj.dict() == spec diff --git a/tests/bofire/data_models/specs/api.py b/tests/bofire/data_models/specs/api.py index 5201d3afc..021b0a558 100644 --- a/tests/bofire/data_models/specs/api.py +++ b/tests/bofire/data_models/specs/api.py @@ -11,6 +11,9 @@ # in case of the minimal installation these import are not available from tests.bofire.data_models.specs.conditions import specs as conditions from tests.bofire.data_models.specs.kernels import specs as kernels + from tests.bofire.data_models.specs.molfeatures import ( + specs as molfeatures, + ) from tests.bofire.data_models.specs.outlier_detection import ( specs as outlier_detection, ) diff --git a/tests/bofire/data_models/specs/features.py b/tests/bofire/data_models/specs/features.py index ec7e430af..f8806b84b 100644 --- a/tests/bofire/data_models/specs/features.py +++ b/tests/bofire/data_models/specs/features.py @@ -1,3 +1,4 @@ +# import importlib import random import uuid @@ -6,6 +7,8 @@ from tests.bofire.data_models.specs.objectives import specs as objectives from tests.bofire.data_models.specs.specs import Specs +# RDKIT_AVAILABLE = importlib.util.find_spec("rdkit") is not None + specs = Specs([]) specs.add_valid( @@ -68,12 +71,6 @@ ], }, ) -specs.add_valid( - features.MolecularInput, - lambda: { - "key": str(uuid.uuid4()), - }, -) specs.add_valid( features.ContinuousOutput, lambda: { @@ -91,3 +88,9 @@ "objective": [0.0, 1.0, 0.0], }, ) +specs.add_valid( + features.MolecularInput, + lambda: { 
+ "key": str(uuid.uuid4()), + }, +) diff --git a/tests/bofire/data_models/specs/kernels.py b/tests/bofire/data_models/specs/kernels.py index a3e6e3605..74fef6cce 100644 --- a/tests/bofire/data_models/specs/kernels.py +++ b/tests/bofire/data_models/specs/kernels.py @@ -56,3 +56,9 @@ ] }, ) +specs.add_valid( + kernels.TanimotoKernel, + lambda: { + "ard": True, + }, +) diff --git a/tests/bofire/data_models/specs/molfeatures.py b/tests/bofire/data_models/specs/molfeatures.py new file mode 100644 index 000000000..d87863a19 --- /dev/null +++ b/tests/bofire/data_models/specs/molfeatures.py @@ -0,0 +1,68 @@ +import importlib +import random +import warnings + +import bofire.data_models.molfeatures.api as molfeatures +from tests.bofire.data_models.specs.specs import Specs + +try: + from rdkit.Chem import Descriptors + + fragments_list = [fragment[0] for fragment in Descriptors.descList[124:]] +except ImportError: + warnings.warn( + "rdkit not installed, BoFire's cheminformatics utilities cannot be used." + ) + +try: + from mordred import Calculator, descriptors + + calc = Calculator(descriptors, ignore_3D=False) + mordred_descriptors = [str(d) for d in calc.descriptors] +except ImportError: + warnings.warn( + "mordred not installed. Mordred molecular descriptors cannot be used." 
+ ) + +RDKIT_AVAILABLE = importlib.util.find_spec("rdkit") is not None +MORDRED_AVAILABLE = importlib.util.find_spec("mordred") is not None + +specs = Specs([]) + +specs.add_valid( + molfeatures.Fingerprints, + lambda: { + "bond_radius": random.randrange(1, 6), + "n_bits": random.randrange(32, 2048), + }, +) + +if RDKIT_AVAILABLE: + specs.add_valid( + molfeatures.Fragments, + lambda: { + "fragments": random.sample( + fragments_list, k=random.randrange(1, len(fragments_list)) + ) + }, + ) + specs.add_valid( + molfeatures.FingerprintsFragments, + lambda: { + "bond_radius": random.randrange(1, 6), + "n_bits": random.randrange(32, 2048), + "fragments": random.sample( + fragments_list, k=random.randrange(1, len(fragments_list)) + ), + }, + ) + + if MORDRED_AVAILABLE: + specs.add_valid( + molfeatures.MordredDescriptors, + lambda: { + "descriptors": random.sample( + mordred_descriptors, k=random.randrange(1, 10) + ) + }, + ) diff --git a/tests/bofire/data_models/specs/surrogates.py b/tests/bofire/data_models/specs/surrogates.py index db5207d08..d757ffac4 100644 --- a/tests/bofire/data_models/specs/surrogates.py +++ b/tests/bofire/data_models/specs/surrogates.py @@ -7,12 +7,15 @@ CategoricalInput, ContinuousInput, ContinuousOutput, + MolecularInput, ) from bofire.data_models.kernels.api import ( HammondDistanceKernel, MaternKernel, ScaleKernel, + TanimotoKernel, ) +from bofire.data_models.molfeatures.api import Fingerprints from bofire.data_models.priors.api import ( BOTORCH_LENGTHCALE_PRIOR, BOTORCH_NOISE_PRIOR, @@ -199,3 +202,29 @@ "hyperconfig": None, }, ) +specs.add_valid( + models.TanimotoGPSurrogate, + lambda: { + "inputs": Inputs( + features=[ + MolecularInput(key="mol1"), + ] + ), + "outputs": Outputs( + features=[ + features.valid(ContinuousOutput).obj(), + ] + ), + "kernel": ScaleKernel( + base_kernel=TanimotoKernel( + ard=True, + ), + outputscale_prior=BOTORCH_SCALE_PRIOR(), + ), + "scaler": ScalerEnum.NORMALIZE, + "noise_prior": BOTORCH_NOISE_PRIOR(), + 
"input_preprocessing_specs": {"mol1": Fingerprints(n_bits=32, bond_radius=3)}, + "dump": None, + "hyperconfig": None, + }, +) diff --git a/tests/bofire/data_models/test_features.py b/tests/bofire/data_models/test_features.py index 71007ddb5..2739c4ea3 100644 --- a/tests/bofire/data_models/test_features.py +++ b/tests/bofire/data_models/test_features.py @@ -1,3 +1,4 @@ +import importlib import random import numpy as np @@ -21,11 +22,19 @@ MolecularInput, Output, ) +from bofire.data_models.molfeatures.api import ( + Fingerprints, + FingerprintsFragments, + Fragments, + MordredDescriptors, +) from bofire.data_models.objectives.api import MinimizeObjective, Objective from bofire.data_models.surrogates.api import ScalerEnum objective = MinimizeObjective(w=1) +RDKIT_AVAILABLE = importlib.util.find_spec("rdkit") is not None + @pytest.mark.parametrize( "spec, n", @@ -1344,6 +1353,10 @@ def test_inputs_sample(features: Inputs, num_samples, method): ({"x1": CategoricalEncodingEnum.ONE_HOT}), ({"x2": ScalerEnum.NORMALIZE}), ({"x2": CategoricalEncodingEnum.DESCRIPTOR}), + ({"x1": Fingerprints()}), + ({"x2": Fragments()}), + ({"x3": FingerprintsFragments()}), + ({"x3": MordredDescriptors(descriptors=["NssCH2", "ATSC2d"])}), ], ) def test_inputs_validate_transform_specs_invalid(specs): @@ -1393,74 +1406,170 @@ def test_inputs_validate_transform_valid(specs): inps._validate_transform_specs(specs) +@pytest.mark.parametrize( + "specs", + [ + ({"x2": CategoricalEncodingEnum.ONE_HOT}), + ({"x3": CategoricalEncodingEnum.DESCRIPTOR}), + ({"x4": CategoricalEncodingEnum.ONE_HOT}), + ({"x4": ScalerEnum.NORMALIZE}), + ({"x4": CategoricalEncodingEnum.DESCRIPTOR}), + ( + { + "x2": CategoricalEncodingEnum.ONE_HOT, + "x3": CategoricalEncodingEnum.DESCRIPTOR, + } + ), + ], +) +# Invalid when no specs do not contain transform information for x4, or when the transform is not a MolFeatures type +def test_inputs_validate_transform_specs_molecular_input_invalid(specs): + inps = Inputs( + features=[ 
+ ContinuousInput(key="x1", bounds=(0, 1)), + CategoricalInput(key="x2", categories=["apple", "banana"]), + CategoricalDescriptorInput( + key="x3", + categories=["apple", "banana"], + descriptors=["d1", "d2"], + values=[[1, 2], [3, 4]], + ), + MolecularInput(key="x4"), + ] + ) + with pytest.raises(ValueError): + inps._validate_transform_specs(specs) + + +@pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") +@pytest.mark.parametrize( + "specs", + [ + ({"x4": Fingerprints()}), + ({"x4": Fragments()}), + ({"x4": FingerprintsFragments()}), + ({"x4": MordredDescriptors(descriptors=["NssCH2", "ATSC2d"])}), + ( + { + "x2": CategoricalEncodingEnum.ONE_HOT, + "x4": Fingerprints(), + } + ), + ( + { + "x3": CategoricalEncodingEnum.DESCRIPTOR, + "x4": Fingerprints(), + } + ), + ( + { + "x2": CategoricalEncodingEnum.ONE_HOT, + "x3": CategoricalEncodingEnum.DESCRIPTOR, + "x4": Fingerprints(), + } + ), + ], +) +def test_inputs_validate_transform_specs_molecular_input_valid(specs): + inps = Inputs( + features=[ + ContinuousInput(key="x1", bounds=(0, 1)), + CategoricalInput(key="x2", categories=["apple", "banana"]), + CategoricalDescriptorInput( + key="x3", + categories=["apple", "banana"], + descriptors=["d1", "d2"], + values=[[1, 2], [3, 4]], + ), + MolecularInput(key="x4"), + ] + ) + inps._validate_transform_specs(specs) + + +@pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") @pytest.mark.parametrize( "specs, expected_features2idx, expected_features2names", [ ( - {"x2": CategoricalEncodingEnum.ONE_HOT}, - {"x1": (0,), "x2": (2, 3, 4), "x3": (1,)}, + {"x2": CategoricalEncodingEnum.ONE_HOT, "x4": Fingerprints(n_bits=2048)}, + { + "x1": (2048,), + "x2": (2050, 2051, 2052), + "x3": (2049,), + "x4": tuple(range(2048)), + }, { "x1": ("x1",), "x2": ("x2_apple", "x2_banana", "x2_orange"), "x3": ("x3",), + "x4": tuple(f"x4_fingerprint_{i}" for i in range(2048)), }, ), ( - {"x2": CategoricalEncodingEnum.DUMMY}, - {"x1": (0,), "x2": (2, 3), "x3": (1,)}, - {"x1": 
("x1",), "x2": ("x2_banana", "x2_orange"), "x3": ("x3",)}, - ), - ( - {"x2": CategoricalEncodingEnum.ORDINAL}, - {"x1": (0,), "x2": (2,), "x3": (1,)}, - {"x1": ("x1",), "x2": ("x2",), "x3": ("x3",)}, - ), - ( - {"x3": CategoricalEncodingEnum.ONE_HOT}, - {"x1": (0,), "x2": (5,), "x3": (1, 2, 3, 4)}, + { + "x2": CategoricalEncodingEnum.DUMMY, + "x4": Fragments(fragments=["fr_unbrch_alkane", "fr_thiocyan"]), + }, + {"x1": (2,), "x2": (4, 5), "x3": (3,), "x4": (0, 1)}, { "x1": ("x1",), - "x2": ("x2",), - "x3": ("x3_apple", "x3_banana", "x3_orange", "x3_cherry"), + "x2": ("x2_banana", "x2_orange"), + "x3": ("x3",), + "x4": ("x4_fr_unbrch_alkane", "x4_fr_thiocyan"), }, ), ( - {"x3": CategoricalEncodingEnum.DESCRIPTOR}, - {"x1": (0,), "x2": (3,), "x3": (1, 2)}, + { + "x2": CategoricalEncodingEnum.ORDINAL, + "x4": FingerprintsFragments( + n_bits=2048, fragments=["fr_unbrch_alkane", "fr_thiocyan"] + ), + }, + { + "x1": (2050,), + "x2": (2052,), + "x3": (2051,), + "x4": tuple(range(2048 + 2)), + }, { "x1": ("x1",), "x2": ("x2",), - "x3": ( - "x3_d1", - "x3_d2", + "x3": ("x3",), + "x4": tuple( + [f"x4_fingerprint_{i}" for i in range(2048)] + + ["x4_fr_unbrch_alkane", "x4_fr_thiocyan"] ), }, ), ( { - "x2": CategoricalEncodingEnum.ONE_HOT, - "x3": CategoricalEncodingEnum.DESCRIPTOR, + "x3": CategoricalEncodingEnum.ONE_HOT, + "x4": MordredDescriptors(descriptors=["NssCH2", "ATSC2d"]), }, - {"x1": (0,), "x2": (3, 4, 5), "x3": (1, 2)}, + {"x1": (2,), "x2": (7,), "x3": (3, 4, 5, 6), "x4": (0, 1)}, { "x1": ("x1",), - "x2": ("x2_apple", "x2_banana", "x2_orange"), - "x3": ( - "x3_d1", - "x3_d2", - ), + "x2": ("x2",), + "x3": ("x3_apple", "x3_banana", "x3_orange", "x3_cherry"), + "x4": ("x4_NssCH2", "x4_ATSC2d"), }, ), ( { "x2": CategoricalEncodingEnum.ONE_HOT, - "x3": CategoricalEncodingEnum.ONE_HOT, + "x3": CategoricalEncodingEnum.DESCRIPTOR, + "x4": MordredDescriptors(descriptors=["NssCH2", "ATSC2d"]), }, - {"x1": (0,), "x2": (5, 6, 7), "x3": (1, 2, 3, 4)}, + {"x1": (2,), "x2": (5, 
6, 7), "x3": (3, 4), "x4": (0, 1)}, { "x1": ("x1",), "x2": ("x2_apple", "x2_banana", "x2_orange"), - "x3": ("x3_apple", "x3_banana", "x3_orange", "x3_cherry"), + "x3": ( + "x3_d1", + "x3_d2", + ), + "x4": ("x4_NssCH2", "x4_ATSC2d"), }, ), ], @@ -1478,6 +1587,7 @@ def test_inputs_get_transform_info( descriptors=["d1", "d2"], values=[[1, 2], [3, 4], [5, 6], [7, 8]], ), + MolecularInput(key="x4"), ] ) features2idx, features2names = inps._get_transform_info(specs) @@ -1533,6 +1643,163 @@ def test_inputs_transform(specs): assert_frame_equal(samples, untransformed) +@pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") +@pytest.mark.parametrize( + "specs, expected", + [ + ( + {"x2": CategoricalEncodingEnum.ONE_HOT, "x4": Fingerprints(n_bits=32)}, + { + "x4_fingerprint_0": {0: 1.0, 1: 1.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_1": {0: 1.0, 1: 0.0, 2: 1.0, 3: 1.0}, + "x4_fingerprint_2": {0: 1.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "x4_fingerprint_3": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_4": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_5": {0: 1.0, 1: 1.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_6": {0: 0.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "x4_fingerprint_7": {0: 1.0, 1: 0.0, 2: 1.0, 3: 1.0}, + "x4_fingerprint_8": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_9": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_10": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_11": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_12": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_13": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_14": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_15": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_16": {0: 1.0, 1: 1.0, 2: 1.0, 3: 0.0}, + "x4_fingerprint_17": {0: 1.0, 1: 1.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_18": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_19": {0: 0.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_20": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_21": {0: 0.0, 1: 0.0, 2: 
0.0, 3: 0.0}, + "x4_fingerprint_22": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_23": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_24": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_25": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_26": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_27": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_28": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_29": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_30": {0: 0.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "x4_fingerprint_31": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x1": {0: 0.1, 1: 0.3, 2: 0.5, 3: 1.0}, + "x3": {0: "banana", 1: "orange", 2: "apple", 3: "cherry"}, + "x2_apple": {0: 1.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "x2_banana": {0: 0.0, 1: 1.0, 2: 0.0, 3: 0.0}, + "x2_orange": {0: 0.0, 1: 0.0, 2: 0.0, 3: 1.0}, + }, + ), + ( + { + "x2": CategoricalEncodingEnum.DUMMY, + "x4": Fragments(fragments=["fr_unbrch_alkane", "fr_thiocyan"]), + }, + { + "x4_fr_unbrch_alkane": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fr_thiocyan": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x1": {0: 0.1, 1: 0.3, 2: 0.5, 3: 1.0}, + "x3": {0: "banana", 1: "orange", 2: "apple", 3: "cherry"}, + "x2_banana": {0: 0.0, 1: 1.0, 2: 0.0, 3: 0.0}, + "x2_orange": {0: 0.0, 1: 0.0, 2: 0.0, 3: 1.0}, + }, + ), + ( + { + "x2": CategoricalEncodingEnum.ORDINAL, + "x4": FingerprintsFragments( + n_bits=32, fragments=["fr_unbrch_alkane", "fr_thiocyan"] + ), + }, + { + "x4_fingerprint_0": {0: 1.0, 1: 1.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_1": {0: 1.0, 1: 0.0, 2: 1.0, 3: 1.0}, + "x4_fingerprint_2": {0: 1.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "x4_fingerprint_3": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_4": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_5": {0: 1.0, 1: 1.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_6": {0: 0.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "x4_fingerprint_7": {0: 1.0, 1: 0.0, 2: 1.0, 3: 1.0}, + "x4_fingerprint_8": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_9": {0: 1.0, 1: 0.0, 2: 0.0, 3: 
0.0}, + "x4_fingerprint_10": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_11": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_12": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_13": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_14": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_15": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_16": {0: 1.0, 1: 1.0, 2: 1.0, 3: 0.0}, + "x4_fingerprint_17": {0: 1.0, 1: 1.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_18": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_19": {0: 0.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_20": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_21": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_22": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_23": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_24": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_25": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_26": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_27": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_28": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fingerprint_29": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "x4_fingerprint_30": {0: 0.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "x4_fingerprint_31": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fr_unbrch_alkane": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x4_fr_thiocyan": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "x1": {0: 0.1, 1: 0.3, 2: 0.5, 3: 1.0}, + "x3": {0: "banana", 1: "orange", 2: "apple", 3: "cherry"}, + "x2": {0: 0, 1: 1, 2: 0, 3: 2}, + }, + ), + ( + { + "x2": CategoricalEncodingEnum.ONE_HOT, + "x3": CategoricalEncodingEnum.DESCRIPTOR, + "x4": MordredDescriptors(descriptors=["NssCH2", "ATSC2d"]), + }, + { + "x4_NssCH2": { + 0: 0.5963718820861676, + 1: -1.5, + 2: -0.28395061728395066, + 3: -8.34319526627219, + }, + "x4_ATSC2d": {0: 0.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "x1": {0: 0.1, 1: 0.3, 2: 0.5, 3: 1.0}, + "x3_d1": {0: 3.0, 1: 5.0, 2: 1.0, 3: 7.0}, + "x3_d2": {0: 4.0, 1: 6.0, 2: 2.0, 3: 8.0}, + "x2_apple": {0: 1.0, 1: 
0.0, 2: 1.0, 3: 0.0}, + "x2_banana": {0: 0.0, 1: 1.0, 2: 0.0, 3: 0.0}, + "x2_orange": {0: 0.0, 1: 0.0, 2: 0.0, 3: 1.0}, + }, + ), + ], +) +def test_inputs_transform_molecular(specs, expected): + experiments = [ + [0.1, "apple", "banana", "CC(=O)Oc1ccccc1C(=O)O", 88.0], + [0.3, "banana", "orange", "c1ccccc1", 35.0], + [0.5, "apple", "apple", "[CH3][CH2][OH]", 69.0], + [1.0, "orange", "cherry", "N[C@](C)(F)C(=O)O", 20.0], + ] + experiments = pd.DataFrame(experiments, columns=["x1", "x2", "x3", "x4", "y"]) + experiments["valid_y"] = 1 + inps = Inputs( + features=[ + ContinuousInput(key="x1", bounds=(0, 1)), + CategoricalInput(key="x2", categories=["apple", "banana", "orange"]), + CategoricalDescriptorInput( + key="x3", + categories=["apple", "banana", "orange", "cherry"], + descriptors=["d1", "d2"], + values=[[1, 2], [3, 4], [5, 6], [7, 8]], + ), + MolecularInput(key="x4"), + ] + ) + transformed = inps.transform(experiments=experiments, specs=specs) + assert_frame_equal(transformed, pd.DataFrame.from_dict(expected)) + + if1 = specs.features.valid(ContinuousInput).obj(key="if1") if2 = specs.features.valid(ContinuousInput).obj(key="if2", bounds=(3, 3)) if3 = specs.features.valid(CategoricalInput).obj( diff --git a/tests/bofire/data_models/test_kernels.py b/tests/bofire/data_models/test_kernels.py index 3d23ea253..1f7936e76 100644 --- a/tests/bofire/data_models/test_kernels.py +++ b/tests/bofire/data_models/test_kernels.py @@ -6,6 +6,7 @@ import torch from pydantic import parse_obj_as +import bofire import bofire.kernels.api as kernels from bofire.data_models.kernels.api import ( AdditiveKernel, @@ -15,6 +16,7 @@ MultiplicativeKernel, RBFKernel, ScaleKernel, + TanimotoKernel, ) from bofire.data_models.priors.api import BOTORCH_SCALE_PRIOR, GammaPrior @@ -34,6 +36,7 @@ def get_invalids(valid: dict) -> List[dict]: ScaleKernel: gpytorch.kernels.ScaleKernel, AdditiveKernel: gpytorch.kernels.AdditiveKernel, MultiplicativeKernel: gpytorch.kernels.ProductKernel, + 
TanimotoKernel: bofire.kernels.fingerprint_kernels.tanimoto_kernel.TanimotoKernel, } VALID_RBF_SPEC = { @@ -56,6 +59,11 @@ def get_invalids(valid: dict) -> List[dict]: "kernels": [RBFKernel(), RBFKernel()], } +VALID_TANIMOTO_SPEC = { + "type": "TanimotoKernel", + "ard": True, +} + KERNEL_SPECS = { RBFKernel: { "valids": [ @@ -99,6 +107,12 @@ def get_invalids(valid: dict) -> List[dict]: *get_invalids(VALID_SCALE_SPEC), ], }, + TanimotoKernel: { + "valids": [VALID_TANIMOTO_SPEC], + "invalids": [ + *get_invalids(VALID_TANIMOTO_SPEC), + ], + }, } @@ -207,3 +221,36 @@ def test_continuous_kernel(kernel, ard_num_dims, active_dims, expected_kernel): if isinstance(kernel, gpytorch.kernels.MaternKernel): assert kernel.nu == k.nu + + +@pytest.mark.parametrize( + "kernel, ard_num_dims, active_dims, expected_kernel", + [ + ( + TanimotoKernel(ard=False), + 10, + list(range(5)), + bofire.kernels.fingerprint_kernels.tanimoto_kernel.TanimotoKernel, + ), + ( + TanimotoKernel(ard=True), + 10, + list(range(5)), + bofire.kernels.fingerprint_kernels.tanimoto_kernel.TanimotoKernel, + ), + ], +) +def test_molecular_kernel(kernel, ard_num_dims, active_dims, expected_kernel): + k = kernels.map( + kernel, + batch_shape=torch.Size(), + ard_num_dims=ard_num_dims, + active_dims=active_dims, + ) + assert isinstance(k, expected_kernel) + + if kernel.ard is False: + assert k.ard_num_dims is None + else: + assert k.ard_num_dims == len(active_dims) + assert torch.eq(k.active_dims, torch.tensor(active_dims, dtype=torch.int64)).all() diff --git a/tests/bofire/data_models/test_molecular.py b/tests/bofire/data_models/test_molecular.py index d0394c8cf..60609a697 100644 --- a/tests/bofire/data_models/test_molecular.py +++ b/tests/bofire/data_models/test_molecular.py @@ -1,29 +1,42 @@ import importlib +import warnings +import numpy as np import pandas as pd import pytest -from pandas.testing import assert_series_equal +from pandas.testing import assert_frame_equal, assert_series_equal from 
bofire.data_models.features.molecular import MolecularInput +from bofire.data_models.molfeatures.api import ( + Fingerprints, + FingerprintsFragments, + Fragments, + MordredDescriptors, +) + +try: + from rdkit.Chem import Descriptors +except ImportError: + warnings.warn( + "rdkit not installed, BoFire's cheminformatics utilities cannot be used." + ) RDKIT_AVAILABLE = importlib.util.find_spec("rdkit") is not None -VALID_SMILES = pd.Series( - [ - "CC(=O)Oc1ccccc1C(=O)O", - "c1ccccc1", - "[CH3][CH2][OH]", - "C-C-O", - "OCC", - "N[C@](C)(F)C(=O)O", - ] -) +smiles = [ + "CC(=O)Oc1ccccc1C(=O)O", + "c1ccccc1", + "[CH3][CH2][OH]", + "N[C@](C)(F)C(=O)O", +] +VALID_SMILES = pd.Series(smiles) +VALID_SMILES.name = "molecule" INVALID_SMILES = pd.Series(["CC(=O)Oc1ccccc1C(=O)O", "c1ccccc1", "abcd"]) @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") def test_molecular_input_validate_experimental(): - m = MolecularInput(key="molecules") + m = MolecularInput(key="molecule") vals = m.validate_experimental(VALID_SMILES) assert_series_equal(vals, VALID_SMILES) with pytest.raises(ValueError): @@ -32,7 +45,7 @@ def test_molecular_input_validate_experimental(): @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") def test_molecular_input_validate_candidental(): - m = MolecularInput(key="molecules") + m = MolecularInput(key="molecule") vals = m.validate_candidental(VALID_SMILES) assert_series_equal(vals, VALID_SMILES) with pytest.raises(ValueError): @@ -41,33 +54,490 @@ def test_molecular_input_validate_candidental(): @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") def test_molecular_input_fixed(): - m = MolecularInput(key="molecules") + m = MolecularInput(key="molecule") assert m.fixed_value() is None assert m.is_fixed() is False @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") -def test_to_fingerprints(): - m = MolecularInput(key="molecules") - data = m.to_fingerprints(VALID_SMILES) - assert data.shape[0] == 6 
+@pytest.mark.parametrize( + "molfeatures, expected", + [ + (Fingerprints(), [f"fingerprint_{i}" for i in range(2048)]), + (Fingerprints(n_bits=32), [f"fingerprint_{i}" for i in range(32)]), + ( + Fragments(), + [rdkit_fragment[0] for rdkit_fragment in Descriptors.descList[124:]], + ), + ( + Fragments(fragments=["fr_unbrch_alkane", "fr_thiocyan"]), + ["fr_unbrch_alkane", "fr_thiocyan"], + ), + ( + FingerprintsFragments(), + [f"fingerprint_{i}" for i in range(2048)] + + [rdkit_fragment[0] for rdkit_fragment in Descriptors.descList[124:]], + ), + ( + FingerprintsFragments( + n_bits=32, fragments=["fr_unbrch_alkane", "fr_thiocyan"] + ), + [f"fingerprint_{i}" for i in range(32)] + + ["fr_unbrch_alkane", "fr_thiocyan"], + ), + (MordredDescriptors(descriptors=["NssCH2", "ATSC2d"]), ["NssCH2", "ATSC2d"]), + ], +) +def test_molfeatures_get_descriptor_names(molfeatures, expected): + assert molfeatures.get_descriptor_names() == expected + + +@pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") +@pytest.mark.parametrize( + "expected, transform_type", + [ + ( + ( + [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ], + [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + 1.0, + 0.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + 0.0, + 1.0, + 1.0, + 0.0, + 1.0, + 0.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + ], + ), + Fingerprints(n_bits=32), + ), + ( + ( + [0.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 2.0, 1.0], + ), + Fragments(fragments=["fr_COO", "fr_COO2", "fr_C_O", "fr_C_O_noCOO"]), + ), + ( + ( + [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ], + [ + 
1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + 1.0, + 0.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + 0.0, + 1.0, + 1.0, + 0.0, + 1.0, + 0.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + 1.0, + 1.0, + 2.0, + 1.0, + ], + ), + FingerprintsFragments( + n_bits=32, fragments=["fr_COO", "fr_COO2", "fr_C_O", "fr_C_O_noCOO"] + ), + ), + ( + ([-8.34319526627219, 0.0], [0.5963718820861676, 1.0]), + MordredDescriptors(descriptors=["NssCH2", "ATSC2d"]), + ), + ], +) +def test_molecular_descriptor_feature_get_bounds(expected, transform_type): + input_feature = MolecularInput(key="molecule") + lower, upper = input_feature.get_bounds( + transform_type=transform_type, + values=VALID_SMILES, + ) + assert np.allclose(lower, expected[0]) + assert np.allclose(upper, expected[1]) + + +@pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") +@pytest.mark.parametrize( + "transform_type, values", + [ + ( + Fingerprints(n_bits=32), + { + "molecule_fingerprint_0": {0: 1.0, 1: 1.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_1": {0: 1.0, 1: 0.0, 2: 1.0, 3: 1.0}, + "molecule_fingerprint_2": {0: 1.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "molecule_fingerprint_3": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_4": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_5": {0: 1.0, 1: 1.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_6": {0: 0.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "molecule_fingerprint_7": {0: 1.0, 1: 0.0, 2: 1.0, 3: 1.0}, + "molecule_fingerprint_8": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_9": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_10": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_11": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_12": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_13": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_14": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_15": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_16": 
{0: 1.0, 1: 1.0, 2: 1.0, 3: 0.0}, + "molecule_fingerprint_17": {0: 1.0, 1: 1.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_18": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_19": {0: 0.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_20": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_21": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_22": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_23": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_24": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_25": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_26": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_27": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_28": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_29": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_30": {0: 0.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "molecule_fingerprint_31": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + }, + ), + ( + Fragments(fragments=["fr_unbrch_alkane", "fr_thiocyan"]), + { + "molecule_fr_unbrch_alkane": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fr_thiocyan": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + }, + ), + ( + FingerprintsFragments( + n_bits=32, fragments=["fr_unbrch_alkane", "fr_thiocyan"] + ), + { + "molecule_fingerprint_0": {0: 1.0, 1: 1.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_1": {0: 1.0, 1: 0.0, 2: 1.0, 3: 1.0}, + "molecule_fingerprint_2": {0: 1.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "molecule_fingerprint_3": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_4": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_5": {0: 1.0, 1: 1.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_6": {0: 0.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "molecule_fingerprint_7": {0: 1.0, 1: 0.0, 2: 1.0, 3: 1.0}, + "molecule_fingerprint_8": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_9": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_10": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + 
"molecule_fingerprint_11": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_12": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_13": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_14": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_15": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_16": {0: 1.0, 1: 1.0, 2: 1.0, 3: 0.0}, + "molecule_fingerprint_17": {0: 1.0, 1: 1.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_18": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_19": {0: 0.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_20": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_21": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_22": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_23": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_24": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_25": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_26": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_27": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_28": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fingerprint_29": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "molecule_fingerprint_30": {0: 0.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "molecule_fingerprint_31": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fr_unbrch_alkane": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "molecule_fr_thiocyan": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + }, + ), + ( + MordredDescriptors(descriptors=["NssCH2", "ATSC2d"]), + { + "molecule_NssCH2": { + 0: 0.5963718820861676, + 1: -1.5, + 2: -0.28395061728395066, + 3: -8.34319526627219, + }, + "molecule_ATSC2d": {0: 0.0, 1: 0.0, 2: 1.0, 3: 0.0}, + }, + ), + ], +) +def test_molecular_input_to_descriptor_encoding(transform_type, values): + input_feature = MolecularInput(key="molecule") + + encoded = input_feature.to_descriptor_encoding(transform_type, VALID_SMILES) + assert len(encoded.columns) == len(transform_type.get_descriptor_names()) + assert 
len(encoded) == len(smiles) + assert_frame_equal(encoded, pd.DataFrame.from_dict(values)) + + +@pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") +def test_molfeatures_type_get_descriptor_values_fingerprints(): + values = { + "fingerprint_0": {0: 1.0, 1: 1.0, 2: 0.0, 3: 0.0}, + "fingerprint_1": {0: 1.0, 1: 0.0, 2: 1.0, 3: 1.0}, + "fingerprint_2": {0: 1.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "fingerprint_3": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "fingerprint_4": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_5": {0: 1.0, 1: 1.0, 2: 0.0, 3: 1.0}, + "fingerprint_6": {0: 0.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "fingerprint_7": {0: 1.0, 1: 0.0, 2: 1.0, 3: 1.0}, + "fingerprint_8": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "fingerprint_9": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_10": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "fingerprint_11": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_12": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_13": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "fingerprint_14": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_15": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_16": {0: 1.0, 1: 1.0, 2: 1.0, 3: 0.0}, + "fingerprint_17": {0: 1.0, 1: 1.0, 2: 0.0, 3: 0.0}, + "fingerprint_18": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "fingerprint_19": {0: 0.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "fingerprint_20": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_21": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_22": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_23": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "fingerprint_24": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_25": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "fingerprint_26": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_27": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_28": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_29": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "fingerprint_30": {0: 0.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "fingerprint_31": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + } + + molfeature = 
Fingerprints(n_bits=32) + generated = molfeature.get_descriptor_values(VALID_SMILES) + assert_frame_equal(generated, pd.DataFrame.from_dict(values)) + + +@pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") +def test_molfeatures_type_get_descriptor_values_fragments(): + values = { + "fr_unbrch_alkane": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fr_thiocyan": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + } + + molfeature = Fragments(fragments=["fr_unbrch_alkane", "fr_thiocyan"]) + generated = molfeature.get_descriptor_values(VALID_SMILES) + assert_frame_equal(generated, pd.DataFrame.from_dict(values)) + + +@pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") +@pytest.mark.parametrize( + "fragment_list", + [ + (["fr_unbrch_alkane','fr_unbrch_alkane', 'fr_thiocyan"]), + (["frag','fr_unbrch_alkane', 'fr_thiocyan"]), + ], +) +def test_molfeatures_type_fragments_invalid(fragment_list): with pytest.raises(ValueError): - m.to_fingerprints(INVALID_SMILES) + FingerprintsFragments(fragments=fragment_list) @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") -def test_to_bag_of_characters(): - m = MolecularInput(key="molecules") - data = m.to_bag_of_characters(VALID_SMILES) - assert data.shape[0] == 6 +def test_molfeatures_type_get_descriptor_values_fingerprintsfragments(): + values = { + "fingerprint_0": {0: 1.0, 1: 1.0, 2: 0.0, 3: 0.0}, + "fingerprint_1": {0: 1.0, 1: 0.0, 2: 1.0, 3: 1.0}, + "fingerprint_2": {0: 1.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "fingerprint_3": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "fingerprint_4": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_5": {0: 1.0, 1: 1.0, 2: 0.0, 3: 1.0}, + "fingerprint_6": {0: 0.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "fingerprint_7": {0: 1.0, 1: 0.0, 2: 1.0, 3: 1.0}, + "fingerprint_8": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "fingerprint_9": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_10": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "fingerprint_11": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_12": {0: 0.0, 1: 0.0, 2: 
0.0, 3: 0.0}, + "fingerprint_13": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "fingerprint_14": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_15": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_16": {0: 1.0, 1: 1.0, 2: 1.0, 3: 0.0}, + "fingerprint_17": {0: 1.0, 1: 1.0, 2: 0.0, 3: 0.0}, + "fingerprint_18": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "fingerprint_19": {0: 0.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "fingerprint_20": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_21": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_22": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_23": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "fingerprint_24": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_25": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "fingerprint_26": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_27": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_28": {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fingerprint_29": {0: 1.0, 1: 0.0, 2: 0.0, 3: 1.0}, + "fingerprint_30": {0: 0.0, 1: 0.0, 2: 1.0, 3: 0.0}, + "fingerprint_31": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fr_unbrch_alkane": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + "fr_thiocyan": {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}, + } + + molfeature = FingerprintsFragments( + n_bits=32, fragments=["fr_unbrch_alkane", "fr_thiocyan"] + ) + generated = molfeature.get_descriptor_values(VALID_SMILES) + assert_frame_equal(generated, pd.DataFrame.from_dict(values)) + + +@pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") +@pytest.mark.parametrize( + "fragment_list", + [ + (["fr_unbrch_alkane','fr_unbrch_alkane', 'fr_thiocyan"]), + (["frag','fr_unbrch_alkane', 'fr_thiocyan"]), + ], +) +def test_molfeatures_type_fingerprintsfragments_invalid(fragment_list): with pytest.raises(ValueError): - m.to_bag_of_characters(INVALID_SMILES) + FingerprintsFragments(fragments=fragment_list) + + +@pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") +def test_molfeatures_type_get_descriptor_values_mordreddescriptors(): + values = { + "NssCH2": { + 0: 
0.5963718820861676, + 1: -1.5, + 2: -0.28395061728395066, + 3: -8.34319526627219, + }, + "ATSC2d": {0: 0.0, 1: 0.0, 2: 1.0, 3: 0.0}, + } + + molfeature = MordredDescriptors(descriptors=["NssCH2", "ATSC2d"]) + generated = molfeature.get_descriptor_values(VALID_SMILES) + assert_frame_equal(generated, pd.DataFrame.from_dict(values)) +@pytest.mark.parametrize( + "mordred_list", + [ + (["NssCH2", "NssCH2", "ATSC2d"]), + (["desc", "NssCH2", "ATSC2d"]), + ], +) @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") -def test_to_fragments(): - m = MolecularInput(key="molecules") - data = m.to_fragments(VALID_SMILES) - assert data.shape[0] == 6 +def test_molfeatures_type_mordreddescriptors_invalid(mordred_list): with pytest.raises(ValueError): - m.to_fragments(INVALID_SMILES) + MordredDescriptors(descriptors=mordred_list) diff --git a/tests/bofire/data_models/test_molfeatures.py b/tests/bofire/data_models/test_molfeatures.py new file mode 100644 index 000000000..28319a219 --- /dev/null +++ b/tests/bofire/data_models/test_molfeatures.py @@ -0,0 +1,25 @@ +import importlib + +import pytest + +import bofire.data_models.molfeatures.names as names + +RDKIT_AVAILABLE = importlib.util.find_spec("rdkit") is not None + + +@pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") +def test_framents(): + from rdkit.Chem import Descriptors + + assert names.fragments == [ + rdkit_fragment[0] for rdkit_fragment in Descriptors.descList[124:] + ] + + +@pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") +def test_mordred(): + from mordred import Calculator + from mordred import descriptors as mordred_descriptors + + calc = Calculator(mordred_descriptors, ignore_3D=False) + assert names.mordred == [str(d) for d in calc.descriptors] diff --git a/tests/bofire/surrogates/test_gps.py b/tests/bofire/surrogates/test_gps.py index 3156e57ce..6e0312f9c 100644 --- a/tests/bofire/surrogates/test_gps.py +++ b/tests/bofire/surrogates/test_gps.py @@ -1,3 +1,6 @@ +import 
importlib + +import pandas as pd import pytest import torch from botorch.models import MixedSingleTaskGP, SingleTaskGP @@ -20,12 +23,20 @@ CategoricalInput, ContinuousInput, ContinuousOutput, + MolecularInput, ) from bofire.data_models.kernels.api import ( HammondDistanceKernel, MaternKernel, RBFKernel, ScaleKernel, + TanimotoKernel, +) +from bofire.data_models.molfeatures.api import ( + Fingerprints, + FingerprintsFragments, + Fragments, + MordredDescriptors, ) from bofire.data_models.priors.api import ( BOTORCH_LENGTHCALE_PRIOR, @@ -41,10 +52,13 @@ SingleTaskGPHyperconfig, SingleTaskGPSurrogate, ) +from bofire.data_models.surrogates.tanimoto_gp import TanimotoGPSurrogate from bofire.data_models.surrogates.trainable import metrics2objectives from bofire.surrogates.single_task_gp import get_scaler from bofire.utils.torch_tools import tkwargs +RDKIT_AVAILABLE = importlib.util.find_spec("rdkit") is not None + @pytest.mark.parametrize( "scaler_enum, input_preprocessing_specs, expected_scaler, expected_indices, expected_offset, expected_coefficient", @@ -300,6 +314,69 @@ def test_SingleTaskGPHyperconfig(): ) +@pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") +@pytest.mark.parametrize( + "kernel, specs", + [ + ( + ScaleKernel(base_kernel=TanimotoKernel(ard=False)), + {"x_1": Fingerprints(n_bits=32)}, + ), + ( + ScaleKernel(base_kernel=TanimotoKernel(ard=True)), + {"x_1": Fragments()}, + ), + ( + ScaleKernel(base_kernel=TanimotoKernel(ard=False)), + {"x_1": FingerprintsFragments(n_bits=32)}, + ), + ( + ScaleKernel(base_kernel=RBFKernel(ard=True)), + {"x_1": MordredDescriptors(descriptors=["NssCH2", "ATSC2d"])}, + ), + ], +) +def test_TanimotoGP(kernel, specs): + inputs = Inputs(features=[MolecularInput(key="x_1")]) + outputs = Outputs(features=[ContinuousOutput(key="y")]) + experiments = [ + ["CC(=O)Oc1ccccc1C(=O)O", 88.0], + ["c1ccccc1", 35.0], + ["[CH3][CH2][OH]", 69.0], + ["N[C@](C)(F)C(=O)O", 20.0], + ] + experiments = pd.DataFrame(experiments, 
columns=["x_1", "y"]) + experiments["valid_y"] = 1 + model = TanimotoGPSurrogate( + inputs=inputs, + outputs=outputs, + kernel=kernel, + input_preprocessing_specs=specs, + ) + model = surrogates.map(model) + model.fit(experiments) + # dump the model + dump = model.dumps() + # make predictions + preds = model.predict(experiments.iloc[:-1]) + assert preds.shape == (3, 2) + # check that model is composed correctly + assert isinstance(model.model, SingleTaskGP) + assert isinstance(model.model.outcome_transform, Standardize) + assert model.is_compatibilized is False + # reload the model from dump and check for equality in predictions + model2 = TanimotoGPSurrogate( + inputs=inputs, + outputs=outputs, + kernel=kernel, + input_preprocessing_specs=specs, + ) + model2 = surrogates.map(model2) + model2.loads(dump) + preds2 = model2.predict(experiments.iloc[:-1]) + assert_frame_equal(preds, preds2) + + def test_MixedGPModel_invalid_preprocessing(): inputs = Inputs( features=[ diff --git a/tests/bofire/utils/test_cheminformatics.py b/tests/bofire/utils/test_cheminformatics.py index 13b224b44..5375f5db5 100644 --- a/tests/bofire/utils/test_cheminformatics.py +++ b/tests/bofire/utils/test_cheminformatics.py @@ -1,12 +1,14 @@ import importlib +import numpy as np import pytest -from bofire.utils.cheminformatics import ( - smiles2bag_of_characters, +from bofire.utils.cheminformatics import ( # smiles2bag_of_characters, smiles2fingerprints, smiles2fragments, + smiles2fragments_fingerprints, smiles2mol, + smiles2mordred, ) RDKIT_AVAILABLE = importlib.util.find_spec("rdkit") is not None @@ -21,23 +23,352 @@ def test_smiles2mol(): smiles2mol("CC(=O)Oc1ccccc1C(=O)O") -@pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") -def test_smiles2bag_of_characters(): - smiles = ["CC(=O)Oc1ccccc1C(=O)O", "c1ccccc1"] - desc = smiles2bag_of_characters(smiles=smiles) - assert desc.shape[0] == 2 +smiles = [ + "CC(=O)Oc1ccccc1C(=O)O", + "c1ccccc1", + "[CH3][CH2][OH]", + 
"N[C@](C)(F)C(=O)O", +] @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") def test_smiles2fingerprints(): - smiles = ["CC(=O)Oc1ccccc1C(=O)O", "c1ccccc1"] - desc = smiles2fingerprints(smiles=smiles, n_bits=512) - assert desc.shape[0] == 2 - assert desc.shape[1] == 512 + values = np.array( + [ + [ + 1, + 1, + 1, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 1, + 1, + 0, + 1, + 0, + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 1, + 1, + 0, + 1, + 0, + 1, + 1, + 1, + 0, + 0, + ], + [ + 1, + 0, + 0, + 0, + 0, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ], + [ + 0, + 1, + 1, + 0, + 0, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 0, + ], + [ + 0, + 1, + 0, + 1, + 0, + 1, + 0, + 1, + 1, + 0, + 1, + 0, + 0, + 1, + 0, + 0, + 0, + 0, + 1, + 1, + 0, + 0, + 0, + 1, + 0, + 1, + 0, + 0, + 0, + 1, + 0, + 0, + ], + ] + ) + desc = smiles2fingerprints(smiles=smiles, n_bits=32) + assert desc.shape[0] == 4 + assert desc.shape[1] == 32 + np.testing.assert_array_equal(desc, values) @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") def test_smiles2fragments(): - smiles = ["CC(=O)Oc1ccccc1C(=O)O", "c1ccccc1"] - desc = smiles2fragments(smiles=smiles) - assert desc.shape[0] == 2 + values = np.array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]) + + desc = smiles2fragments( + smiles=smiles, fragments_list=["fr_unbrch_alkane", "fr_thiocyan"] + ) + assert desc.shape[0] == 4 + assert desc.shape[1] == 2 + np.testing.assert_array_equal(desc, values) + + +@pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") +def test_smiles2fragments_fingerprints(): + values = np.array( + [ + [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + 1.0, + 0.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + 0.0, + 0.0, + 1.0, + 1.0, + 0.0, + 1.0, + 0.0, + 1.0, + 1.0, + 1.0, + 0.0, + 
0.0, + 0.0, + 0.0, + ], + [ + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ], + [ + 0.0, + 1.0, + 1.0, + 0.0, + 0.0, + 0.0, + 1.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.0, + 0.0, + 0.0, + 0.0, + ], + [ + 0.0, + 1.0, + 0.0, + 1.0, + 0.0, + 1.0, + 0.0, + 1.0, + 1.0, + 0.0, + 1.0, + 0.0, + 0.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.0, + 1.0, + 0.0, + 0.0, + 0.0, + 1.0, + 0.0, + 1.0, + 0.0, + 0.0, + 0.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + ], + ] + ) + + desc = smiles2fragments_fingerprints( + smiles=smiles, n_bits=32, fragments_list=["fr_unbrch_alkane", "fr_thiocyan"] + ) + assert desc.shape[0] == 4 + assert desc.shape[1] == 32 + 2 + np.testing.assert_array_equal(desc, values) + + +@pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") +def test_smiles2mordred(): + values = np.array( + [ + [0.5963718820861676, 0.0], + [-1.5, 0.0], + [-0.28395061728395066, 1.0], + [-8.34319526627219, 0.0], + ] + ) + + desc = smiles2mordred(smiles=smiles, descriptors_list=["NssCH2", "ATSC2d"]) + assert desc.shape[0] == 4 + assert desc.shape[1] == 2 + np.testing.assert_array_almost_equal(desc, values) + + +# @pytest.mark.skipif(not RDKIT_AVAILABLE, reason="requires rdkit") +# def test_smiles2bag_of_characters(): +# smiles = ["CC(=O)Oc1ccccc1C(=O)O", "c1ccccc1"] +# desc = smiles2bag_of_characters(smiles=smiles) +# assert desc.shape[0] == 2