From b5522e19eb3ddf93716979266e02e123be77e425 Mon Sep 17 00:00:00 2001 From: Jose Pablo Folch Date: Wed, 21 Feb 2024 16:40:08 +0000 Subject: [PATCH 01/14] initial attempt to incorporate MultiTask GPs --- .gitignore | 4 + bofire/benchmarks/single.py | 69 ++++++++++++++ bofire/data_models/priors/api.py | 7 ++ bofire/data_models/priors/lkj.py | 21 +++++ bofire/data_models/surrogates/api.py | 6 ++ .../data_models/surrogates/multi_task_gp.py | 94 +++++++++++++++++++ bofire/priors/mapper.py | 7 ++ bofire/surrogates/mapper.py | 2 + bofire/surrogates/multi_task_gp.py | 73 ++++++++++++++ tutorials/multi_task_gp_testing.py | 51 ++++++++++ 10 files changed, 334 insertions(+) create mode 100644 bofire/data_models/priors/lkj.py create mode 100644 bofire/data_models/surrogates/multi_task_gp.py create mode 100644 bofire/surrogates/multi_task_gp.py create mode 100644 tutorials/multi_task_gp_testing.py diff --git a/.gitignore b/.gitignore index eb8e1e93c..85a383d0c 100644 --- a/.gitignore +++ b/.gitignore @@ -133,3 +133,7 @@ dmypy.json # generated version file bofire/version.py + +# OS generated files +.DS_Store +.DS_Store? \ No newline at end of file diff --git a/bofire/benchmarks/single.py b/bofire/benchmarks/single.py index def5ea3f7..7eec321f8 100644 --- a/bofire/benchmarks/single.py +++ b/bofire/benchmarks/single.py @@ -375,6 +375,75 @@ def get_optima(self) -> pd.DataFrame: ) +class MultiFidelityHimmelblau(Benchmark): + """Himmelblau function for testing optimization algorithms + Link to the definition: https://en.wikipedia.org/wiki/Himmelblau%27s_function + """ + + def __init__(self, use_constraints: bool = False, **kwargs): + """Initialiszes class of type Himmelblau. + + Args: + best_possible_f (float, optional): Not implemented yet. Defaults to 0.0. + use_constraints (bool, optional): Whether constraints should be used or not (Not implemented yet.). Defaults to False. 
+ + Raises: + ValueError: As constraints are not implemeted yet, a True value for use_constraints yields a ValueError. + """ + super().__init__(**kwargs) + self.use_constraints = use_constraints + inputs = [] + + inputs.append(DiscreteInput(key="fid", values=[0, 1])) + inputs.append(ContinuousInput(key="x_1", bounds=(-6, 6))) + inputs.append(ContinuousInput(key="x_2", bounds=(-6, 6))) + + objective = MinimizeObjective(w=1.0) + output_feature = ContinuousOutput(key="y", objective=objective) + if self.use_constraints: + raise ValueError("Not implemented yet!") + self._domain = Domain( + inputs=Inputs(features=inputs), + outputs=Outputs(features=[output_feature]), + ) + + def _f(self, X: pd.DataFrame, **kwargs) -> pd.DataFrame: + """Evaluates benchmark function. + + Args: + X (pd.DataFrame): Input values. Columns are x_1 and x_2 + + Returns: + pd.DataFrame: y values of the function. Columns are y and valid_y. + """ + X_temp = X.eval( + "y=((x_1**2 + x_2 - 11)**2+(x_1 + x_2**2 -7)**2) + fid * x_1 * x_2", + inplace=False, + ) + Y = pd.DataFrame({"y": X_temp["y"], "valid_y": 1}) + return Y + + def get_optima(self) -> pd.DataFrame: + """Returns positions of optima of the benchmark function. + + Returns: + pd.DataFrame: x values of optima. 
Colums are x_1 and x_2 + """ + x = np.array( + [ + [3.0, 2.0], + [-2.805118, 3.131312], + [-3.779310, -3.283186], + [3.584428, -1.848126], + ] + ) + y = np.zeros(4) + return pd.DataFrame( + np.c_[x, y], + columns=self.domain.inputs.get_keys() + self.domain.outputs.get_keys(), + ) + + class DiscreteHimmelblau(Himmelblau): def __init__(self, **kwargs): super().__init__(**kwargs) diff --git a/bofire/data_models/priors/api.py b/bofire/data_models/priors/api.py index 37f80bca8..bfa12d710 100644 --- a/bofire/data_models/priors/api.py +++ b/bofire/data_models/priors/api.py @@ -2,6 +2,7 @@ from typing import Union from bofire.data_models.priors.gamma import GammaPrior +from bofire.data_models.priors.lkj import LKJPrior from bofire.data_models.priors.normal import NormalPrior from bofire.data_models.priors.prior import Prior @@ -17,6 +18,9 @@ BOTORCH_LENGTHCALE_PRIOR = partial(GammaPrior, concentration=3.0, rate=6.0) BOTORCH_NOISE_PRIOR = partial(GammaPrior, concentration=1.1, rate=0.05) BOTORCH_SCALE_PRIOR = partial(GammaPrior, concentration=2.0, rate=0.15) +BOTORCH_LKJ_PRIOR = partial( + LKJPrior, n_tasks=1, eta=2.0, sd_prior=GammaPrior(concentration=2.0, rate=0.15) +) # mbo priors # By default BoTorch places a highly informative prior on the kernel lengthscales, @@ -25,3 +29,6 @@ MBO_LENGTHCALE_PRIOR = partial(GammaPrior, concentration=2.0, rate=0.2) MBO_NOISE_PRIOR = partial(GammaPrior, concentration=2.0, rate=4.0) MBO_OUTPUTSCALE_PRIOR = partial(GammaPrior, concentration=2.0, rate=4.0) +MBO_LKJ_PRIOR = partial( + LKJPrior, n_tasks=1, eta=2.0, sd_prior=GammaPrior(concentration=2.0, rate=0.15) +) diff --git a/bofire/data_models/priors/lkj.py b/bofire/data_models/priors/lkj.py new file mode 100644 index 000000000..833b93838 --- /dev/null +++ b/bofire/data_models/priors/lkj.py @@ -0,0 +1,21 @@ +from typing import Literal + +from pydantic import PositiveFloat + +from bofire.data_models.priors.gamma import GammaPrior +from bofire.data_models.priors.prior import Prior + + 
+class LKJPrior(Prior): + """LKJ prior over correlation matrices. Allows to specify the shape of the prior. + + Attributes: + n(int): number of dimensions of the correlation matrix + eta(PositiveFloat): shape parameter of the LKJ distribution + sd_prior(Prior): prior over the standard deviations of the correlation matrix + """ + + type: Literal["LKJPrior"] = "LKJPrior" + n_tasks: int + eta: PositiveFloat + sd_prior: GammaPrior diff --git a/bofire/data_models/surrogates/api.py b/bofire/data_models/surrogates/api.py index b53d95a4c..7adfb9c69 100644 --- a/bofire/data_models/surrogates/api.py +++ b/bofire/data_models/surrogates/api.py @@ -18,6 +18,10 @@ MixedTanimotoGPSurrogate, ) from bofire.data_models.surrogates.mlp import MLPEnsemble + from bofire.data_models.surrogates.multi_task_gp import ( + MultiTaskGPHyperconfig, + MultiTaskGPSurrogate, + ) from bofire.data_models.surrogates.polynomial import PolynomialSurrogate from bofire.data_models.surrogates.random_forest import RandomForestSurrogate from bofire.data_models.surrogates.single_task_gp import ( @@ -43,6 +47,7 @@ LinearSurrogate, PolynomialSurrogate, TanimotoGPSurrogate, + MultiTaskGPSurrogate, ] AnyTrainableSurrogate = Union[ @@ -56,6 +61,7 @@ LinearSurrogate, PolynomialSurrogate, TanimotoGPSurrogate, + MultiTaskGPSurrogate, ] except ImportError: # with the minimal installationwe don't have botorch diff --git a/bofire/data_models/surrogates/multi_task_gp.py b/bofire/data_models/surrogates/multi_task_gp.py new file mode 100644 index 000000000..dd6f14968 --- /dev/null +++ b/bofire/data_models/surrogates/multi_task_gp.py @@ -0,0 +1,94 @@ +from typing import Annotated, Literal, Optional + +import pandas as pd +from pydantic import Field + +from bofire.data_models.domain.api import Inputs +from bofire.data_models.enum import RegressionMetricsEnum +from bofire.data_models.features.api import CategoricalInput +from bofire.data_models.kernels.api import ( + AnyKernel, + MaternKernel, + RBFKernel, +) +from 
bofire.data_models.priors.api import ( + BOTORCH_LENGTHCALE_PRIOR, + BOTORCH_LKJ_PRIOR, + BOTORCH_NOISE_PRIOR, + MBO_LENGTHCALE_PRIOR, + MBO_NOISE_PRIOR, + AnyPrior, +) +from bofire.data_models.priors.lkj import LKJPrior + +# from bofire.data_models.strategies.api import FactorialStrategy +from bofire.data_models.surrogates.trainable import Hyperconfig +from bofire.data_models.surrogates.trainable_botorch import TrainableBotorchSurrogate + + +class MultiTaskGPHyperconfig(Hyperconfig): + type: Literal["MultiTaskGPHyperconfig"] = "MultiTaskGPHyperconfig" + inputs: Inputs = Inputs( + features=[ + CategoricalInput( + key="kernel", categories=["rbf", "matern_1.5", "matern_2.5"] + ), + CategoricalInput(key="prior", categories=["mbo", "botorch"]), + CategoricalInput(key="ard", categories=["True", "False"]), + ] + ) + target_metric: RegressionMetricsEnum = RegressionMetricsEnum.MAE + hyperstrategy: Literal[ + "FactorialStrategy", "SoboStrategy", "RandomStrategy" + ] = "FactorialStrategy" + + @staticmethod + def _update_hyperparameters( + surrogate_data: "MultiTaskGPSurrogate", hyperparameters: pd.Series + ): + def matern_25(ard: bool, lengthscale_prior: AnyPrior) -> MaternKernel: + return MaternKernel(nu=2.5, lengthscale_prior=lengthscale_prior, ard=ard) + + def matern_15(ard: bool, lengthscale_prior: AnyPrior) -> MaternKernel: + return MaternKernel(nu=1.5, lengthscale_prior=lengthscale_prior, ard=ard) + + if hyperparameters.prior == "mbo": + noise_prior, lengthscale_prior = (MBO_NOISE_PRIOR(), MBO_LENGTHCALE_PRIOR()) + else: + noise_prior, lengthscale_prior = ( + BOTORCH_NOISE_PRIOR(), + BOTORCH_LENGTHCALE_PRIOR(), + ) + + surrogate_data.noise_prior = noise_prior + if hyperparameters.kernel == "rbf": + surrogate_data.kernel = ( + RBFKernel(ard=hyperparameters.ard, lengthscale_prior=lengthscale_prior), + ) + elif hyperparameters.kernel == "matern_2.5": + surrogate_data.kernel = matern_25( + ard=hyperparameters.ard, lengthscale_prior=lengthscale_prior + ) + elif 
hyperparameters.kernel == "matern_1.5": + surrogate_data.kernel = matern_15( + ard=hyperparameters.ard, lengthscale_prior=lengthscale_prior + ) + else: + raise ValueError(f"Kernel {hyperparameters.kernel} not known.") + + +class MultiTaskGPSurrogate(TrainableBotorchSurrogate): + type: Literal["MultiTaskGPSurrogate"] = "MultiTaskGPSurrogate" + n_tasks: Annotated[int, Field(ge=1)] = 1 + kernel: AnyKernel = Field( + default_factory=lambda: MaternKernel( + ard=True, + nu=2.5, + lengthscale_prior=BOTORCH_LENGTHCALE_PRIOR(), + ) + ) + noise_prior: AnyPrior = Field(default_factory=lambda: BOTORCH_NOISE_PRIOR()) + lkj_prior: LKJPrior = Field(default_factory=lambda: BOTORCH_LKJ_PRIOR()) + hyperconfig: Optional[MultiTaskGPHyperconfig] = Field( + default_factory=lambda: MultiTaskGPHyperconfig() + ) diff --git a/bofire/priors/mapper.py b/bofire/priors/mapper.py index 48e4cb028..ae55cca9d 100644 --- a/bofire/priors/mapper.py +++ b/bofire/priors/mapper.py @@ -13,9 +13,16 @@ def map_GammaPrior(data_model: data_models.GammaPrior) -> gpytorch.priors.GammaP ) +def map_LKJPrior(data_model: data_models.LKJPrior) -> gpytorch.priors.LKJPrior: + return gpytorch.priors.LKJCovariancePrior( + n=data_model.n_tasks, eta=data_model.eta, sd_prior=map(data_model.sd_prior) + ) + + PRIOR_MAP = { data_models.NormalPrior: map_NormalPrior, data_models.GammaPrior: map_GammaPrior, + data_models.LKJPrior: map_LKJPrior, } diff --git a/bofire/surrogates/mapper.py b/bofire/surrogates/mapper.py index 3ab102f00..52542c794 100644 --- a/bofire/surrogates/mapper.py +++ b/bofire/surrogates/mapper.py @@ -6,6 +6,7 @@ from bofire.surrogates.mixed_single_task_gp import MixedSingleTaskGPSurrogate from bofire.surrogates.mixed_tanimoto_gp import MixedTanimotoGPSurrogate from bofire.surrogates.mlp import MLPEnsemble +from bofire.surrogates.multi_task_gp import MultiTaskGPSurrogate from bofire.surrogates.random_forest import RandomForestSurrogate from bofire.surrogates.single_task_gp import SingleTaskGPSurrogate from 
bofire.surrogates.surrogate import Surrogate @@ -23,6 +24,7 @@ data_models.LinearSurrogate: SingleTaskGPSurrogate, data_models.PolynomialSurrogate: SingleTaskGPSurrogate, data_models.TanimotoGPSurrogate: SingleTaskGPSurrogate, + data_models.MultiTaskGPSurrogate: MultiTaskGPSurrogate, } diff --git a/bofire/surrogates/multi_task_gp.py b/bofire/surrogates/multi_task_gp.py new file mode 100644 index 000000000..eabfa9024 --- /dev/null +++ b/bofire/surrogates/multi_task_gp.py @@ -0,0 +1,73 @@ +from typing import Dict, Optional + +import botorch +import pandas as pd +import torch +from botorch.fit import fit_gpytorch_mll +from botorch.models.transforms.outcome import Standardize +from gpytorch.mlls import ExactMarginalLogLikelihood + +import bofire.kernels.api as kernels +import bofire.priors.api as priors +from bofire.data_models.enum import OutputFilteringEnum + +# from bofire.data_models.molfeatures.api import MolFeatures +from bofire.data_models.surrogates.api import MultiTaskGPSurrogate as DataModel +from bofire.data_models.surrogates.scaler import ScalerEnum +from bofire.surrogates.botorch import BotorchSurrogate +from bofire.surrogates.trainable import TrainableSurrogate +from bofire.surrogates.utils import get_scaler +from bofire.utils.torch_tools import tkwargs + + +class MultiTaskGPSurrogate(BotorchSurrogate, TrainableSurrogate): + def __init__( + self, + data_model: DataModel, + **kwargs, + ): + self.n_tasks = data_model.n_tasks + self.kernel = data_model.kernel + self.scaler = data_model.scaler + self.output_scaler = data_model.output_scaler + self.noise_prior = data_model.noise_prior + self.lkj_prior = data_model.lkj_prior + # set the number of tasks in the prior + self.lkj_prior.n_tasks = self.n_tasks + super().__init__(data_model=data_model, **kwargs) + + model: Optional[botorch.models.MultiTaskGP] = None + _output_filtering: OutputFilteringEnum = OutputFilteringEnum.ALL + training_specs: Dict = {} + + def _fit(self, X: pd.DataFrame, Y: pd.DataFrame): + 
scaler = get_scaler(self.inputs, self.input_preprocessing_specs, self.scaler, X) + transformed_X = self.inputs.transform(X, self.input_preprocessing_specs) + + tX, tY = torch.from_numpy(transformed_X.values).to(**tkwargs), torch.from_numpy( + Y.values + ).to(**tkwargs) + + self.model = botorch.models.MultiTaskGP( # type: ignore + train_X=tX, + train_Y=tY, + task_feature=X.columns.get_loc("fid"), # obtain the fidelity index + covar_module=kernels.map( + self.kernel, + batch_shape=torch.Size(), + active_dims=list( + range(tX.shape[1] - 1) + ), # kernel is for input space so we subtract one for the fidelity index + ard_num_dims=1, # this keyword is ingored + ), + task_covar_prior=priors.map(self.lkj_prior), + outcome_transform=Standardize(m=tY.shape[-1]) + if self.output_scaler == ScalerEnum.STANDARDIZE + else None, + input_transform=scaler, + ) + + self.model.likelihood.noise_covar.noise_prior = priors.map(self.noise_prior) # type: ignore + + mll = ExactMarginalLogLikelihood(self.model.likelihood, self.model) + fit_gpytorch_mll(mll, options=self.training_specs, max_attempts=10) diff --git a/tutorials/multi_task_gp_testing.py b/tutorials/multi_task_gp_testing.py new file mode 100644 index 000000000..c89b45126 --- /dev/null +++ b/tutorials/multi_task_gp_testing.py @@ -0,0 +1,51 @@ +import json + +from pydantic import parse_obj_as + +import bofire.surrogates.api as surrogates +from bofire.benchmarks.single import MultiFidelityHimmelblau +from bofire.data_models.surrogates.api import ( + AnySurrogate, + MultiTaskGPSurrogate, +) + +benchmark = MultiFidelityHimmelblau() +samples = benchmark.domain.inputs.sample(n=50) +experiments = benchmark.f(samples, return_complete=True) + +# make fid the columns in order [fid, x_1, x_2, y, valid_y] +experiments = experiments[["fid", "x_1", "x_2", "y", "valid_y"]] + +input_features = benchmark.domain.inputs +output_features = benchmark.domain.outputs + +# we setup the data model, here a Multi Task GP +surrogate_data = 
MultiTaskGPSurrogate( + inputs=input_features, outputs=output_features, n_tasks=2 +) + +# we generate the json spec +jspec = surrogate_data.json() + +surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec)) + +surrogate = surrogates.map(surrogate_data) + +surrogate.fit(experiments=experiments) + +# dump it +dump = surrogate.dumps() + +# predict with it +df_predictions = surrogate.predict(experiments) +# transform to spec +predictions = surrogate.to_predictions(predictions=df_predictions) + +surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec)) +surrogate = surrogates.map(surrogate_data) +# surrogate.loads(dump) + +# predict with it +df_predictions2 = surrogate.predict(experiments) +# transform to spec +predictions2 = surrogate.to_predictions(predictions=df_predictions2) From 12363afd98bbbe1ed8a57f640c8cd639b3873498 Mon Sep 17 00:00:00 2001 From: Jose Pablo Folch Date: Tue, 27 Feb 2024 17:01:43 +0000 Subject: [PATCH 02/14] added TaskInputs functionality --- bofire/benchmarks/single.py | 5 +-- bofire/data_models/features/api.py | 3 ++ bofire/data_models/features/tasks.py | 32 +++++++++++++++++++ .../data_models/surrogates/multi_task_gp.py | 4 +-- bofire/surrogates/multi_task_gp.py | 21 +++++++++++- 5 files changed, 60 insertions(+), 5 deletions(-) create mode 100644 bofire/data_models/features/tasks.py diff --git a/bofire/benchmarks/single.py b/bofire/benchmarks/single.py index 7eec321f8..693fb6562 100644 --- a/bofire/benchmarks/single.py +++ b/bofire/benchmarks/single.py @@ -17,6 +17,7 @@ ContinuousInput, ContinuousOutput, DiscreteInput, + TaskInput, ) from bofire.data_models.objectives.api import MaximizeObjective, MinimizeObjective from bofire.utils.torch_tools import tkwargs @@ -394,7 +395,7 @@ def __init__(self, use_constraints: bool = False, **kwargs): self.use_constraints = use_constraints inputs = [] - inputs.append(DiscreteInput(key="fid", values=[0, 1])) + inputs.append(TaskInput(key="task_id", n_tasks=2, fidelities=[0, 1])) 
inputs.append(ContinuousInput(key="x_1", bounds=(-6, 6))) inputs.append(ContinuousInput(key="x_2", bounds=(-6, 6))) @@ -417,7 +418,7 @@ def _f(self, X: pd.DataFrame, **kwargs) -> pd.DataFrame: pd.DataFrame: y values of the function. Columns are y and valid_y. """ X_temp = X.eval( - "y=((x_1**2 + x_2 - 11)**2+(x_1 + x_2**2 -7)**2) + fid * x_1 * x_2", + "y=((x_1**2 + x_2 - 11)**2+(x_1 + x_2**2 -7)**2) + (1 - task_id) * x_1 * x_2", inplace=False, ) Y = pd.DataFrame({"y": X_temp["y"], "valid_y": 1}) diff --git a/bofire/data_models/features/api.py b/bofire/data_models/features/api.py index 5dc9b7bd8..2927c76f7 100644 --- a/bofire/data_models/features/api.py +++ b/bofire/data_models/features/api.py @@ -13,6 +13,7 @@ MolecularInput, ) from bofire.data_models.features.numerical import NumericalInput +from bofire.data_models.features.tasks import TaskInput AbstractFeature = Union[ Feature, @@ -32,6 +33,7 @@ CategoricalDescriptorInput, MolecularInput, CategoricalMolecularInput, + TaskInput, ] AnyInput = Union[ @@ -42,6 +44,7 @@ CategoricalDescriptorInput, MolecularInput, CategoricalMolecularInput, + TaskInput, ] AnyOutput = Union[ContinuousOutput, CategoricalOutput] diff --git a/bofire/data_models/features/tasks.py b/bofire/data_models/features/tasks.py new file mode 100644 index 000000000..6fcc9d4b6 --- /dev/null +++ b/bofire/data_models/features/tasks.py @@ -0,0 +1,32 @@ +from typing import List, Literal + +import numpy as np +from pydantic import model_validator, validator + +from bofire.data_models.features.api import DiscreteInput + + +class TaskInput(DiscreteInput): + type: Literal["TaskInput"] = "TaskInput" + n_tasks: int + fidelities: List[int] + + @validator("fidelities") + def validate_fidelities(cls, fidelities: List[int], values): + # if fidelities is None: + # return [0 for _ in range(self.n_tasks)] + if len(fidelities) != values["n_tasks"]: + raise ValueError( + "Length of fidelity lists must be equal to the number of tasks" + ) + if list(set(fidelities)) != 
list(range(np.max(fidelities) + 1)): + raise ValueError( + "Fidelities must be a list containing integers, starting from 0 and increasing by 1" + ) + return fidelities + + @model_validator(mode="before") + def validate_values(cls, values): + if "n_tasks" in values: + values["values"] = list(range(values["n_tasks"])) + return values diff --git a/bofire/data_models/surrogates/multi_task_gp.py b/bofire/data_models/surrogates/multi_task_gp.py index dd6f14968..a6994f981 100644 --- a/bofire/data_models/surrogates/multi_task_gp.py +++ b/bofire/data_models/surrogates/multi_task_gp.py @@ -13,8 +13,8 @@ ) from bofire.data_models.priors.api import ( BOTORCH_LENGTHCALE_PRIOR, - BOTORCH_LKJ_PRIOR, BOTORCH_NOISE_PRIOR, + LKJ_PRIOR, MBO_LENGTHCALE_PRIOR, MBO_NOISE_PRIOR, AnyPrior, @@ -88,7 +88,7 @@ class MultiTaskGPSurrogate(TrainableBotorchSurrogate): ) ) noise_prior: AnyPrior = Field(default_factory=lambda: BOTORCH_NOISE_PRIOR()) - lkj_prior: LKJPrior = Field(default_factory=lambda: BOTORCH_LKJ_PRIOR()) + lkj_prior: LKJPrior = Field(default_factory=lambda: LKJ_PRIOR()) hyperconfig: Optional[MultiTaskGPHyperconfig] = Field( default_factory=lambda: MultiTaskGPHyperconfig() ) diff --git a/bofire/surrogates/multi_task_gp.py b/bofire/surrogates/multi_task_gp.py index eabfa9024..d5bdb60ca 100644 --- a/bofire/surrogates/multi_task_gp.py +++ b/bofire/surrogates/multi_task_gp.py @@ -1,6 +1,7 @@ from typing import Dict, Optional import botorch +import numpy as np import pandas as pd import torch from botorch.fit import fit_gpytorch_mll @@ -34,6 +35,12 @@ def __init__( self.lkj_prior = data_model.lkj_prior # set the number of tasks in the prior self.lkj_prior.n_tasks = self.n_tasks + # obtain the name of the task feature + for feature in data_model.inputs.features: + if feature.type == "TaskInput": + self.task_feature_key = feature.key + break + super().__init__(data_model=data_model, **kwargs) model: Optional[botorch.models.MultiTaskGP] = None @@ -51,7 +58,9 @@ def _fit(self, X: 
pd.DataFrame, Y: pd.DataFrame): self.model = botorch.models.MultiTaskGP( # type: ignore train_X=tX, train_Y=tY, - task_feature=X.columns.get_loc("fid"), # obtain the fidelity index + task_feature=X.columns.get_loc( + self.task_feature_key + ), # obtain the fidelity index covar_module=kernels.map( self.kernel, batch_shape=torch.Size(), @@ -71,3 +80,13 @@ def _fit(self, X: pd.DataFrame, Y: pd.DataFrame): mll = ExactMarginalLogLikelihood(self.model.likelihood, self.model) fit_gpytorch_mll(mll, options=self.training_specs, max_attempts=10) + + def _predict(self, transformed_X: pd.DataFrame): + # transform to tensor + X = torch.from_numpy(transformed_X.values).to(**tkwargs) + with torch.no_grad(): + preds = self.model.posterior(X=X, observation_noise=False).mean.cpu().detach().numpy() # type: ignore + vars = self.model.posterior(X=X, observation_noise=False).variance.cpu().detach().numpy() # type: ignore + # add the observation noise to the stds + stds = np.sqrt(vars + self.model.likelihood.noise.cpu().detach().numpy()) + return preds, stds From 47c4c96c70969ff87618c66cef6791a00182f544 Mon Sep 17 00:00:00 2001 From: Jose Pablo Folch Date: Tue, 27 Feb 2024 17:03:13 +0000 Subject: [PATCH 03/14] lkj prior clean-up --- bofire/data_models/priors/api.py | 9 ++++----- bofire/data_models/priors/lkj.py | 2 +- tutorials/multi_task_gp_testing.py | 6 ++++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/bofire/data_models/priors/api.py b/bofire/data_models/priors/api.py index bfa12d710..2697c856e 100644 --- a/bofire/data_models/priors/api.py +++ b/bofire/data_models/priors/api.py @@ -18,9 +18,6 @@ BOTORCH_LENGTHCALE_PRIOR = partial(GammaPrior, concentration=3.0, rate=6.0) BOTORCH_NOISE_PRIOR = partial(GammaPrior, concentration=1.1, rate=0.05) BOTORCH_SCALE_PRIOR = partial(GammaPrior, concentration=2.0, rate=0.15) -BOTORCH_LKJ_PRIOR = partial( - LKJPrior, n_tasks=1, eta=2.0, sd_prior=GammaPrior(concentration=2.0, rate=0.15) -) # mbo priors # By default BoTorch places 
a highly informative prior on the kernel lengthscales, @@ -29,6 +26,8 @@ MBO_LENGTHCALE_PRIOR = partial(GammaPrior, concentration=2.0, rate=0.2) MBO_NOISE_PRIOR = partial(GammaPrior, concentration=2.0, rate=4.0) MBO_OUTPUTSCALE_PRIOR = partial(GammaPrior, concentration=2.0, rate=4.0) -MBO_LKJ_PRIOR = partial( - LKJPrior, n_tasks=1, eta=2.0, sd_prior=GammaPrior(concentration=2.0, rate=0.15) + +# prior for multitask kernel +LKJ_PRIOR = partial( + LKJPrior, eta=2.0, sd_prior=GammaPrior(concentration=2.0, rate=0.15) ) diff --git a/bofire/data_models/priors/lkj.py b/bofire/data_models/priors/lkj.py index 833b93838..4e0988c90 100644 --- a/bofire/data_models/priors/lkj.py +++ b/bofire/data_models/priors/lkj.py @@ -16,6 +16,6 @@ class LKJPrior(Prior): """ type: Literal["LKJPrior"] = "LKJPrior" - n_tasks: int eta: PositiveFloat sd_prior: GammaPrior + n_tasks: int = 1 diff --git a/tutorials/multi_task_gp_testing.py b/tutorials/multi_task_gp_testing.py index c89b45126..d78ac6a42 100644 --- a/tutorials/multi_task_gp_testing.py +++ b/tutorials/multi_task_gp_testing.py @@ -14,7 +14,7 @@ experiments = benchmark.f(samples, return_complete=True) # make fid the columns in order [fid, x_1, x_2, y, valid_y] -experiments = experiments[["fid", "x_1", "x_2", "y", "valid_y"]] +experiments = experiments[["task_id", "x_1", "x_2", "y", "valid_y"]] input_features = benchmark.domain.inputs output_features = benchmark.domain.outputs @@ -43,9 +43,11 @@ surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec)) surrogate = surrogates.map(surrogate_data) -# surrogate.loads(dump) +surrogate.loads(dump) # predict with it df_predictions2 = surrogate.predict(experiments) # transform to spec predictions2 = surrogate.to_predictions(predictions=df_predictions2) + +assert predictions.equals(predictions2) From 92efb870af86b78fcd40029962f92bc6f0a118b1 Mon Sep 17 00:00:00 2001 From: Jose Pablo Folch Date: Tue, 27 Feb 2024 22:26:38 +0000 Subject: [PATCH 04/14] TaskInput inheriting from Categorical --- 
bofire/benchmarks/single.py | 19 ++++++++--- bofire/data_models/features/tasks.py | 32 ++++++++++++++++--- .../data_models/surrogates/multi_task_gp.py | 3 +- bofire/surrogates/multi_task_gp.py | 30 +++++++++++++---- tutorials/multi_task_gp_testing.py | 29 ++++++++--------- 5 files changed, 79 insertions(+), 34 deletions(-) diff --git a/bofire/benchmarks/single.py b/bofire/benchmarks/single.py index 693fb6562..5d774617e 100644 --- a/bofire/benchmarks/single.py +++ b/bofire/benchmarks/single.py @@ -395,7 +395,7 @@ def __init__(self, use_constraints: bool = False, **kwargs): self.use_constraints = use_constraints inputs = [] - inputs.append(TaskInput(key="task_id", n_tasks=2, fidelities=[0, 1])) + inputs.append(TaskInput(key="task_id", categories=["task_1", "task_2"])) inputs.append(ContinuousInput(key="x_1", bounds=(-6, 6))) inputs.append(ContinuousInput(key="x_2", bounds=(-6, 6))) @@ -417,11 +417,20 @@ def _f(self, X: pd.DataFrame, **kwargs) -> pd.DataFrame: Returns: pd.DataFrame: y values of the function. Columns are y and valid_y. 
""" - X_temp = X.eval( - "y=((x_1**2 + x_2 - 11)**2+(x_1 + x_2**2 -7)**2) + (1 - task_id) * x_1 * x_2", - inplace=False, + # initialize y outputs + Y = pd.DataFrame({"y": np.zeros(len(X)), "valid_y": 0}) + # evaluate task 1 + X_temp = X.query("task_id == 'task_1'").eval( + "y=((x_1**2 + x_2 - 11)**2+(x_1 + x_2**2 -7)**2)", inplace=False ) - Y = pd.DataFrame({"y": X_temp["y"], "valid_y": 1}) + Y.loc[X_temp.index, "y"] = X_temp["y"] + Y.loc[X_temp.index, "valid_y"] = 1 + # evaluate task 2 + X_temp = X.query("task_id == 'task_2'").eval( + "y=((x_1**2 + x_2 - 11)**2+(x_1 + x_2**2 -7)**2) + x_1 * x_2", inplace=False + ) + Y.loc[X_temp.index, "y"] = X_temp["y"] + Y.loc[X_temp.index, "valid_y"] = 1 return Y def get_optima(self) -> pd.DataFrame: diff --git a/bofire/data_models/features/tasks.py b/bofire/data_models/features/tasks.py index 6fcc9d4b6..0574a7bf6 100644 --- a/bofire/data_models/features/tasks.py +++ b/bofire/data_models/features/tasks.py @@ -1,17 +1,18 @@ -from typing import List, Literal +from typing import List, Literal, Optional import numpy as np -from pydantic import model_validator, validator +from pydantic import Field, field_validator, model_validator +from typing_extensions import Annotated -from bofire.data_models.features.api import DiscreteInput +from bofire.data_models.features.api import CategoricalInput, DiscreteInput -class TaskInput(DiscreteInput): +class TaskInputDiscrete(DiscreteInput): type: Literal["TaskInput"] = "TaskInput" n_tasks: int fidelities: List[int] - @validator("fidelities") + @field_validator("fidelities") def validate_fidelities(cls, fidelities: List[int], values): # if fidelities is None: # return [0 for _ in range(self.n_tasks)] @@ -30,3 +31,24 @@ def validate_values(cls, values): if "n_tasks" in values: values["values"] = list(range(values["n_tasks"])) return values + + +class TaskInput(CategoricalInput): + type: Literal["TaskInputCategorical"] = "TaskInput" + fidelities: Annotated[Optional[List[int]], 
Field(validate_default=True)] = None + + @field_validator("fidelities") + def validate_fidelities(cls, fidelities: List[int], values): + if "categories" in values.data: + n_tasks = len(values.data["categories"]) + if fidelities is None: + return [0 for _ in range(n_tasks)] + if len(fidelities) != n_tasks: + raise ValueError( + "Length of fidelity lists must be equal to the number of tasks" + ) + if list(set(fidelities)) != list(range(np.max(fidelities) + 1)): + raise ValueError( + "Fidelities must be a list containing integers, starting from 0 and increasing by 1" + ) + return fidelities diff --git a/bofire/data_models/surrogates/multi_task_gp.py b/bofire/data_models/surrogates/multi_task_gp.py index a6994f981..4cdc23341 100644 --- a/bofire/data_models/surrogates/multi_task_gp.py +++ b/bofire/data_models/surrogates/multi_task_gp.py @@ -1,4 +1,4 @@ -from typing import Annotated, Literal, Optional +from typing import Literal, Optional import pandas as pd from pydantic import Field @@ -79,7 +79,6 @@ def matern_15(ard: bool, lengthscale_prior: AnyPrior) -> MaternKernel: class MultiTaskGPSurrogate(TrainableBotorchSurrogate): type: Literal["MultiTaskGPSurrogate"] = "MultiTaskGPSurrogate" - n_tasks: Annotated[int, Field(ge=1)] = 1 kernel: AnyKernel = Field( default_factory=lambda: MaternKernel( ard=True, diff --git a/bofire/surrogates/multi_task_gp.py b/bofire/surrogates/multi_task_gp.py index d5bdb60ca..5b7ab7ef8 100644 --- a/bofire/surrogates/multi_task_gp.py +++ b/bofire/surrogates/multi_task_gp.py @@ -5,12 +5,14 @@ import pandas as pd import torch from botorch.fit import fit_gpytorch_mll +from botorch.models.transforms.input import OneHotToNumeric from botorch.models.transforms.outcome import Standardize from gpytorch.mlls import ExactMarginalLogLikelihood import bofire.kernels.api as kernels import bofire.priors.api as priors from bofire.data_models.enum import OutputFilteringEnum +from bofire.data_models.features.api import TaskInput # from 
bofire.data_models.molfeatures.api import MolFeatures from bofire.data_models.surrogates.api import MultiTaskGPSurrogate as DataModel @@ -27,7 +29,7 @@ def __init__( data_model: DataModel, **kwargs, ): - self.n_tasks = data_model.n_tasks + self.n_tasks = len(data_model.inputs.get(TaskInput).features[0].categories) self.kernel = data_model.kernel self.scaler = data_model.scaler self.output_scaler = data_model.output_scaler @@ -36,10 +38,7 @@ def __init__( # set the number of tasks in the prior self.lkj_prior.n_tasks = self.n_tasks # obtain the name of the task feature - for feature in data_model.inputs.features: - if feature.type == "TaskInput": - self.task_feature_key = feature.key - break + self.task_feature_key = data_model.inputs.get_keys(TaskInput)[0] super().__init__(data_model=data_model, **kwargs) @@ -55,8 +54,24 @@ def _fit(self, X: pd.DataFrame, Y: pd.DataFrame): Y.values ).to(**tkwargs) + features2idx, _ = self.inputs._get_transform_info( + self.input_preprocessing_specs + ) + + task_features = { + features2idx[self.task_feature_key][0]: len( + features2idx[self.task_feature_key] + ) + } + + self.o2n = OneHotToNumeric( + dim=tX.shape[1], + categorical_features=task_features, + transform_on_train=False, + ) + self.model = botorch.models.MultiTaskGP( # type: ignore - train_X=tX, + train_X=self.o2n.transform(tX), train_Y=tY, task_feature=X.columns.get_loc( self.task_feature_key @@ -65,7 +80,7 @@ def _fit(self, X: pd.DataFrame, Y: pd.DataFrame): self.kernel, batch_shape=torch.Size(), active_dims=list( - range(tX.shape[1] - 1) + range(self.o2n.transform(tX).shape[1] - 1) ), # kernel is for input space so we subtract one for the fidelity index ard_num_dims=1, # this keyword is ingored ), @@ -84,6 +99,7 @@ def _fit(self, X: pd.DataFrame, Y: pd.DataFrame): def _predict(self, transformed_X: pd.DataFrame): # transform to tensor X = torch.from_numpy(transformed_X.values).to(**tkwargs) + X = self.o2n.transform(X) with torch.no_grad(): preds = 
self.model.posterior(X=X, observation_noise=False).mean.cpu().detach().numpy() # type: ignore vars = self.model.posterior(X=X, observation_noise=False).variance.cpu().detach().numpy() # type: ignore diff --git a/tutorials/multi_task_gp_testing.py b/tutorials/multi_task_gp_testing.py index d78ac6a42..ce090b2cb 100644 --- a/tutorials/multi_task_gp_testing.py +++ b/tutorials/multi_task_gp_testing.py @@ -1,11 +1,7 @@ -import json - -from pydantic import parse_obj_as - import bofire.surrogates.api as surrogates from bofire.benchmarks.single import MultiFidelityHimmelblau +from bofire.data_models.enum import CategoricalEncodingEnum from bofire.data_models.surrogates.api import ( - AnySurrogate, MultiTaskGPSurrogate, ) @@ -21,33 +17,36 @@ # we setup the data model, here a Multi Task GP surrogate_data = MultiTaskGPSurrogate( - inputs=input_features, outputs=output_features, n_tasks=2 + inputs=input_features, + outputs=output_features, + input_preprocessing_specs={"task_id": CategoricalEncodingEnum.ONE_HOT}, ) # we generate the json spec -jspec = surrogate_data.json() +# jspec = surrogate_data.json() -surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec)) +# surrogate_data = parse_obj_as(MultiTaskGPSurrogate, json.loads(jspec)) +# surrogate_data = TypeAdapter(MultiTaskGPSurrogate).validate_python(json.loads(jspec)) surrogate = surrogates.map(surrogate_data) surrogate.fit(experiments=experiments) # dump it -dump = surrogate.dumps() +# dump = surrogate.dumps() # predict with it df_predictions = surrogate.predict(experiments) # transform to spec predictions = surrogate.to_predictions(predictions=df_predictions) -surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec)) -surrogate = surrogates.map(surrogate_data) -surrogate.loads(dump) +# surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec)) +# surrogate = surrogates.map(surrogate_data) +# surrogate.loads(dump) # predict with it -df_predictions2 = surrogate.predict(experiments) +# df_predictions2 = 
surrogate.predict(experiments) # transform to spec -predictions2 = surrogate.to_predictions(predictions=df_predictions2) +# predictions2 = surrogate.to_predictions(predictions=df_predictions2) -assert predictions.equals(predictions2) +# assert predictions.equals(predictions2) From 989cd3c2866d0d1134d81466800909e3490c087d Mon Sep 17 00:00:00 2001 From: Jose Pablo Folch Date: Tue, 5 Mar 2024 18:32:10 +0000 Subject: [PATCH 05/14] validation changes to MultiTaskGP --- bofire/benchmarks/api.py | 13 ++++- bofire/benchmarks/single.py | 2 +- bofire/data_models/surrogates/api.py | 8 ++- bofire/data_models/surrogates/botorch.py | 5 ++ .../data_models/surrogates/multi_task_gp.py | 50 ++++++++++++++++--- bofire/surrogates/multi_task_gp.py | 30 ++++------- 6 files changed, 76 insertions(+), 32 deletions(-) diff --git a/bofire/benchmarks/api.py b/bofire/benchmarks/api.py index 9d9dd1257..136a1bdac 100644 --- a/bofire/benchmarks/api.py +++ b/bofire/benchmarks/api.py @@ -4,7 +4,16 @@ from bofire.benchmarks.benchmark import Benchmark, GenericBenchmark from bofire.benchmarks.hyperopt import Hyperopt from bofire.benchmarks.multi import C2DTLZ2, DTLZ2, ZDT1, CrossCoupling, SnarBenchmark -from bofire.benchmarks.single import Ackley, Branin, Branin30, Hartmann, Himmelblau +from bofire.benchmarks.single import ( + Ackley, + Branin, + Branin30, + Hartmann, + Himmelblau, + MultiTaskHimmelblau, +) AnyMultiBenchmark = Union[C2DTLZ2, DTLZ2, ZDT1, CrossCoupling, SnarBenchmark] -AnySingleBenchmark = Union[Ackley, Branin, Branin30, Hartmann, Himmelblau] +AnySingleBenchmark = Union[ + Ackley, Branin, Branin30, Hartmann, Himmelblau, MultiTaskHimmelblau +] diff --git a/bofire/benchmarks/single.py b/bofire/benchmarks/single.py index 4a55484b1..edbe6cad5 100644 --- a/bofire/benchmarks/single.py +++ b/bofire/benchmarks/single.py @@ -382,7 +382,7 @@ def get_optima(self) -> pd.DataFrame: ) -class MultiFidelityHimmelblau(Benchmark): +class MultiTaskHimmelblau(Benchmark): """Himmelblau function for 
testing optimization algorithms Link to the definition: https://en.wikipedia.org/wiki/Himmelblau%27s_function """ diff --git a/bofire/data_models/surrogates/api.py b/bofire/data_models/surrogates/api.py index 0642a4d88..fb5bff2bb 100644 --- a/bofire/data_models/surrogates/api.py +++ b/bofire/data_models/surrogates/api.py @@ -18,6 +18,10 @@ MLPEnsemble, RegressionMLPEnsemble, ) +from bofire.data_models.surrogates.multi_task_gp import ( + MultiTaskGPHyperconfig, + MultiTaskGPSurrogate, +) from bofire.data_models.surrogates.polynomial import PolynomialSurrogate from bofire.data_models.surrogates.random_forest import RandomForestSurrogate from bofire.data_models.surrogates.scaler import ScalerEnum @@ -61,5 +65,5 @@ TanimotoGPSurrogate, MultiTaskGPSurrogate, ] - -AnyClassificationSurrogate = ClassificationMLPEnsemble \ No newline at end of file + +AnyClassificationSurrogate = ClassificationMLPEnsemble diff --git a/bofire/data_models/surrogates/botorch.py b/bofire/data_models/surrogates/botorch.py index f10917a58..a5c7039f7 100644 --- a/bofire/data_models/surrogates/botorch.py +++ b/bofire/data_models/surrogates/botorch.py @@ -15,6 +15,11 @@ class BotorchSurrogate(Surrogate): @field_validator("input_preprocessing_specs") @classmethod def validate_input_preprocessing_specs(cls, v, info): + # when validator for inputs fails, this validator is still checked and causes an Exception error instead of a ValueError + # fix this by checking if inputs is in info.data + if "inputs" not in info.data: + return + inputs = info.data["inputs"] categorical_keys = inputs.get_keys(CategoricalInput, exact=True) descriptor_keys = inputs.get_keys(CategoricalDescriptorInput, exact=True) diff --git a/bofire/data_models/surrogates/multi_task_gp.py b/bofire/data_models/surrogates/multi_task_gp.py index 4cdc23341..dabf324bd 100644 --- a/bofire/data_models/surrogates/multi_task_gp.py +++ b/bofire/data_models/surrogates/multi_task_gp.py @@ -1,11 +1,16 @@ -from typing import Literal, Optional +from 
typing import Literal, Optional, Type import pandas as pd -from pydantic import Field +from pydantic import Field, field_validator from bofire.data_models.domain.api import Inputs -from bofire.data_models.enum import RegressionMetricsEnum -from bofire.data_models.features.api import CategoricalInput +from bofire.data_models.enum import CategoricalEncodingEnum, RegressionMetricsEnum +from bofire.data_models.features.api import ( + AnyOutput, + CategoricalInput, + ContinuousOutput, + TaskInput, +) from bofire.data_models.kernels.api import ( AnyKernel, MaternKernel, @@ -62,8 +67,8 @@ def matern_15(ard: bool, lengthscale_prior: AnyPrior) -> MaternKernel: surrogate_data.noise_prior = noise_prior if hyperparameters.kernel == "rbf": - surrogate_data.kernel = ( - RBFKernel(ard=hyperparameters.ard, lengthscale_prior=lengthscale_prior), + surrogate_data.kernel = RBFKernel( + ard=hyperparameters.ard, lengthscale_prior=lengthscale_prior ) elif hyperparameters.kernel == "matern_2.5": surrogate_data.kernel = matern_25( @@ -91,3 +96,36 @@ class MultiTaskGPSurrogate(TrainableBotorchSurrogate): hyperconfig: Optional[MultiTaskGPHyperconfig] = Field( default_factory=lambda: MultiTaskGPHyperconfig() ) + + @classmethod + def is_output_implemented(cls, my_type: Type[AnyOutput]) -> bool: + """Abstract method to check output type for surrogate models + Args: + my_type: continuous or categorical output + Returns: + bool: True if the output type is valid for the surrogate chosen, False otherwise + """ + return isinstance(my_type, type(ContinuousOutput)) + + @field_validator("inputs", mode="before") + @classmethod + def validate_task_inputs(cls, v, info): + if len(v.get_keys(TaskInput)) != 1: + raise ValueError("Exactly one task input is required for multi-task GPs.") + return v + + @field_validator("input_preprocessing_specs") + @classmethod + def validate_encoding(cls, v, info): + # also validate that the task feature has ordinal encoding + if "inputs" not in info.data: + return + 
task_feature_id = info.data["inputs"].get_keys(TaskInput)[0] + if v.get(task_feature_id) is None: + v[task_feature_id] = CategoricalEncodingEnum.ORDINAL + elif v[task_feature_id] != CategoricalEncodingEnum.ORDINAL: + raise ValueError( + f"The task feature {task_feature_id} has to be encoded as ordinal." + ) + + return v diff --git a/bofire/surrogates/multi_task_gp.py b/bofire/surrogates/multi_task_gp.py index 5b7ab7ef8..31e700425 100644 --- a/bofire/surrogates/multi_task_gp.py +++ b/bofire/surrogates/multi_task_gp.py @@ -5,7 +5,6 @@ import pandas as pd import torch from botorch.fit import fit_gpytorch_mll -from botorch.models.transforms.input import OneHotToNumeric from botorch.models.transforms.outcome import Standardize from gpytorch.mlls import ExactMarginalLogLikelihood @@ -54,24 +53,8 @@ def _fit(self, X: pd.DataFrame, Y: pd.DataFrame): Y.values ).to(**tkwargs) - features2idx, _ = self.inputs._get_transform_info( - self.input_preprocessing_specs - ) - - task_features = { - features2idx[self.task_feature_key][0]: len( - features2idx[self.task_feature_key] - ) - } - - self.o2n = OneHotToNumeric( - dim=tX.shape[1], - categorical_features=task_features, - transform_on_train=False, - ) - self.model = botorch.models.MultiTaskGP( # type: ignore - train_X=self.o2n.transform(tX), + train_X=tX, train_Y=tY, task_feature=X.columns.get_loc( self.task_feature_key @@ -80,17 +63,19 @@ def _fit(self, X: pd.DataFrame, Y: pd.DataFrame): self.kernel, batch_shape=torch.Size(), active_dims=list( - range(self.o2n.transform(tX).shape[1] - 1) + range(tX.shape[1] - 1) ), # kernel is for input space so we subtract one for the fidelity index ard_num_dims=1, # this keyword is ingored ), - task_covar_prior=priors.map(self.lkj_prior), outcome_transform=Standardize(m=tY.shape[-1]) if self.output_scaler == ScalerEnum.STANDARDIZE else None, input_transform=scaler, ) + self.model.task_covar_module.register_prior( + "IndexKernelPrior", priors.map(self.lkj_prior), _index_kernel_prior_closure + ) 
self.model.likelihood.noise_covar.noise_prior = priors.map(self.noise_prior) # type: ignore mll = ExactMarginalLogLikelihood(self.model.likelihood, self.model) @@ -99,10 +84,13 @@ def _fit(self, X: pd.DataFrame, Y: pd.DataFrame): def _predict(self, transformed_X: pd.DataFrame): # transform to tensor X = torch.from_numpy(transformed_X.values).to(**tkwargs) - X = self.o2n.transform(X) with torch.no_grad(): preds = self.model.posterior(X=X, observation_noise=False).mean.cpu().detach().numpy() # type: ignore vars = self.model.posterior(X=X, observation_noise=False).variance.cpu().detach().numpy() # type: ignore # add the observation noise to the stds stds = np.sqrt(vars + self.model.likelihood.noise.cpu().detach().numpy()) return preds, stds + + +def _index_kernel_prior_closure(m): + return m._eval_covar_matrix() From ffea47cd4a91301289f3d05c643c2883b756799d Mon Sep 17 00:00:00 2001 From: Jose Pablo Folch Date: Tue, 5 Mar 2024 18:34:01 +0000 Subject: [PATCH 06/14] tests for multitask gp --- tests/bofire/surrogates/test_gps.py | 141 +++++++++++++++++++++++++++- tutorials/multi_task_gp_testing.py | 52 ---------- 2 files changed, 139 insertions(+), 54 deletions(-) delete mode 100644 tutorials/multi_task_gp_testing.py diff --git a/tests/bofire/surrogates/test_gps.py b/tests/bofire/surrogates/test_gps.py index 5642d3b14..494e07357 100644 --- a/tests/bofire/surrogates/test_gps.py +++ b/tests/bofire/surrogates/test_gps.py @@ -3,7 +3,7 @@ import pandas as pd import pytest import torch -from botorch.models import MixedSingleTaskGP, SingleTaskGP +from botorch.models import MixedSingleTaskGP, MultiTaskGP, SingleTaskGP from botorch.models.transforms.input import ( ChainedInputTransform, InputStandardize, @@ -15,7 +15,7 @@ from pydantic import ValidationError import bofire.surrogates.api as surrogates -from bofire.benchmarks.api import Himmelblau +from bofire.benchmarks.api import Himmelblau, MultiTaskHimmelblau from bofire.data_models.domain.api import Inputs, Outputs from 
bofire.data_models.enum import CategoricalEncodingEnum, RegressionMetricsEnum from bofire.data_models.features.api import ( @@ -23,6 +23,7 @@ ContinuousInput, ContinuousOutput, MolecularInput, + TaskInput, ) from bofire.data_models.kernels.api import ( HammondDistanceKernel, @@ -41,6 +42,7 @@ ) from bofire.data_models.surrogates.api import ( MixedSingleTaskGPSurrogate, + MultiTaskGPSurrogate, ScalerEnum, SingleTaskGPHyperconfig, SingleTaskGPSurrogate, @@ -281,6 +283,43 @@ def test_SingleTaskGPHyperconfig(): ) +def test_MultiTaskGPHyperconfig(): + # we test here also the basic trainable + benchmark = MultiTaskHimmelblau() + surrogate_data_no_hy = MultiTaskGPSurrogate( + inputs=benchmark.domain.inputs, + outputs=benchmark.domain.outputs, + hyperconfig=None, + ) + + with pytest.raises(ValueError, match="No hyperconfig available."): + surrogate_data_no_hy.update_hyperparameters( + benchmark.domain.inputs.sample(1).loc[0] + ) + # test that correct stuff is written + surrogate_data = MultiTaskGPSurrogate( + inputs=benchmark.domain.inputs, outputs=benchmark.domain.outputs + ) + candidate = surrogate_data.hyperconfig.inputs.sample(1).loc[0] + surrogate_data.update_hyperparameters(candidate) + + assert surrogate_data.kernel.ard == (candidate["ard"] == "True") + if candidate.kernel == "matern_1.5": + assert isinstance(surrogate_data.kernel, MaternKernel) + assert surrogate_data.kernel.nu == 1.5 + elif candidate.kernel == "matern_2.5": + assert isinstance(surrogate_data.kernel, MaternKernel) + assert surrogate_data.kernel.nu == 2.5 + else: + assert isinstance(surrogate_data.kernel, RBFKernel) + if candidate.prior == "mbo": + assert surrogate_data.noise_prior == MBO_NOISE_PRIOR() + assert surrogate_data.kernel.lengthscale_prior == MBO_LENGTHCALE_PRIOR() + else: + assert surrogate_data.noise_prior == BOTORCH_NOISE_PRIOR() + assert surrogate_data.kernel.lengthscale_prior == BOTORCH_LENGTHCALE_PRIOR() + + def test_MixedSingleTaskGPHyperconfig(): inputs = Inputs( features=[ @@ 
-321,6 +360,45 @@ def test_MixedSingleTaskGPHyperconfig(): ) +def test_MultiTask_input_preprocessing(): + # test that if wrong encoding is used, there is an error + inputs = Inputs( + features=[ContinuousInput(key="x", bounds=(-1, 1))] + + [TaskInput(key="task_id", categories=["1", "2"])] + ) + outputs = Outputs(features=[ContinuousOutput(key="y")]) + with pytest.raises(ValueError): + data_model = MultiTaskGPSurrogate( + inputs=inputs, + outputs=outputs, + input_preprocessing_specs={"task_id": CategoricalEncodingEnum.ONE_HOT}, + ) + + # test that if there is no task input, there is an error + inputs = Inputs( + features=[ContinuousInput(key="x", bounds=(-1, 1))] + + [CategoricalInput(key="task_id", categories=["1", "2"])] + ) + outputs = Outputs(features=[ContinuousOutput(key="y")]) + with pytest.raises(ValueError): + data_model = MultiTaskGPSurrogate( + inputs=inputs, + outputs=outputs, + input_preprocessing_specs={"task_id": CategoricalEncodingEnum.ORDINAL}, + ) + + # test that if no input_preprocessing_specs are provided, the ordinal encoding is used + inputs = Inputs( + features=[ContinuousInput(key="x", bounds=(-1, 1))] + + [TaskInput(key="task_id", categories=["1", "2"])] + ) + outputs = Outputs(features=[ContinuousOutput(key="y")]) + data_model = MultiTaskGPSurrogate(inputs=inputs, outputs=outputs) + assert data_model.input_preprocessing_specs == { + "task_id": CategoricalEncodingEnum.ORDINAL + } + + def test_MixedSingleTaskGPModel_invalid_preprocessing(): inputs = Inputs( features=[ @@ -426,6 +504,65 @@ def test_MixedSingleTaskGPModel(kernel, scaler, output_scaler): assert_frame_equal(preds, preds2) +@pytest.mark.parametrize( + "kernel, scaler, output_scaler", + [ + (RBFKernel(ard=True), ScalerEnum.NORMALIZE, ScalerEnum.STANDARDIZE), + (RBFKernel(ard=False), ScalerEnum.STANDARDIZE, ScalerEnum.STANDARDIZE), + (RBFKernel(ard=False), ScalerEnum.IDENTITY, ScalerEnum.IDENTITY), + ], +) +def test_MultiTaskGPModel(kernel, scaler, output_scaler): + benchmark = 
MultiTaskHimmelblau() + inputs = benchmark.domain.inputs + outputs = benchmark.domain.outputs + experiments = benchmark.f(inputs.sample(10), return_complete=True) + + model = MultiTaskGPSurrogate( + inputs=inputs, + outputs=outputs, + scaler=scaler, + output_scaler=output_scaler, + kernel=kernel, + ) + + model = surrogates.map(model) + with pytest.raises(ValueError): + model.dumps() + model.fit(experiments) + # dump the model + dump = model.dumps() + # make predictions + samples = inputs.sample(5) + preds = model.predict(samples) + assert preds.shape == (5, 2) + # check that model is composed correctly + assert isinstance(model.model, MultiTaskGP) + if output_scaler == ScalerEnum.STANDARDIZE: + assert isinstance(model.model.outcome_transform, Standardize) + elif output_scaler == ScalerEnum.IDENTITY: + assert not hasattr(model.model, "outcome_transform") + if scaler == ScalerEnum.NORMALIZE: + assert isinstance(model.model.input_transform, Normalize) + elif scaler == ScalerEnum.STANDARDIZE: + assert isinstance(model.model.input_transform, InputStandardize) + else: + assert not hasattr(model.model, "input_transform") + assert model.is_compatibilized is False + # reload the model from dump and check for equality in predictions + model2 = MultiTaskGPSurrogate( + inputs=inputs, + outputs=outputs, + kernel=kernel, + scaler=scaler, + output_scaler=output_scaler, + ) + model2 = surrogates.map(model2) + model2.loads(dump) + preds2 = model2.predict(samples) + assert_frame_equal(preds, preds2) + + @pytest.mark.parametrize( "kernel, scaler, output_scaler", [ diff --git a/tutorials/multi_task_gp_testing.py b/tutorials/multi_task_gp_testing.py deleted file mode 100644 index ce090b2cb..000000000 --- a/tutorials/multi_task_gp_testing.py +++ /dev/null @@ -1,52 +0,0 @@ -import bofire.surrogates.api as surrogates -from bofire.benchmarks.single import MultiFidelityHimmelblau -from bofire.data_models.enum import CategoricalEncodingEnum -from bofire.data_models.surrogates.api import ( - 
MultiTaskGPSurrogate, -) - -benchmark = MultiFidelityHimmelblau() -samples = benchmark.domain.inputs.sample(n=50) -experiments = benchmark.f(samples, return_complete=True) - -# make fid the columns in order [fid, x_1, x_2, y, valid_y] -experiments = experiments[["task_id", "x_1", "x_2", "y", "valid_y"]] - -input_features = benchmark.domain.inputs -output_features = benchmark.domain.outputs - -# we setup the data model, here a Multi Task GP -surrogate_data = MultiTaskGPSurrogate( - inputs=input_features, - outputs=output_features, - input_preprocessing_specs={"task_id": CategoricalEncodingEnum.ONE_HOT}, -) - -# we generate the json spec -# jspec = surrogate_data.json() - -# surrogate_data = parse_obj_as(MultiTaskGPSurrogate, json.loads(jspec)) -# surrogate_data = TypeAdapter(MultiTaskGPSurrogate).validate_python(json.loads(jspec)) - -surrogate = surrogates.map(surrogate_data) - -surrogate.fit(experiments=experiments) - -# dump it -# dump = surrogate.dumps() - -# predict with it -df_predictions = surrogate.predict(experiments) -# transform to spec -predictions = surrogate.to_predictions(predictions=df_predictions) - -# surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec)) -# surrogate = surrogates.map(surrogate_data) -# surrogate.loads(dump) - -# predict with it -# df_predictions2 = surrogate.predict(experiments) -# transform to spec -# predictions2 = surrogate.to_predictions(predictions=df_predictions2) - -# assert predictions.equals(predictions2) From cff6817c2f89e6028812963ad51edee80e2a60b7 Mon Sep 17 00:00:00 2001 From: Jose Pablo Folch Date: Tue, 12 Mar 2024 18:02:11 +0000 Subject: [PATCH 07/14] added input specs, limited lkj prior usage --- bofire/data_models/priors/api.py | 2 +- bofire/data_models/priors/lkj.py | 2 +- .../data_models/surrogates/multi_task_gp.py | 21 +++++++-- bofire/surrogates/multi_task_gp.py | 21 ++++++--- tests/bofire/data_models/specs/priors.py | 44 +++++++++++++++++++ tests/bofire/data_models/specs/surrogates.py | 35 
+++++++++++++++ tests/bofire/surrogates/test_gps.py | 19 +++++--- 7 files changed, 127 insertions(+), 17 deletions(-) diff --git a/bofire/data_models/priors/api.py b/bofire/data_models/priors/api.py index 2697c856e..468f7c68a 100644 --- a/bofire/data_models/priors/api.py +++ b/bofire/data_models/priors/api.py @@ -29,5 +29,5 @@ # prior for multitask kernel LKJ_PRIOR = partial( - LKJPrior, eta=2.0, sd_prior=GammaPrior(concentration=2.0, rate=0.15) + LKJPrior, shape=2.0, sd_prior=GammaPrior(concentration=2.0, rate=0.15) ) diff --git a/bofire/data_models/priors/lkj.py b/bofire/data_models/priors/lkj.py index 4e0988c90..f5ba206e5 100644 --- a/bofire/data_models/priors/lkj.py +++ b/bofire/data_models/priors/lkj.py @@ -16,6 +16,6 @@ class LKJPrior(Prior): """ type: Literal["LKJPrior"] = "LKJPrior" - eta: PositiveFloat + shape: PositiveFloat sd_prior: GammaPrior n_tasks: int = 1 diff --git a/bofire/data_models/surrogates/multi_task_gp.py b/bofire/data_models/surrogates/multi_task_gp.py index dabf324bd..85661eb51 100644 --- a/bofire/data_models/surrogates/multi_task_gp.py +++ b/bofire/data_models/surrogates/multi_task_gp.py @@ -19,7 +19,6 @@ from bofire.data_models.priors.api import ( BOTORCH_LENGTHCALE_PRIOR, BOTORCH_NOISE_PRIOR, - LKJ_PRIOR, MBO_LENGTHCALE_PRIOR, MBO_NOISE_PRIOR, AnyPrior, @@ -92,7 +91,7 @@ class MultiTaskGPSurrogate(TrainableBotorchSurrogate): ) ) noise_prior: AnyPrior = Field(default_factory=lambda: BOTORCH_NOISE_PRIOR()) - lkj_prior: LKJPrior = Field(default_factory=lambda: LKJ_PRIOR()) + task_prior: Optional[LKJPrior] = Field(default_factory=lambda: None) hyperconfig: Optional[MultiTaskGPHyperconfig] = Field( default_factory=lambda: MultiTaskGPHyperconfig() ) @@ -110,6 +109,18 @@ def is_output_implemented(cls, my_type: Type[AnyOutput]) -> bool: @field_validator("inputs", mode="before") @classmethod def validate_task_inputs(cls, v, info): + if isinstance(v, dict): + if "inputs" in v: + check_types = [ + 1 if feat["type"] == "TaskInput" else 0 for feat 
in v["features"] + ] + if sum(check_types) != 1: + raise ValueError( + "Exactly one task input is required for multi-task GPs." + ) + + return v + if len(v.get_keys(TaskInput)) != 1: raise ValueError("Exactly one task input is required for multi-task GPs.") return v @@ -119,7 +130,11 @@ def validate_task_inputs(cls, v, info): def validate_encoding(cls, v, info): # also validate that the task feature has ordinal encoding if "inputs" not in info.data: - return + return v + + if len(info.data["inputs"].get_keys(TaskInput)) == 0: + return v + task_feature_id = info.data["inputs"].get_keys(TaskInput)[0] if v.get(task_feature_id) is None: v[task_feature_id] = CategoricalEncodingEnum.ORDINAL diff --git a/bofire/surrogates/multi_task_gp.py b/bofire/surrogates/multi_task_gp.py index 31e700425..537f9cd4b 100644 --- a/bofire/surrogates/multi_task_gp.py +++ b/bofire/surrogates/multi_task_gp.py @@ -1,3 +1,4 @@ +import warnings from typing import Dict, Optional import botorch @@ -12,6 +13,7 @@ import bofire.priors.api as priors from bofire.data_models.enum import OutputFilteringEnum from bofire.data_models.features.api import TaskInput +from bofire.data_models.priors.api import LKJPrior # from bofire.data_models.molfeatures.api import MolFeatures from bofire.data_models.surrogates.api import MultiTaskGPSurrogate as DataModel @@ -33,9 +35,10 @@ def __init__( self.scaler = data_model.scaler self.output_scaler = data_model.output_scaler self.noise_prior = data_model.noise_prior - self.lkj_prior = data_model.lkj_prior - # set the number of tasks in the prior - self.lkj_prior.n_tasks = self.n_tasks + self.task_prior = data_model.task_prior + if isinstance(data_model.task_prior, LKJPrior): + # set the number of tasks in the prior + self.task_prior.n_tasks = self.n_tasks # obtain the name of the task feature self.task_feature_key = data_model.inputs.get_keys(TaskInput)[0] @@ -73,9 +76,15 @@ def _fit(self, X: pd.DataFrame, Y: pd.DataFrame): input_transform=scaler, ) - 
self.model.task_covar_module.register_prior( - "IndexKernelPrior", priors.map(self.lkj_prior), _index_kernel_prior_closure - ) + if isinstance(self.task_prior, LKJPrior): + warnings.warn( + "The LKJ prior has issues when sampling from the prior, prior has been defaulted to None.", + UserWarning, + ) + # once the issue is fixed, the following line should be uncommented + # self.model.task_covar_module.register_prior( + # "IndexKernelPrior", priors.map(self.lkj_prior), _index_kernel_prior_closure + # ) self.model.likelihood.noise_covar.noise_prior = priors.map(self.noise_prior) # type: ignore mll = ExactMarginalLogLikelihood(self.model.likelihood, self.model) diff --git a/tests/bofire/data_models/specs/priors.py b/tests/bofire/data_models/specs/priors.py index f6e3d2f95..577208012 100644 --- a/tests/bofire/data_models/specs/priors.py +++ b/tests/bofire/data_models/specs/priors.py @@ -43,3 +43,47 @@ }, error=ValidationError, ) + +specs.add_valid( + priors.LKJPrior, + lambda: { + "n_tasks": random.randint(1, 10), + "shape": random.random(), + "sd_prior": { + "type": "GammaPrior", + "concentration": random.random(), + "rate": random.random(), + }, + }, +) + +for shape in [-1, 0]: + specs.add_invalid( + priors.LKJPrior, + lambda: { + "n_tasks": random.randint(1, 10), + "shape": shape, # noqa: B023 + "sd_prior": { + "type": "GammaPrior", + "concentration": random.random(), + "rate": random.random(), + }, + }, + error=ValidationError, + ) + +for concentration in [-1, 0]: + for rate in [-1, 0]: + specs.add_invalid( + priors.LKJPrior, + lambda: { + "n_tasks": random.randint(1, 10), + "shape": random.random(), + "sd_prior": { + "type": "GammaPrior", + "concentration": concentration, # noqa: B023 + "rate": rate, # noqa: B023 + }, + }, + error=ValidationError, + ) diff --git a/tests/bofire/data_models/specs/surrogates.py b/tests/bofire/data_models/specs/surrogates.py index 6c8b8a9fa..4241e8f52 100644 --- a/tests/bofire/data_models/specs/surrogates.py +++ 
b/tests/bofire/data_models/specs/surrogates.py @@ -9,6 +9,7 @@ ContinuousInput, ContinuousOutput, MolecularInput, + TaskInput, ) from bofire.data_models.kernels.api import ( HammondDistanceKernel, @@ -27,6 +28,7 @@ ScalerEnum, SumAggregation, ) +from bofire.data_models.surrogates.multi_task_gp import MultiTaskGPHyperconfig from bofire.data_models.surrogates.single_task_gp import SingleTaskGPHyperconfig from tests.bofire.data_models.specs.features import specs as features from tests.bofire.data_models.specs.specs import Specs @@ -394,3 +396,36 @@ "hyperconfig": None, }, ) + +specs.add_valid( + models.MultiTaskGPSurrogate, + lambda: { + "inputs": Inputs( + features=[ + features.valid(ContinuousInput).obj(), + ] + + [TaskInput(key="task", categories=["a", "b", "c"])] + ).model_dump(), + "outputs": Outputs( + features=[ + features.valid(ContinuousOutput).obj(), + ] + ).model_dump(), + "kernel": ScaleKernel( + base_kernel=MaternKernel( + ard=True, nu=2.5, lengthscale_prior=BOTORCH_LENGTHCALE_PRIOR() + ), + outputscale_prior=BOTORCH_SCALE_PRIOR(), + ).model_dump(), + "aggregations": None, + "scaler": ScalerEnum.NORMALIZE, + "output_scaler": ScalerEnum.STANDARDIZE, + "noise_prior": BOTORCH_NOISE_PRIOR().model_dump(), + "task_prior": None, + "input_preprocessing_specs": { + "task": CategoricalEncodingEnum.ORDINAL, + }, + "dump": None, + "hyperconfig": MultiTaskGPHyperconfig().model_dump(), + }, +) diff --git a/tests/bofire/surrogates/test_gps.py b/tests/bofire/surrogates/test_gps.py index 494e07357..4aa6c5970 100644 --- a/tests/bofire/surrogates/test_gps.py +++ b/tests/bofire/surrogates/test_gps.py @@ -36,6 +36,7 @@ BOTORCH_LENGTHCALE_PRIOR, BOTORCH_NOISE_PRIOR, BOTORCH_SCALE_PRIOR, + LKJ_PRIOR, MBO_LENGTHCALE_PRIOR, MBO_NOISE_PRIOR, MBO_OUTPUTSCALE_PRIOR, @@ -505,14 +506,14 @@ def test_MixedSingleTaskGPModel(kernel, scaler, output_scaler): @pytest.mark.parametrize( - "kernel, scaler, output_scaler", + "kernel, scaler, output_scaler, task_prior", [ - (RBFKernel(ard=True), 
ScalerEnum.NORMALIZE, ScalerEnum.STANDARDIZE), - (RBFKernel(ard=False), ScalerEnum.STANDARDIZE, ScalerEnum.STANDARDIZE), - (RBFKernel(ard=False), ScalerEnum.IDENTITY, ScalerEnum.IDENTITY), + (RBFKernel(ard=True), ScalerEnum.NORMALIZE, ScalerEnum.STANDARDIZE, None), + (RBFKernel(ard=False), ScalerEnum.STANDARDIZE, ScalerEnum.STANDARDIZE, None), + (RBFKernel(ard=False), ScalerEnum.IDENTITY, ScalerEnum.IDENTITY, LKJ_PRIOR()), ], ) -def test_MultiTaskGPModel(kernel, scaler, output_scaler): +def test_MultiTaskGPModel(kernel, scaler, output_scaler, task_prior): benchmark = MultiTaskHimmelblau() inputs = benchmark.domain.inputs outputs = benchmark.domain.outputs @@ -524,12 +525,18 @@ def test_MultiTaskGPModel(kernel, scaler, output_scaler): scaler=scaler, output_scaler=output_scaler, kernel=kernel, + task_prior=task_prior, ) model = surrogates.map(model) with pytest.raises(ValueError): model.dumps() - model.fit(experiments) + # if task_prior is not None, a warning should be raised + if task_prior is not None: + with pytest.warns(UserWarning): + model.fit(experiments) + else: + model.fit(experiments) # dump the model dump = model.dumps() # make predictions From 3df737e702626b5da43f53aad17e579588264cc1 Mon Sep 17 00:00:00 2001 From: Jose Pablo Folch Date: Tue, 2 Apr 2024 21:01:34 -0600 Subject: [PATCH 08/14] added testing for multitask himbbelblau --- bofire/benchmarks/single.py | 20 +++++++++----------- tests/bofire/benchmarks/test_single.py | 3 +++ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/bofire/benchmarks/single.py b/bofire/benchmarks/single.py index edbe6cad5..00ba5819b 100644 --- a/bofire/benchmarks/single.py +++ b/bofire/benchmarks/single.py @@ -443,19 +443,17 @@ def get_optima(self) -> pd.DataFrame: """Returns positions of optima of the benchmark function. Returns: - pd.DataFrame: x values of optima. Colums are x_1 and x_2 + pd.DataFrame: x values of optima. 
Columns are x_1, x_2, task_id
         """
-        x = np.array(
-            [
-                [3.0, 2.0],
-                [-2.805118, 3.131312],
-                [-3.779310, -3.283186],
-                [3.584428, -1.848126],
-            ]
-        )
-        y = np.zeros(4)
+        out = [
+            [3.0, 2.0, "task_1", 0],
+            [-2.805118, 3.131312, "task_1", 0],
+            [-3.779310, -3.283186, "task_1", 0],
+            [3.584428, -1.848126, "task_1", 0],
+        ]
+
         return pd.DataFrame(
-            np.c_[x, y],
+            out,
             columns=self.domain.inputs.get_keys() + self.domain.outputs.get_keys(),
         )

diff --git a/tests/bofire/benchmarks/test_single.py b/tests/bofire/benchmarks/test_single.py
index 8ce3ad88f..6a0ca58a6 100644
--- a/tests/bofire/benchmarks/test_single.py
+++ b/tests/bofire/benchmarks/test_single.py
@@ -8,6 +8,7 @@
     DiscreteHimmelblau,
     Hartmann,
     Himmelblau,
+    MultiTaskHimmelblau,
     _CategoricalDiscreteHimmelblau,
 )

@@ -41,6 +42,8 @@ def test_hartmann():
         (Branin, False, {}),
         (Branin30, True, {}),
         (Branin30, False, {}),
+        (MultiTaskHimmelblau, False, {}),
+        (MultiTaskHimmelblau, True, {}),
         # TO DO: Implement feature that tests Ackley for categorical and descriptive inputs. 
# (Ackley, {"categorical": True}), # (Ackley, {"descriptor": True}), From 0378dd7e6d1694097caf9e63a3d36c7dc6de0f07 Mon Sep 17 00:00:00 2001 From: Jose Pablo Folch Date: Tue, 2 Apr 2024 21:02:36 -0600 Subject: [PATCH 09/14] removed old tasks file, added LKJ test --- bofire/data_models/features/tasks.py | 54 ---------------------------- bofire/priors/mapper.py | 2 +- 2 files changed, 1 insertion(+), 55 deletions(-) delete mode 100644 bofire/data_models/features/tasks.py diff --git a/bofire/data_models/features/tasks.py b/bofire/data_models/features/tasks.py deleted file mode 100644 index 0574a7bf6..000000000 --- a/bofire/data_models/features/tasks.py +++ /dev/null @@ -1,54 +0,0 @@ -from typing import List, Literal, Optional - -import numpy as np -from pydantic import Field, field_validator, model_validator -from typing_extensions import Annotated - -from bofire.data_models.features.api import CategoricalInput, DiscreteInput - - -class TaskInputDiscrete(DiscreteInput): - type: Literal["TaskInput"] = "TaskInput" - n_tasks: int - fidelities: List[int] - - @field_validator("fidelities") - def validate_fidelities(cls, fidelities: List[int], values): - # if fidelities is None: - # return [0 for _ in range(self.n_tasks)] - if len(fidelities) != values["n_tasks"]: - raise ValueError( - "Length of fidelity lists must be equal to the number of tasks" - ) - if list(set(fidelities)) != list(range(np.max(fidelities) + 1)): - raise ValueError( - "Fidelities must be a list containing integers, starting from 0 and increasing by 1" - ) - return fidelities - - @model_validator(mode="before") - def validate_values(cls, values): - if "n_tasks" in values: - values["values"] = list(range(values["n_tasks"])) - return values - - -class TaskInput(CategoricalInput): - type: Literal["TaskInputCategorical"] = "TaskInput" - fidelities: Annotated[Optional[List[int]], Field(validate_default=True)] = None - - @field_validator("fidelities") - def validate_fidelities(cls, fidelities: List[int], 
values): - if "categories" in values.data: - n_tasks = len(values.data["categories"]) - if fidelities is None: - return [0 for _ in range(n_tasks)] - if len(fidelities) != n_tasks: - raise ValueError( - "Length of fidelity lists must be equal to the number of tasks" - ) - if list(set(fidelities)) != list(range(np.max(fidelities) + 1)): - raise ValueError( - "Fidelities must be a list containing integers, starting from 0 and increasing by 1" - ) - return fidelities diff --git a/bofire/priors/mapper.py b/bofire/priors/mapper.py index ae55cca9d..1f782df22 100644 --- a/bofire/priors/mapper.py +++ b/bofire/priors/mapper.py @@ -15,7 +15,7 @@ def map_GammaPrior(data_model: data_models.GammaPrior) -> gpytorch.priors.GammaP def map_LKJPrior(data_model: data_models.LKJPrior) -> gpytorch.priors.LKJPrior: return gpytorch.priors.LKJCovariancePrior( - n=data_model.n_tasks, eta=data_model.eta, sd_prior=map(data_model.sd_prior) + n=data_model.n_tasks, eta=data_model.shape, sd_prior=map(data_model.sd_prior) ) From 6823a08495083b6417670ec49fffda9e6797955a Mon Sep 17 00:00:00 2001 From: Jose Pablo Folch Date: Tue, 2 Apr 2024 21:03:50 -0600 Subject: [PATCH 10/14] reformatted multitask testing and functionality --- .../data_models/surrogates/multi_task_gp.py | 19 +- bofire/surrogates/multi_task_gp.py | 2 +- tests/bofire/data_models/specs/surrogates.py | 69 +++++++ tests/bofire/priors/test_mapper.py | 17 +- tests/bofire/surrogates/test_gps.py | 148 +-------------- tests/bofire/surrogates/test_multitask_gps.py | 169 ++++++++++++++++++ 6 files changed, 261 insertions(+), 163 deletions(-) create mode 100644 tests/bofire/surrogates/test_multitask_gps.py diff --git a/bofire/data_models/surrogates/multi_task_gp.py b/bofire/data_models/surrogates/multi_task_gp.py index 85661eb51..66dbf83c1 100644 --- a/bofire/data_models/surrogates/multi_task_gp.py +++ b/bofire/data_models/surrogates/multi_task_gp.py @@ -106,24 +106,13 @@ def is_output_implemented(cls, my_type: Type[AnyOutput]) -> bool: """ 
return isinstance(my_type, type(ContinuousOutput)) - @field_validator("inputs", mode="before") + @field_validator("inputs") @classmethod - def validate_task_inputs(cls, v, info): - if isinstance(v, dict): - if "inputs" in v: - check_types = [ - 1 if feat["type"] == "TaskInput" else 0 for feat in v["features"] - ] - if sum(check_types) != 1: - raise ValueError( - "Exactly one task input is required for multi-task GPs." - ) + def validate_task_inputs(cls, inputs: Inputs): - return v - - if len(v.get_keys(TaskInput)) != 1: + if len(inputs.get_keys(TaskInput)) != 1: raise ValueError("Exactly one task input is required for multi-task GPs.") - return v + return inputs @field_validator("input_preprocessing_specs") @classmethod diff --git a/bofire/surrogates/multi_task_gp.py b/bofire/surrogates/multi_task_gp.py index 537f9cd4b..8cd73a751 100644 --- a/bofire/surrogates/multi_task_gp.py +++ b/bofire/surrogates/multi_task_gp.py @@ -59,7 +59,7 @@ def _fit(self, X: pd.DataFrame, Y: pd.DataFrame): self.model = botorch.models.MultiTaskGP( # type: ignore train_X=tX, train_Y=tY, - task_feature=X.columns.get_loc( + task_feature=transformed_X.columns.get_loc( self.task_feature_key ), # obtain the fidelity index covar_module=kernels.map( diff --git a/tests/bofire/data_models/specs/surrogates.py b/tests/bofire/data_models/specs/surrogates.py index 4241e8f52..d44abf27b 100644 --- a/tests/bofire/data_models/specs/surrogates.py +++ b/tests/bofire/data_models/specs/surrogates.py @@ -429,3 +429,72 @@ "hyperconfig": MultiTaskGPHyperconfig().model_dump(), }, ) + +# if wrong encoding (one-hot) is used, there should be a validation error +specs.add_invalid( + models.MultiTaskGPSurrogate, + lambda: { + "inputs": Inputs( + features=[ + features.valid(ContinuousInput).obj(), + ] + + [TaskInput(key="task", categories=["a", "b", "c"])] + ).model_dump(), + "outputs": Outputs( + features=[ + features.valid(ContinuousOutput).obj(), + ] + ).model_dump(), + "kernel": ScaleKernel( + 
base_kernel=MaternKernel( + ard=True, nu=2.5, lengthscale_prior=BOTORCH_LENGTHCALE_PRIOR() + ), + outputscale_prior=BOTORCH_SCALE_PRIOR(), + ).model_dump(), + "aggregations": None, + "scaler": ScalerEnum.NORMALIZE, + "output_scaler": ScalerEnum.STANDARDIZE, + "noise_prior": BOTORCH_NOISE_PRIOR().model_dump(), + "task_prior": None, + "input_preprocessing_specs": { + "task": CategoricalEncodingEnum.ONE_HOT, + }, + "dump": None, + "hyperconfig": MultiTaskGPHyperconfig().model_dump(), + }, + error=ValueError, +) + +# if there is no task input, there should be a validation error +specs.add_invalid( + models.MultiTaskGPSurrogate, + lambda: { + "inputs": Inputs( + features=[ + features.valid(ContinuousInput).obj(), + ] + ).model_dump(), + "outputs": Outputs( + features=[ + features.valid(ContinuousOutput).obj(), + ] + ).model_dump(), + "kernel": ScaleKernel( + base_kernel=MaternKernel( + ard=True, nu=2.5, lengthscale_prior=BOTORCH_LENGTHCALE_PRIOR() + ), + outputscale_prior=BOTORCH_SCALE_PRIOR(), + ).model_dump(), + "aggregations": None, + "scaler": ScalerEnum.NORMALIZE, + "output_scaler": ScalerEnum.STANDARDIZE, + "noise_prior": BOTORCH_NOISE_PRIOR().model_dump(), + "task_prior": None, + "input_preprocessing_specs": { + "task": CategoricalEncodingEnum.ORDINAL, + }, + "dump": None, + "hyperconfig": MultiTaskGPHyperconfig().model_dump(), + }, + error=ValueError, +) diff --git a/tests/bofire/priors/test_mapper.py b/tests/bofire/priors/test_mapper.py index 22fc9e47a..5a59a4e99 100644 --- a/tests/bofire/priors/test_mapper.py +++ b/tests/bofire/priors/test_mapper.py @@ -2,7 +2,7 @@ import pytest import bofire.priors.api as priors -from bofire.data_models.priors.api import GammaPrior, NormalPrior +from bofire.data_models.priors.api import GammaPrior, LKJPrior, NormalPrior @pytest.mark.parametrize( @@ -19,3 +19,18 @@ def test_map(prior, expected_prior): if key == "type": continue assert value == getattr(gprior, key) + + +def test_lkj_map(): + prior = LKJPrior( + n_tasks=3, 
shape=0.4, sd_prior=GammaPrior(concentration=2.0, rate=0.2) + ) + expected_prior = gpytorch.priors.LKJPrior + + gprior = priors.map(prior) + assert isinstance(gprior, expected_prior) + assert prior.n_tasks == gprior.correlation_prior.n + assert prior.shape == gprior.correlation_prior.concentration + assert isinstance(gprior.sd_prior, gpytorch.priors.GammaPrior) + assert prior.sd_prior.concentration == gprior.sd_prior.concentration + assert prior.sd_prior.rate == gprior.sd_prior.rate diff --git a/tests/bofire/surrogates/test_gps.py b/tests/bofire/surrogates/test_gps.py index 4aa6c5970..5642d3b14 100644 --- a/tests/bofire/surrogates/test_gps.py +++ b/tests/bofire/surrogates/test_gps.py @@ -3,7 +3,7 @@ import pandas as pd import pytest import torch -from botorch.models import MixedSingleTaskGP, MultiTaskGP, SingleTaskGP +from botorch.models import MixedSingleTaskGP, SingleTaskGP from botorch.models.transforms.input import ( ChainedInputTransform, InputStandardize, @@ -15,7 +15,7 @@ from pydantic import ValidationError import bofire.surrogates.api as surrogates -from bofire.benchmarks.api import Himmelblau, MultiTaskHimmelblau +from bofire.benchmarks.api import Himmelblau from bofire.data_models.domain.api import Inputs, Outputs from bofire.data_models.enum import CategoricalEncodingEnum, RegressionMetricsEnum from bofire.data_models.features.api import ( @@ -23,7 +23,6 @@ ContinuousInput, ContinuousOutput, MolecularInput, - TaskInput, ) from bofire.data_models.kernels.api import ( HammondDistanceKernel, @@ -36,14 +35,12 @@ BOTORCH_LENGTHCALE_PRIOR, BOTORCH_NOISE_PRIOR, BOTORCH_SCALE_PRIOR, - LKJ_PRIOR, MBO_LENGTHCALE_PRIOR, MBO_NOISE_PRIOR, MBO_OUTPUTSCALE_PRIOR, ) from bofire.data_models.surrogates.api import ( MixedSingleTaskGPSurrogate, - MultiTaskGPSurrogate, ScalerEnum, SingleTaskGPHyperconfig, SingleTaskGPSurrogate, @@ -284,43 +281,6 @@ def test_SingleTaskGPHyperconfig(): ) -def test_MultiTaskGPHyperconfig(): - # we test here also the basic trainable - benchmark 
= MultiTaskHimmelblau() - surrogate_data_no_hy = MultiTaskGPSurrogate( - inputs=benchmark.domain.inputs, - outputs=benchmark.domain.outputs, - hyperconfig=None, - ) - - with pytest.raises(ValueError, match="No hyperconfig available."): - surrogate_data_no_hy.update_hyperparameters( - benchmark.domain.inputs.sample(1).loc[0] - ) - # test that correct stuff is written - surrogate_data = MultiTaskGPSurrogate( - inputs=benchmark.domain.inputs, outputs=benchmark.domain.outputs - ) - candidate = surrogate_data.hyperconfig.inputs.sample(1).loc[0] - surrogate_data.update_hyperparameters(candidate) - - assert surrogate_data.kernel.ard == (candidate["ard"] == "True") - if candidate.kernel == "matern_1.5": - assert isinstance(surrogate_data.kernel, MaternKernel) - assert surrogate_data.kernel.nu == 1.5 - elif candidate.kernel == "matern_2.5": - assert isinstance(surrogate_data.kernel, MaternKernel) - assert surrogate_data.kernel.nu == 2.5 - else: - assert isinstance(surrogate_data.kernel, RBFKernel) - if candidate.prior == "mbo": - assert surrogate_data.noise_prior == MBO_NOISE_PRIOR() - assert surrogate_data.kernel.lengthscale_prior == MBO_LENGTHCALE_PRIOR() - else: - assert surrogate_data.noise_prior == BOTORCH_NOISE_PRIOR() - assert surrogate_data.kernel.lengthscale_prior == BOTORCH_LENGTHCALE_PRIOR() - - def test_MixedSingleTaskGPHyperconfig(): inputs = Inputs( features=[ @@ -361,45 +321,6 @@ def test_MixedSingleTaskGPHyperconfig(): ) -def test_MultiTask_input_preprocessing(): - # test that if wrong encoding is used, there is an error - inputs = Inputs( - features=[ContinuousInput(key="x", bounds=(-1, 1))] - + [TaskInput(key="task_id", categories=["1", "2"])] - ) - outputs = Outputs(features=[ContinuousOutput(key="y")]) - with pytest.raises(ValueError): - data_model = MultiTaskGPSurrogate( - inputs=inputs, - outputs=outputs, - input_preprocessing_specs={"task_id": CategoricalEncodingEnum.ONE_HOT}, - ) - - # test that if there is no task input, there is an error - inputs = 
Inputs( - features=[ContinuousInput(key="x", bounds=(-1, 1))] - + [CategoricalInput(key="task_id", categories=["1", "2"])] - ) - outputs = Outputs(features=[ContinuousOutput(key="y")]) - with pytest.raises(ValueError): - data_model = MultiTaskGPSurrogate( - inputs=inputs, - outputs=outputs, - input_preprocessing_specs={"task_id": CategoricalEncodingEnum.ORDINAL}, - ) - - # test that if no input_preprocessing_specs are provided, the ordinal encoding is used - inputs = Inputs( - features=[ContinuousInput(key="x", bounds=(-1, 1))] - + [TaskInput(key="task_id", categories=["1", "2"])] - ) - outputs = Outputs(features=[ContinuousOutput(key="y")]) - data_model = MultiTaskGPSurrogate(inputs=inputs, outputs=outputs) - assert data_model.input_preprocessing_specs == { - "task_id": CategoricalEncodingEnum.ORDINAL - } - - def test_MixedSingleTaskGPModel_invalid_preprocessing(): inputs = Inputs( features=[ @@ -505,71 +426,6 @@ def test_MixedSingleTaskGPModel(kernel, scaler, output_scaler): assert_frame_equal(preds, preds2) -@pytest.mark.parametrize( - "kernel, scaler, output_scaler, task_prior", - [ - (RBFKernel(ard=True), ScalerEnum.NORMALIZE, ScalerEnum.STANDARDIZE, None), - (RBFKernel(ard=False), ScalerEnum.STANDARDIZE, ScalerEnum.STANDARDIZE, None), - (RBFKernel(ard=False), ScalerEnum.IDENTITY, ScalerEnum.IDENTITY, LKJ_PRIOR()), - ], -) -def test_MultiTaskGPModel(kernel, scaler, output_scaler, task_prior): - benchmark = MultiTaskHimmelblau() - inputs = benchmark.domain.inputs - outputs = benchmark.domain.outputs - experiments = benchmark.f(inputs.sample(10), return_complete=True) - - model = MultiTaskGPSurrogate( - inputs=inputs, - outputs=outputs, - scaler=scaler, - output_scaler=output_scaler, - kernel=kernel, - task_prior=task_prior, - ) - - model = surrogates.map(model) - with pytest.raises(ValueError): - model.dumps() - # if task_prior is not None, a warning should be raised - if task_prior is not None: - with pytest.warns(UserWarning): - model.fit(experiments) - else: 
- model.fit(experiments) - # dump the model - dump = model.dumps() - # make predictions - samples = inputs.sample(5) - preds = model.predict(samples) - assert preds.shape == (5, 2) - # check that model is composed correctly - assert isinstance(model.model, MultiTaskGP) - if output_scaler == ScalerEnum.STANDARDIZE: - assert isinstance(model.model.outcome_transform, Standardize) - elif output_scaler == ScalerEnum.IDENTITY: - assert not hasattr(model.model, "outcome_transform") - if scaler == ScalerEnum.NORMALIZE: - assert isinstance(model.model.input_transform, Normalize) - elif scaler == ScalerEnum.STANDARDIZE: - assert isinstance(model.model.input_transform, InputStandardize) - else: - assert not hasattr(model.model, "input_transform") - assert model.is_compatibilized is False - # reload the model from dump and check for equality in predictions - model2 = MultiTaskGPSurrogate( - inputs=inputs, - outputs=outputs, - kernel=kernel, - scaler=scaler, - output_scaler=output_scaler, - ) - model2 = surrogates.map(model2) - model2.loads(dump) - preds2 = model2.predict(samples) - assert_frame_equal(preds, preds2) - - @pytest.mark.parametrize( "kernel, scaler, output_scaler", [ diff --git a/tests/bofire/surrogates/test_multitask_gps.py b/tests/bofire/surrogates/test_multitask_gps.py new file mode 100644 index 000000000..60c6910ec --- /dev/null +++ b/tests/bofire/surrogates/test_multitask_gps.py @@ -0,0 +1,169 @@ +import importlib + +import pytest +from botorch.models import MultiTaskGP +from botorch.models.transforms.input import ( + InputStandardize, + Normalize, +) +from botorch.models.transforms.outcome import Standardize +from pandas.testing import assert_frame_equal + +import bofire.surrogates.api as surrogates +from bofire.benchmarks.api import MultiTaskHimmelblau +from bofire.data_models.domain.api import Inputs, Outputs +from bofire.data_models.enum import CategoricalEncodingEnum +from bofire.data_models.features.api import ( + CategoricalInput, + ContinuousInput, + 
ContinuousOutput, + TaskInput, +) +from bofire.data_models.kernels.api import ( + MaternKernel, + RBFKernel, +) +from bofire.data_models.priors.api import ( + BOTORCH_LENGTHCALE_PRIOR, + BOTORCH_NOISE_PRIOR, + LKJ_PRIOR, + MBO_LENGTHCALE_PRIOR, + MBO_NOISE_PRIOR, +) +from bofire.data_models.surrogates.api import ( + MultiTaskGPSurrogate, + ScalerEnum, +) + +RDKIT_AVAILABLE = importlib.util.find_spec("rdkit") is not None + + +def test_MultiTaskGPHyperconfig(): + # we test here also the basic trainable + benchmark = MultiTaskHimmelblau() + surrogate_data_no_hy = MultiTaskGPSurrogate( + inputs=benchmark.domain.inputs, + outputs=benchmark.domain.outputs, + hyperconfig=None, + ) + + with pytest.raises(ValueError, match="No hyperconfig available."): + surrogate_data_no_hy.update_hyperparameters( + benchmark.domain.inputs.sample(1).loc[0] + ) + # test that correct stuff is written + surrogate_data = MultiTaskGPSurrogate( + inputs=benchmark.domain.inputs, outputs=benchmark.domain.outputs + ) + candidate = surrogate_data.hyperconfig.inputs.sample(1).loc[0] + surrogate_data.update_hyperparameters(candidate) + + assert surrogate_data.kernel.ard == (candidate["ard"] == "True") + if candidate.kernel == "matern_1.5": + assert isinstance(surrogate_data.kernel, MaternKernel) + assert surrogate_data.kernel.nu == 1.5 + elif candidate.kernel == "matern_2.5": + assert isinstance(surrogate_data.kernel, MaternKernel) + assert surrogate_data.kernel.nu == 2.5 + else: + assert isinstance(surrogate_data.kernel, RBFKernel) + if candidate.prior == "mbo": + assert surrogate_data.noise_prior == MBO_NOISE_PRIOR() + assert surrogate_data.kernel.lengthscale_prior == MBO_LENGTHCALE_PRIOR() + else: + assert surrogate_data.noise_prior == BOTORCH_NOISE_PRIOR() + assert surrogate_data.kernel.lengthscale_prior == BOTORCH_LENGTHCALE_PRIOR() + + +def test_MultiTask_input_preprocessing(): + # test that if no input_preprocessing_specs are provided, the ordinal encoding is used + inputs = Inputs( + 
features=[ContinuousInput(key="x", bounds=(-1, 1))] + + [TaskInput(key="task_id", categories=["1", "2"])] + ) + outputs = Outputs(features=[ContinuousOutput(key="y")]) + data_model = MultiTaskGPSurrogate(inputs=inputs, outputs=outputs) + assert data_model.input_preprocessing_specs == { + "task_id": CategoricalEncodingEnum.ORDINAL + } + + # test that if we have a categorical input, one-hot encoding is correctly applied + inputs = Inputs( + features=[ContinuousInput(key="x", bounds=(-1, 1))] + + [CategoricalInput(key="categories", categories=["1", "2"])] + + [TaskInput(key="task_id", categories=["1", "2"])] + ) + outputs = Outputs(features=[ContinuousOutput(key="y")]) + data_model = MultiTaskGPSurrogate( + inputs=inputs, + outputs=outputs, + ) + assert data_model.input_preprocessing_specs == { + "categories": CategoricalEncodingEnum.ONE_HOT, + "task_id": CategoricalEncodingEnum.ORDINAL, + } + + +@pytest.mark.parametrize( + "kernel, scaler, output_scaler, task_prior", + [ + (RBFKernel(ard=True), ScalerEnum.NORMALIZE, ScalerEnum.STANDARDIZE, None), + (RBFKernel(ard=False), ScalerEnum.STANDARDIZE, ScalerEnum.STANDARDIZE, None), + (RBFKernel(ard=False), ScalerEnum.IDENTITY, ScalerEnum.IDENTITY, LKJ_PRIOR()), + ], +) +def test_MultiTaskGPModel(kernel, scaler, output_scaler, task_prior): + benchmark = MultiTaskHimmelblau() + inputs = benchmark.domain.inputs + outputs = benchmark.domain.outputs + experiments = benchmark.f(inputs.sample(10), return_complete=True) + + model = MultiTaskGPSurrogate( + inputs=inputs, + outputs=outputs, + scaler=scaler, + output_scaler=output_scaler, + kernel=kernel, + task_prior=task_prior, + ) + + model = surrogates.map(model) + with pytest.raises(ValueError): + model.dumps() + # if task_prior is not None, a warning should be raised + if task_prior is not None: + with pytest.warns(UserWarning): + model.fit(experiments) + else: + model.fit(experiments) + # dump the model + dump = model.dumps() + # make predictions + samples = inputs.sample(5) + 
preds = model.predict(samples) + assert preds.shape == (5, 2) + # check that model is composed correctly + assert isinstance(model.model, MultiTaskGP) + if output_scaler == ScalerEnum.STANDARDIZE: + assert isinstance(model.model.outcome_transform, Standardize) + elif output_scaler == ScalerEnum.IDENTITY: + assert not hasattr(model.model, "outcome_transform") + if scaler == ScalerEnum.NORMALIZE: + assert isinstance(model.model.input_transform, Normalize) + elif scaler == ScalerEnum.STANDARDIZE: + assert isinstance(model.model.input_transform, InputStandardize) + else: + assert not hasattr(model.model, "input_transform") + assert model.is_compatibilized is False + # reload the model from dump and check for equality in predictions + model2 = MultiTaskGPSurrogate( + inputs=inputs, + outputs=outputs, + kernel=kernel, + scaler=scaler, + output_scaler=output_scaler, + ) + model2 = surrogates.map(model2) + model2.loads(dump) + preds2 = model2.predict(samples) + assert_frame_equal(preds, preds2) From 85bf6fd4d219345c7b38923137f64b86c39bcdfb Mon Sep 17 00:00:00 2001 From: Jose Pablo Folch Date: Tue, 2 Apr 2024 21:56:47 -0600 Subject: [PATCH 11/14] fixed kernel and prior serialization errors --- bofire/data_models/priors/api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bofire/data_models/priors/api.py b/bofire/data_models/priors/api.py index 468f7c68a..b7bea6466 100644 --- a/bofire/data_models/priors/api.py +++ b/bofire/data_models/priors/api.py @@ -11,6 +11,7 @@ AnyPrior = Union[ GammaPrior, NormalPrior, + LKJPrior, ] # default priors of interest From 6b3d613e23347141760bc207dfc30f45b061d5aa Mon Sep 17 00:00:00 2001 From: Jose Pablo Folch Date: Tue, 2 Apr 2024 23:10:51 -0600 Subject: [PATCH 12/14] fixed serialization test for surrogates --- bofire/data_models/priors/lkj.py | 2 +- bofire/data_models/surrogates/api.py | 6 ++++-- bofire/data_models/surrogates/botorch_surrogates.py | 2 ++ bofire/surrogates/api.py | 1 + tests/bofire/data_models/specs/priors.py | 2 +- 5 
files changed, 9 insertions(+), 4 deletions(-) diff --git a/bofire/data_models/priors/lkj.py b/bofire/data_models/priors/lkj.py index f5ba206e5..f1fb55ca1 100644 --- a/bofire/data_models/priors/lkj.py +++ b/bofire/data_models/priors/lkj.py @@ -18,4 +18,4 @@ class LKJPrior(Prior): type: Literal["LKJPrior"] = "LKJPrior" shape: PositiveFloat sd_prior: GammaPrior - n_tasks: int = 1 + n_tasks: int = 2 diff --git a/bofire/data_models/surrogates/api.py b/bofire/data_models/surrogates/api.py index fb5bff2bb..c462f7960 100644 --- a/bofire/data_models/surrogates/api.py +++ b/bofire/data_models/surrogates/api.py @@ -43,7 +43,8 @@ SingleTaskGPSurrogate, MixedSingleTaskGPSurrogate, MixedTanimotoGPSurrogate, - MLPEnsemble, + ClassificationMLPEnsemble, + RegressionMLPEnsemble, SaasSingleTaskGPSurrogate, XGBoostSurrogate, LinearSurrogate, @@ -57,7 +58,8 @@ SingleTaskGPSurrogate, MixedSingleTaskGPSurrogate, MixedTanimotoGPSurrogate, - MLPEnsemble, + ClassificationMLPEnsemble, + RegressionMLPEnsemble, SaasSingleTaskGPSurrogate, XGBoostSurrogate, LinearSurrogate, diff --git a/bofire/data_models/surrogates/botorch_surrogates.py b/bofire/data_models/surrogates/botorch_surrogates.py index c77b0d9d7..39f2e38e2 100644 --- a/bofire/data_models/surrogates/botorch_surrogates.py +++ b/bofire/data_models/surrogates/botorch_surrogates.py @@ -18,6 +18,7 @@ ClassificationMLPEnsemble, RegressionMLPEnsemble, ) +from bofire.data_models.surrogates.multi_task_gp import MultiTaskGPSurrogate from bofire.data_models.surrogates.polynomial import PolynomialSurrogate from bofire.data_models.surrogates.random_forest import RandomForestSurrogate from bofire.data_models.surrogates.single_task_gp import SingleTaskGPSurrogate @@ -36,6 +37,7 @@ TanimotoGPSurrogate, LinearSurrogate, PolynomialSurrogate, + MultiTaskGPSurrogate, ] diff --git a/bofire/surrogates/api.py b/bofire/surrogates/api.py index a2a935380..4afaca2e0 100644 --- a/bofire/surrogates/api.py +++ b/bofire/surrogates/api.py @@ -8,6 +8,7 @@ 
MLPEnsemble, RegressionMLPEnsemble, ) +from bofire.surrogates.multi_task_gp import MultiTaskGPSurrogate from bofire.surrogates.random_forest import RandomForestSurrogate from bofire.surrogates.single_task_gp import SingleTaskGPSurrogate from bofire.surrogates.surrogate import Surrogate diff --git a/tests/bofire/data_models/specs/priors.py b/tests/bofire/data_models/specs/priors.py index 577208012..eb2d56dc1 100644 --- a/tests/bofire/data_models/specs/priors.py +++ b/tests/bofire/data_models/specs/priors.py @@ -47,7 +47,7 @@ specs.add_valid( priors.LKJPrior, lambda: { - "n_tasks": random.randint(1, 10), + "n_tasks": random.randint(2, 10), "shape": random.random(), "sd_prior": { "type": "GammaPrior", From bb4f36ae40079676013d192d065d09f2302533bd Mon Sep 17 00:00:00 2001 From: Jose Pablo Folch Date: Tue, 9 Apr 2024 13:57:03 -0600 Subject: [PATCH 13/14] black reformatted linting errors --- bofire/strategies/predictives/botorch.py | 12 ++++++++---- bofire/strategies/random.py | 6 +++--- bofire/surrogates/multi_task_gp.py | 8 +++++--- tests/bofire/data_models/specs/surrogates.py | 1 - 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/bofire/strategies/predictives/botorch.py b/bofire/strategies/predictives/botorch.py index 87b347347..e413bbb10 100644 --- a/bofire/strategies/predictives/botorch.py +++ b/bofire/strategies/predictives/botorch.py @@ -110,10 +110,14 @@ def _get_optimizer_options(self) -> Dict[str, int]: Dict[str, int]: The dictionary with the settings. 
""" return { - "batch_limit": self.batch_limit - if len(self.domain.constraints.get([NChooseKConstraint, ProductConstraint])) - == 0 - else 1, # type: ignore + "batch_limit": ( + self.batch_limit + if len( + self.domain.constraints.get([NChooseKConstraint, ProductConstraint]) + ) + == 0 + else 1 + ), # type: ignore "maxiter": self.maxiter, } diff --git a/bofire/strategies/random.py b/bofire/strategies/random.py index 98e806a37..4dca36ab4 100644 --- a/bofire/strategies/random.py +++ b/bofire/strategies/random.py @@ -290,9 +290,9 @@ def _sample_from_polytope( n=1, q=n, bounds=bounds.to(**tkwargs), - inequality_constraints=unfixed_ineqs - if len(unfixed_ineqs) > 0 # type: ignore - else None, + inequality_constraints=( + unfixed_ineqs if len(unfixed_ineqs) > 0 else None # type: ignore + ), equality_constraints=combined_eqs if len(combined_eqs) > 0 else None, n_burnin=n_burnin, thinning=n_thinning, diff --git a/bofire/surrogates/multi_task_gp.py b/bofire/surrogates/multi_task_gp.py index 8cd73a751..9a6faf7fd 100644 --- a/bofire/surrogates/multi_task_gp.py +++ b/bofire/surrogates/multi_task_gp.py @@ -70,9 +70,11 @@ def _fit(self, X: pd.DataFrame, Y: pd.DataFrame): ), # kernel is for input space so we subtract one for the fidelity index ard_num_dims=1, # this keyword is ingored ), - outcome_transform=Standardize(m=tY.shape[-1]) - if self.output_scaler == ScalerEnum.STANDARDIZE - else None, + outcome_transform=( + Standardize(m=tY.shape[-1]) + if self.output_scaler == ScalerEnum.STANDARDIZE + else None + ), input_transform=scaler, ) diff --git a/tests/bofire/data_models/specs/surrogates.py b/tests/bofire/data_models/specs/surrogates.py index c88000dfd..7027a3fbb 100644 --- a/tests/bofire/data_models/specs/surrogates.py +++ b/tests/bofire/data_models/specs/surrogates.py @@ -565,4 +565,3 @@ }, error=ValueError, ) - \ No newline at end of file From ce340dc9ea2a7001cf6bddd85ef8a87740711718 Mon Sep 17 00:00:00 2001 From: Jose Pablo Folch Date: Tue, 9 Apr 2024 21:02:59 -0600 
Subject: [PATCH 14/14] fixed linting issues --- bofire/surrogates/multi_task_gp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bofire/surrogates/multi_task_gp.py b/bofire/surrogates/multi_task_gp.py index 9a6faf7fd..f04f3433e 100644 --- a/bofire/surrogates/multi_task_gp.py +++ b/bofire/surrogates/multi_task_gp.py @@ -36,7 +36,7 @@ def __init__( self.output_scaler = data_model.output_scaler self.noise_prior = data_model.noise_prior self.task_prior = data_model.task_prior - if isinstance(data_model.task_prior, LKJPrior): + if isinstance(self.task_prior, LKJPrior): # set the number of tasks in the prior self.task_prior.n_tasks = self.n_tasks # obtain the name of the task feature @@ -99,7 +99,7 @@ def _predict(self, transformed_X: pd.DataFrame): preds = self.model.posterior(X=X, observation_noise=False).mean.cpu().detach().numpy() # type: ignore vars = self.model.posterior(X=X, observation_noise=False).variance.cpu().detach().numpy() # type: ignore # add the observation noise to the stds - stds = np.sqrt(vars + self.model.likelihood.noise.cpu().detach().numpy()) + stds = np.sqrt(vars + self.model.likelihood.noise.cpu().detach().numpy()) # type: ignore return preds, stds