Implement possibility to run hyperparameter opts in the strategy (#287)
jduerholt authored Sep 21, 2023
1 parent da20f2d commit b320d85
Showing 33 changed files with 849 additions and 741 deletions.
4 changes: 2 additions & 2 deletions bofire/benchmarks/api.py
@@ -1,8 +1,8 @@
from typing import Union

from bofire.benchmarks.aspen_benchmark import Aspen_benchmark
from bofire.benchmarks.benchmark import Benchmark, GenericBenchmark, run
from bofire.benchmarks.hyperopt import Hyperopt, hyperoptimize
from bofire.benchmarks.benchmark import Benchmark, GenericBenchmark
from bofire.benchmarks.hyperopt import Hyperopt
from bofire.benchmarks.multi import C2DTLZ2, DTLZ2, ZDT1, CrossCoupling, SnarBenchmark
from bofire.benchmarks.single import Ackley, Branin, Branin30, Hartmann, Himmelblau

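For downstream code, the visible effect of this reshuffle is an import-path change; a brief hedged sketch, based on the removed imports above and the new bofire/runners/api.py shown further down:

# Previously, the helpers were pulled in via the benchmarks package:
# from bofire.benchmarks.benchmark import run
# from bofire.benchmarks.hyperopt import hyperoptimize

# With this commit, they live in the new runners package:
from bofire.runners.api import hyperoptimize, run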
127 changes: 1 addition & 126 deletions bofire/benchmarks/benchmark.py
@@ -1,21 +1,14 @@
import json
import os
from abc import abstractmethod
from copy import deepcopy
from typing import Callable, List, Literal, Optional, Protocol, Tuple, Union
from typing import Callable, Literal, Optional, Tuple, Union

import numpy as np
import pandas as pd
from multiprocess.pool import Pool
from pydantic import Field, PositiveFloat
from scipy.stats import norm, uniform
from tqdm import tqdm
from typing_extensions import Annotated

import bofire.strategies.api as strategies
from bofire.data_models.base import BaseModel
from bofire.data_models.domain.api import Domain
from bofire.data_models.strategies.api import AnyStrategy


class OutlierPrior(BaseModel):
@@ -100,121 +93,3 @@ def __init__(

def _f(self, candidates: pd.DataFrame) -> pd.DataFrame:
return self.func(candidates)


class StrategyFactory(Protocol):
def __call__(self, domain: Domain) -> AnyStrategy:
...


def _single_run(
run_idx: int,
benchmark: Benchmark,
strategy_factory: StrategyFactory,
n_iterations: int,
metric: Callable[[Domain, pd.DataFrame], float],
n_candidates_per_proposals: int,
safe_intervall: int,
initial_sampler: Optional[
Union[Callable[[Domain], pd.DataFrame], pd.DataFrame]
] = None,
) -> Tuple[pd.DataFrame, pd.Series]:
def autosafe_results(benchmark):
"""Safes results into a .json file to prevent data loss during time-expensive optimization runs.
Autosave should operate every 10 iterations.
Args:
benchmark: Benchmark function that is supposed to be evaluated.
"""

benchmark_name = benchmark.__class__.__name__
# Create a folder for autosaves if it does not already exist.
if not os.path.exists("bofire_autosaves/" + benchmark_name):
os.makedirs("bofire_autosaves/" + benchmark_name)

filename = (
"bofire_autosaves/" + benchmark_name + "/run" + str(run_idx) + ".json"
)
parsed_domain = benchmark.domain.json()
with open(filename, "w") as file:
json.dump(parsed_domain, file)

# sample initial values
if initial_sampler is not None:
if isinstance(initial_sampler, Callable):
X = initial_sampler(benchmark.domain)
XY = benchmark.f(X, return_complete=True)
else:
XY = initial_sampler
strategy_data = strategy_factory(domain=benchmark.domain)
# map it
strategy = strategies.map(strategy_data) # type: ignore
# tell it
if initial_sampler is not None:
strategy.tell(XY) # type: ignore
metric_values = np.zeros(n_iterations)
pbar = tqdm(range(n_iterations), position=run_idx)
for i in pbar:
X = strategy.ask(candidate_count=n_candidates_per_proposals)
X = X[benchmark.domain.inputs.get_keys()]
Y = benchmark.f(X)
XY = pd.concat([X, Y], axis=1)
# pd.concat() changes the dtype from str to np.int32 if a column contains only whole numbers.
# The column needs to be converted back to str before being added to the benchmark domain.
strategy.tell(XY)
metric_values[i] = metric(strategy.domain, strategy.experiments) # type: ignore
pbar.set_description(
f"run {run_idx:02d} with current best {metric_values[i]:0.3f}"
)
if (i + 1) % safe_intervall == 0:
autosafe_results(benchmark=benchmark)
return strategy.experiments, pd.Series(metric_values) # type: ignore


def run(
benchmark: Benchmark,
strategy_factory: StrategyFactory,
n_iterations: int,
metric: Callable[[Domain, pd.DataFrame], float],
initial_sampler: Optional[Callable[[Domain], pd.DataFrame]] = None,
n_candidates_per_proposal: int = 1,
n_runs: int = 5,
n_procs: int = 5,
safe_intervall: int = 1000,
) -> List[Tuple[pd.DataFrame, pd.Series]]:
"""Run a benchmark problem several times in parallel
Args:
benchmark: problem to be benchmarked
strategy_factory: creates the strategy to be benchmarked on the benchmark problem
n_iterations: number of times the strategy is asked
metric: measure of success, e.g, best value found so far for single objective or
hypervolume for multi-objective
initial_sampler: Creates initial data
n_candidates: also known as batch size, number of proposals made at once by the strategy
n_runs: number of runs
n_procs: number of parallel processes to execute the runs
Returns:
per run, a tuple with the benchmark object containing the proposed data and metric values
"""

def make_args(run_idx: int):
return (
run_idx,
deepcopy(benchmark),
strategy_factory,
n_iterations,
metric,
n_candidates_per_proposal,
safe_intervall,
initial_sampler,
)

if n_procs == 1:
results = [_single_run(*make_args(i)) for i in range(n_runs)]
else:
p = Pool(min(n_procs, n_runs))
results = [p.apply_async(_single_run, make_args(i)) for i in range(n_runs)]
results = [r.get() for r in results]
return results
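For orientation, a minimal usage sketch of this run helper at its new home in the runners package (re-exported from bofire/runners/api.py below). This is a hedged illustration, not part of the commit; it reuses the Himmelblau benchmark and the RandomStrategy/SoboStrategy data models that appear elsewhere in this diff, and the metric and sampler functions are illustrative choices.

import pandas as pd

import bofire.strategies.api as strategies
from bofire.benchmarks.api import Himmelblau
from bofire.data_models.domain.api import Domain
from bofire.data_models.strategies.api import RandomStrategy, SoboStrategy
from bofire.runners.api import run


def best_value(domain: Domain, experiments: pd.DataFrame) -> float:
    # Himmelblau is a minimization problem, so track the smallest observed output.
    return experiments[domain.outputs[0].key].min()


def initial_sample(domain: Domain) -> pd.DataFrame:
    # Seed each run with random points so the model-based strategy can be fitted.
    sampler = strategies.map(data_model=RandomStrategy(domain=domain))
    return sampler.ask(len(domain.inputs) + 1)


results = run(
    benchmark=Himmelblau(),
    strategy_factory=SoboStrategy,  # called as SoboStrategy(domain=...), see _single_run above
    n_iterations=20,
    metric=best_value,
    initial_sampler=initial_sample,
    n_runs=2,
    n_procs=1,  # single process; otherwise a multiprocess pool is used
)
# Each element of results is a tuple of (experiments DataFrame, per-iteration metric Series).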
84 changes: 2 additions & 82 deletions bofire/benchmarks/hyperopt.py
@@ -1,19 +1,10 @@
import warnings
from typing import Optional, Tuple
from typing import Optional

import pandas as pd

import bofire.strategies.api as strategies
import bofire.surrogates.api as surrogates
from bofire.benchmarks.benchmark import Benchmark, run
from bofire.benchmarks.benchmark import Benchmark
from bofire.data_models.domain.api import Domain
from bofire.data_models.enum import RegressionMetricsEnum
from bofire.data_models.objectives.api import MinimizeObjective
from bofire.data_models.strategies.api import (
FactorialStrategy,
RandomStrategy,
SoboStrategy,
)
from bofire.data_models.surrogates.api import AnyTrainableSurrogate


@@ -59,74 +50,3 @@ def _f(self, candidates: pd.DataFrame) -> pd.DataFrame:
)
results[f"valid_{self.target_metric.value}"] = 1 # type: ignore
return results # type: ignore


def hyperoptimize(
surrogate_data: AnyTrainableSurrogate,
training_data: pd.DataFrame,
folds: int,
random_state: Optional[int] = None,
) -> Tuple[AnyTrainableSurrogate, pd.DataFrame]:
if surrogate_data.hyperconfig is None:
warnings.warn(
"No hyperopt is possible as no hyperopt config is available. Returning initial config."
)
return surrogate_data, pd.DataFrame({e.name: [] for e in RegressionMetricsEnum})

def best(domain: Domain, experiments: pd.DataFrame) -> float:
return (
experiments[domain.outputs[0].key].min()
if isinstance(domain.outputs[0].objective, MinimizeObjective)
else experiments[domain.outputs[0].key].max()
)

def sample(domain):
datamodel = RandomStrategy(domain=domain)
sampler = strategies.map(data_model=datamodel)
sampled = sampler.ask(len(domain.inputs) + 1)
return sampled

benchmark = Hyperopt(
surrogate_data=surrogate_data,
training_data=training_data,
folds=folds,
random_state=random_state,
)

if surrogate_data.hyperconfig.hyperstrategy == "FactorialStrategy": # type: ignore
strategy = strategies.map(FactorialStrategy(domain=benchmark.domain))
experiments = benchmark.f(
strategy.ask(candidate_count=None), return_complete=True
)
else:
experiments = run(
benchmark=benchmark,
strategy_factory=RandomStrategy
if surrogate_data.hyperconfig.hyperstrategy == "RandomStrategy" # type: ignore
else SoboStrategy, # type: ignore
metric=best,
n_runs=1,
n_iterations=surrogate_data.hyperconfig.n_iterations # type: ignore
- len(benchmark.domain.inputs)
- 1,
initial_sampler=sample,
n_procs=1,
)[0][0]

# analyze the results and get the best
experiments = experiments.sort_values(
by=benchmark.target_metric.name,
ascending=True
if isinstance(benchmark.domain.outputs[0].objective, MinimizeObjective)
else False,
)

surrogate_data.update_hyperparameters(experiments.iloc[0])

return (
surrogate_data,
experiments[
surrogate_data.hyperconfig.domain.inputs.get_keys()
+ [e.name for e in RegressionMetricsEnum]
],
)
12 changes: 7 additions & 5 deletions bofire/data_models/strategies/predictives/botorch.py
@@ -1,6 +1,6 @@
from typing import Optional, Type
from typing import Annotated, Optional, Type

from pydantic import PositiveInt, root_validator, validator
from pydantic import Field, PositiveInt, root_validator, validator

from bofire.data_models.constraints.api import (
Constraint,
@@ -10,9 +10,7 @@
from bofire.data_models.domain.api import Domain, Outputs
from bofire.data_models.enum import CategoricalEncodingEnum, CategoricalMethodEnum
from bofire.data_models.features.api import CategoricalDescriptorInput, CategoricalInput
from bofire.data_models.outlier_detection.api import (
OutlierDetections,
)
from bofire.data_models.outlier_detection.api import OutlierDetections
from bofire.data_models.strategies.predictives.predictive import PredictiveStrategy
from bofire.data_models.surrogates.api import (
BotorchSurrogates,
@@ -33,9 +31,13 @@ class BotorchStrategy(PredictiveStrategy):
categorical_method: CategoricalMethodEnum = CategoricalMethodEnum.EXHAUSTIVE
discrete_method: CategoricalMethodEnum = CategoricalMethodEnum.EXHAUSTIVE
surrogate_specs: Optional[BotorchSurrogates] = None
# outlier detection params
outlier_detection_specs: Optional[OutlierDetections] = None
min_experiments_before_outlier_check: PositiveInt = 1
frequency_check: PositiveInt = 1
# hyperopt params
frequency_hyperopt: Annotated[int, Field(ge=0)] = 0 # 0 indicates no hyperopt
folds: int = 5

@classmethod
def is_constraint_implemented(cls, my_type: Type[Constraint]) -> bool:
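The two new fields are all that is needed to switch on in-strategy hyperparameter optimization. A hedged configuration sketch follows; it assumes SoboStrategy inherits these BotorchStrategy fields and reuses a benchmark domain purely to stay self-contained.

from bofire.benchmarks.api import Himmelblau
from bofire.data_models.strategies.api import SoboStrategy

# Reuse a benchmark domain so the sketch stays self-contained.
domain = Himmelblau().domain

strategy_data = SoboStrategy(
    domain=domain,
    frequency_hyperopt=5,  # assumed semantics: re-tune hyperparameters every 5th refit; the diff only documents that 0 disables hyperopt
    folds=3,               # number of cross-validation folds used during the hyperopt
)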
Empty file added bofire/runners/__init__.py
Empty file.
2 changes: 2 additions & 0 deletions bofire/runners/api.py
@@ -0,0 +1,2 @@
from bofire.runners.hyperoptimize import hyperoptimize
from bofire.runners.run import run
88 changes: 88 additions & 0 deletions bofire/runners/hyperoptimize.py
@@ -0,0 +1,88 @@
import warnings
from typing import Optional, Tuple

import pandas as pd

import bofire.strategies.api as strategies
from bofire.benchmarks.api import Hyperopt
from bofire.data_models.domain.api import Domain
from bofire.data_models.enum import RegressionMetricsEnum
from bofire.data_models.objectives.api import MinimizeObjective
from bofire.data_models.strategies.api import (
FactorialStrategy,
RandomStrategy,
SoboStrategy,
)
from bofire.data_models.surrogates.api import AnyTrainableSurrogate
from bofire.runners.run import run


def hyperoptimize(
surrogate_data: AnyTrainableSurrogate,
training_data: pd.DataFrame,
folds: int,
random_state: Optional[int] = None,
) -> Tuple[AnyTrainableSurrogate, pd.DataFrame]:
if surrogate_data.hyperconfig is None:
warnings.warn(
"No hyperopt is possible as no hyperopt config is available. Returning initial config."
)
return surrogate_data, pd.DataFrame({e.name: [] for e in RegressionMetricsEnum})

def best(domain: Domain, experiments: pd.DataFrame) -> float:
return (
experiments[domain.outputs[0].key].min()
if isinstance(domain.outputs[0].objective, MinimizeObjective)
else experiments[domain.outputs[0].key].max()
)

def sample(domain):
datamodel = RandomStrategy(domain=domain)
sampler = strategies.map(data_model=datamodel)
sampled = sampler.ask(len(domain.inputs) + 1)
return sampled

benchmark = Hyperopt(
surrogate_data=surrogate_data,
training_data=training_data,
folds=folds,
random_state=random_state,
)

if surrogate_data.hyperconfig.hyperstrategy == "FactorialStrategy": # type: ignore
strategy = strategies.map(FactorialStrategy(domain=benchmark.domain))
experiments = benchmark.f(
strategy.ask(candidate_count=None), return_complete=True
)
else:
experiments = run(
benchmark=benchmark,
strategy_factory=RandomStrategy
if surrogate_data.hyperconfig.hyperstrategy == "RandomStrategy" # type: ignore
else SoboStrategy, # type: ignore
metric=best,
n_runs=1,
n_iterations=surrogate_data.hyperconfig.n_iterations # type: ignore
- len(benchmark.domain.inputs)
- 1,
initial_sampler=sample,
n_procs=1,
)[0][0]

# analyze the results and get the best
experiments = experiments.sort_values(
by=benchmark.target_metric.name,
ascending=True
if isinstance(benchmark.domain.outputs[0].objective, MinimizeObjective)
else False,
)

surrogate_data.update_hyperparameters(experiments.iloc[0])

return (
surrogate_data,
experiments[
surrogate_data.hyperconfig.domain.inputs.get_keys()
+ [e.name for e in RegressionMetricsEnum]
],
)
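A hedged sketch of calling this helper directly on a trainable surrogate. SingleTaskGPSurrogate and its default hyperconfig are assumptions beyond this diff (without a hyperconfig the helper warns and returns the surrogate unchanged, as shown above), and the training data is generated from a benchmark purely for illustration.

import bofire.strategies.api as strategies
from bofire.benchmarks.api import Himmelblau
from bofire.data_models.strategies.api import RandomStrategy
from bofire.data_models.surrogates.api import SingleTaskGPSurrogate
from bofire.runners.api import hyperoptimize

benchmark = Himmelblau()

# Sample a small training set from the benchmark (illustrative only).
sampler = strategies.map(data_model=RandomStrategy(domain=benchmark.domain))
training_data = benchmark.f(sampler.ask(20), return_complete=True)

surrogate_data = SingleTaskGPSurrogate(
    inputs=benchmark.domain.inputs,
    outputs=benchmark.domain.outputs,
)

tuned_surrogate_data, metrics = hyperoptimize(
    surrogate_data=surrogate_data,
    training_data=training_data,
    folds=3,
)
# metrics holds one row per evaluated hyperparameter setting (hyperconfig inputs
# plus regression metrics), sorted so the best setting comes first;
# tuned_surrogate_data already carries that best setting.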
