diff --git a/docs/en_US/NAS/retiarii/ApiReference.rst b/docs/en_US/NAS/retiarii/ApiReference.rst index 67315e195a..1e49b8fcef 100644 --- a/docs/en_US/NAS/retiarii/ApiReference.rst +++ b/docs/en_US/NAS/retiarii/ApiReference.rst @@ -72,10 +72,16 @@ Oneshot Trainers Strategies ---------- -.. autoclass:: nni.retiarii.strategies.RandomStrategy +.. autoclass:: nni.retiarii.strategy.Random :members: -.. autoclass:: nni.retiarii.strategies.TPEStrategy +.. autoclass:: nni.retiarii.strategy.GridSearch + :members: + +.. autoclass:: nni.retiarii.strategy.RegularizedEvolution + :members: + +.. autoclass:: nni.retiarii.strategy.TPEStrategy :members: Retiarii Experiments diff --git a/docs/en_US/NAS/retiarii/Tutorial.rst b/docs/en_US/NAS/retiarii/Tutorial.rst index 925827adeb..9a9305cf96 100644 --- a/docs/en_US/NAS/retiarii/Tutorial.rst +++ b/docs/en_US/NAS/retiarii/Tutorial.rst @@ -167,13 +167,13 @@ In the following table, we listed the available trainers and strategies. - TPEStrategy - DartsTrainer * - Regression - - RandomStrategy + - Random - EnasTrainer * - - - + - GridSearch - ProxylessTrainer * - - - + - RegularizedEvolution - SinglePathTrainer (RandomTrainer) There usage and API document can be found `here <./ApiReference>`__\. @@ -204,7 +204,7 @@ After all the above are prepared, it is time to start an experiment to do the mo .. 
code-block:: python - exp = RetiariiExperiment(base_model, trainer, applied_mutators, simple_startegy) + exp = RetiariiExperiment(base_model, trainer, applied_mutators, simple_strategy) exp_config = RetiariiExeConfig('local') exp_config.experiment_name = 'mnasnet_search' exp_config.trial_concurrency = 2 diff --git a/docs/en_US/NAS/retiarii/WriteStrategy.rst b/docs/en_US/NAS/retiarii/WriteStrategy.rst index d8f10546f8..6f354b3b85 100644 --- a/docs/en_US/NAS/retiarii/WriteStrategy.rst +++ b/docs/en_US/NAS/retiarii/WriteStrategy.rst @@ -3,10 +3,12 @@ Customize A New Strategy To write a new strategy, you should inherit the base strategy class ``BaseStrategy``, then implement the member function ``run``. This member function takes ``base_model`` and ``applied_mutators`` as its input arguments. It can simply apply the user specified mutators in ``applied_mutators`` onto ``base_model`` to generate a new model. When a mutator is applied, it should be bound with a sampler (e.g., ``RandomSampler``). Every sampler implements the ``choice`` function which chooses value(s) from candidate values. The ``choice`` functions invoked in mutators are executed with the sampler. -Below is a very simple random strategy, the complete code can be found :githublink:`here `. +Below is a very simple random strategy, which makes the choices completely random. .. code-block:: python + from nni.retiarii import Sampler + class RandomSampler(Sampler): def choice(self, candidates, mutator, model, index): return random.choice(candidates) @@ -31,6 +33,6 @@ Below is a very simple random strategy, the complete code can be found :githubli else: time.sleep(2) -You can find that this strategy does not know the search space beforehand, it passively makes decisions every time ``choice`` is invoked from mutators. If a strategy wants to know the whole search space before making any decision (e.g., TPE, SMAC), it can use ``dry_run`` function provided by ``Mutator`` to obtain the space. 
An example strategy can be found :githublink:`here `. +You can find that this strategy does not know the search space beforehand, it passively makes decisions every time ``choice`` is invoked from mutators. If a strategy wants to know the whole search space before making any decision (e.g., TPE, SMAC), it can use the ``dry_run`` function provided by ``Mutator`` to obtain the space. An example strategy can be found :githublink:`here <nni/retiarii/strategy/tpe_strategy.py>`. After generating a new model, the strategy can use our provided APIs (e.g., ``submit_models``, ``is_stopped_exec``) to submit the model and get its reported results. More APIs can be found in `API References <./ApiReference.rst>`__. \ No newline at end of file diff --git a/nni/retiarii/execution/base.py b/nni/retiarii/execution/base.py index 1ce1d00da3..52baf3075d 100644 --- a/nni/retiarii/execution/base.py +++ b/nni/retiarii/execution/base.py @@ -65,11 +65,11 @@ def _send_trial_callback(self, paramater: dict) -> None: if self.resources <= 0: _logger.warning('There is no available resource, but trial is submitted.') self.resources -= 1 - _logger.info('on_resource_used: %d', self.resources) + _logger.info('Resource used. Remaining: %d', self.resources) def _request_trial_jobs_callback(self, num_trials: int) -> None: self.resources += num_trials - _logger.info('on_resource_available: %d', self.resources) + _logger.info('New resource available. 
Remaining: %d', self.resources) def _trial_end_callback(self, trial_id: int, success: bool) -> None: model = self._running_models[trial_id] diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index ee08ea994b..1b8c598a91 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -17,7 +17,7 @@ from ..integration import RetiariiAdvisor from ..mutator import Mutator from ..nn.pytorch.mutator import process_inline_mutation -from ..strategies.strategy import BaseStrategy +from ..strategy import BaseStrategy from ..trainer.interface import BaseOneShotTrainer, BaseTrainer from ..utils import get_records diff --git a/nni/retiarii/graph.py b/nni/retiarii/graph.py index f8a99b7eb9..fa1b136ff4 100644 --- a/nni/retiarii/graph.py +++ b/nni/retiarii/graph.py @@ -131,7 +131,7 @@ def fork(self) -> 'Model': new_model = Model(_internal=True) new_model._root_graph_name = self._root_graph_name new_model.graphs = {name: graph._fork_to(new_model) for name, graph in self.graphs.items()} - new_model.training_config = copy.deepcopy(self.training_config) + new_model.training_config = copy.deepcopy(self.training_config) # TODO this may be a problem when training config is large new_model.history = self.history + [self] return new_model diff --git a/nni/retiarii/strategies/__init__.py b/nni/retiarii/strategies/__init__.py deleted file mode 100644 index 62ef50bf4f..0000000000 --- a/nni/retiarii/strategies/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .tpe_strategy import TPEStrategy -from .random_strategy import RandomStrategy diff --git a/nni/retiarii/strategies/random_strategy.py b/nni/retiarii/strategies/random_strategy.py deleted file mode 100644 index 78c1ac13da..0000000000 --- a/nni/retiarii/strategies/random_strategy.py +++ /dev/null @@ -1,32 +0,0 @@ -import logging -import random -import time - -from .. 
import Sampler, submit_models, query_available_resources -from .strategy import BaseStrategy - -_logger = logging.getLogger(__name__) - -class RandomSampler(Sampler): - def choice(self, candidates, mutator, model, index): - return random.choice(candidates) - -class RandomStrategy(BaseStrategy): - def __init__(self): - self.random_sampler = RandomSampler() - - def run(self, base_model, applied_mutators): - _logger.info('stargety start...') - while True: - avail_resource = query_available_resources() - if avail_resource > 0: - model = base_model - _logger.info('apply mutators...') - _logger.info('mutators: %s', str(applied_mutators)) - for mutator in applied_mutators: - mutator.bind_sampler(self.random_sampler) - model = mutator.apply(model) - # run models - submit_models(model) - else: - time.sleep(2) diff --git a/nni/retiarii/strategy/__init__.py b/nni/retiarii/strategy/__init__.py new file mode 100644 index 0000000000..af8810873b --- /dev/null +++ b/nni/retiarii/strategy/__init__.py @@ -0,0 +1,4 @@ +from .base import BaseStrategy +from .bruteforce import Random, GridSearch +from .evolution import RegularizedEvolution +from .tpe_strategy import TPEStrategy diff --git a/nni/retiarii/strategies/strategy.py b/nni/retiarii/strategy/base.py similarity index 100% rename from nni/retiarii/strategies/strategy.py rename to nni/retiarii/strategy/base.py diff --git a/nni/retiarii/strategy/bruteforce.py b/nni/retiarii/strategy/bruteforce.py new file mode 100644 index 0000000000..a7d965fe91 --- /dev/null +++ b/nni/retiarii/strategy/bruteforce.py @@ -0,0 +1,115 @@ +import copy +import itertools +import logging +import random +import time +from typing import Any, Dict, List + +from .. 
import Sampler, submit_models, query_available_resources +from .base import BaseStrategy +from .utils import dry_run_for_search_space, get_targeted_model + +_logger = logging.getLogger(__name__) + + +def grid_generator(search_space: Dict[Any, List[Any]], shuffle=True): + keys = list(search_space.keys()) + search_space_values = copy.deepcopy(list(search_space.values())) + if shuffle: + for values in search_space_values: + random.shuffle(values) + for values in itertools.product(*search_space_values): + yield {key: value for key, value in zip(keys, values)} + + +def random_generator(search_space: Dict[Any, List[Any]], dedup=True, retries=500): + keys = list(search_space.keys()) + history = set() + search_space_values = copy.deepcopy(list(search_space.values())) + while True: + for retry_count in range(retries): + selected = [random.choice(v) for v in search_space_values] + if not dedup: + break + selected = tuple(selected) + if selected not in history: + history.add(selected) + break + if retry_count + 1 == retries: + _logger.info('Random generation has run out of patience. There is nothing to search. Exiting.') + return + yield {key: value for key, value in zip(keys, selected)} + + +class GridSearch(BaseStrategy): + """ + Traverse the search space and try all the possible combinations one by one. + + Parameters + ---------- + shuffle : bool + Shuffle the order in a candidate list, so that they are tried in a random order. Default: true. + """ + + def __init__(self, shuffle=True): + self._polling_interval = 2. + self.shuffle = shuffle + + def run(self, base_model, applied_mutators): + search_space = dry_run_for_search_space(base_model, applied_mutators) + for sample in grid_generator(search_space, shuffle=self.shuffle): + _logger.info('New model created. Waiting for resource. 
%s', str(sample)) + while query_available_resources() <= 0:  # keep polling; a single sleep does not guarantee a free worker + time.sleep(self._polling_interval) + submit_models(get_targeted_model(base_model, applied_mutators, sample)) + + +class _RandomSampler(Sampler): + def choice(self, candidates, mutator, model, index): + return random.choice(candidates) + + +class Random(BaseStrategy): + """ + Random search on the search space. + + Parameters + ---------- + variational : bool + Do not dry run to get the full search space. Used when the search space has variational size or candidates. Default: false. + dedup : bool + Do not try the same configuration twice. When variational is true, deduplication is not supported. Default: true. + """ + + def __init__(self, variational=False, dedup=True): + self.variational = variational + self.dedup = dedup + if variational and dedup: + raise ValueError('Dedup is not supported in variational mode.') + self.random_sampler = _RandomSampler() + self._polling_interval = 2. + + def run(self, base_model, applied_mutators): + if self.variational: + _logger.info('Random search running in variational mode.') + sampler = _RandomSampler() + for mutator in applied_mutators: + mutator.bind_sampler(sampler) + while True: + avail_resource = query_available_resources() + if avail_resource > 0: + model = base_model + for mutator in applied_mutators: + model = mutator.apply(model) + _logger.info('New model created. Applied mutators are: %s', str(applied_mutators)) + submit_models(model) + else: + time.sleep(self._polling_interval) + else: + _logger.info('Random search running in fixed size mode. Dedup: %s.', 'on' if self.dedup else 'off') + search_space = dry_run_for_search_space(base_model, applied_mutators) + for sample in random_generator(search_space, dedup=self.dedup): + _logger.info('New model created. Waiting for resource. 
%s', str(sample)) + while query_available_resources() <= 0:  # keep polling; a single sleep does not guarantee a free worker + time.sleep(self._polling_interval) + submit_models(get_targeted_model(base_model, applied_mutators, sample)) diff --git a/nni/retiarii/strategy/evolution.py b/nni/retiarii/strategy/evolution.py new file mode 100644 index 0000000000..fa365a8382 --- /dev/null +++ b/nni/retiarii/strategy/evolution.py @@ -0,0 +1,158 @@ +import collections +import dataclasses +import logging +import random +import time + +from ..execution import query_available_resources, submit_models +from ..graph import ModelStatus +from .base import BaseStrategy +from .utils import dry_run_for_search_space, get_targeted_model + + +_logger = logging.getLogger(__name__) + + +@dataclasses.dataclass +class Individual: + """ + A class that represents an individual. + Holds two attributes, where ``x`` is the model and ``y`` is the metric (e.g., accuracy). + """ + x: dict + y: float + + +class RegularizedEvolution(BaseStrategy): + """ + Algorithm for regularized evolution (i.e. aging evolution). + Follows "Algorithm 1" in Real et al. "Regularized Evolution for Image Classifier Architecture Search". + + Parameters + ---------- + optimize_mode : str + Can be one of "maximize" and "minimize". Default: maximize. + population_size : int + The number of individuals to keep in the population. Default: 100. + cycles : int + The number of cycles (trials) the algorithm should run for. Default: 20000. + sample_size : int + The number of individuals that should participate in each tournament. Default: 25. + mutation_prob : float + Probability that mutation happens in each dim. Default: 0.05 + on_failure : str + Can be one of "ignore" and "worst". If "ignore", simply give up the model and find a new one. + If "worst", mark the model as -inf (if maximize, inf if minimize), so that the algorithm "learns" to avoid such model. + Default: ignore. 
+ """ + + def __init__(self, optimize_mode='maximize', population_size=100, sample_size=25, cycles=20000, + mutation_prob=0.05, on_failure='ignore'): + assert optimize_mode in ['maximize', 'minimize'] + assert on_failure in ['ignore', 'worst'] + assert sample_size < population_size + self.optimize_mode = optimize_mode + self.population_size = population_size + self.sample_size = sample_size + self.cycles = cycles + self.mutation_prob = mutation_prob + self.on_failure = on_failure + + self._worst = float('-inf') if self.optimize_mode == 'maximize' else float('inf') + + self._success_count = 0 + self._population = collections.deque() + self._running_models = [] + self._polling_interval = 2. + + def random(self, search_space): + return {k: random.choice(v) for k, v in search_space.items()} + + def mutate(self, parent, search_space): + child = {} + for k, v in parent.items(): + if random.uniform(0, 1) < self.mutation_prob: + # NOTE: we do not exclude the original choice here for simplicity, + # which is slightly different from the original paper. 
+ child[k] = random.choice(search_space[k]) + else: + child[k] = v + return child + + def best_parent(self): + samples = [p for p in self._population] # copy population + random.shuffle(samples) + samples = list(samples)[:self.sample_size] + if self.optimize_mode == 'maximize': + parent = max(samples, key=lambda sample: sample.y) + else: + parent = min(samples, key=lambda sample: sample.y) + return parent.x + + def run(self, base_model, applied_mutators): + search_space = dry_run_for_search_space(base_model, applied_mutators) + # Run the first population regardless concurrency + _logger.info('Initializing the first population.') + while len(self._population) + len(self._running_models) <= self.population_size: + # try to submit new models + while len(self._population) + len(self._running_models) < self.population_size: + config = self.random(search_space) + self._submit_config(config, base_model, applied_mutators) + # collect results + self._move_succeeded_models_to_population() + self._remove_failed_models_from_running_list() + time.sleep(self._polling_interval) + + if len(self._population) >= self.population_size: + break + + # Resource-aware mutation of models + _logger.info('Running mutations.') + while self._success_count + len(self._running_models) <= self.cycles: + # try to submit new models + while query_available_resources() > 0 and self._success_count + len(self._running_models) < self.cycles: + config = self.mutate(self.best_parent(), search_space) + self._submit_config(config, base_model, applied_mutators) + # collect results + self._move_succeeded_models_to_population() + self._remove_failed_models_from_running_list() + time.sleep(self._polling_interval) + + if self._success_count >= self.cycles: + break + + def _submit_config(self, config, base_model, mutators): + _logger.info('Model submitted to running queue: %s', config) + model = get_targeted_model(base_model, mutators, config) + submit_models(model) + self._running_models.append((config, model)) 
+ return model + + def _move_succeeded_models_to_population(self): + completed_indices = [] + for i, (config, model) in enumerate(self._running_models): + metric = None + if self.on_failure == 'worst' and model.status == ModelStatus.Failed: + metric = self._worst + elif model.status == ModelStatus.Trained: + metric = model.metric + if metric is not None: + individual = Individual(config, metric) + _logger.info('Individual created: %s', str(individual)) + self._population.append(individual) + if len(self._population) > self.population_size: + self._population.popleft() + completed_indices.append(i) + for i in completed_indices[::-1]: + # delete from end to start so that the index number will not be affected. + self._success_count += 1 + self._running_models.pop(i) + + def _remove_failed_models_from_running_list(self): + # This is only done when on_failure policy is set to "ignore". + # Otherwise, failed models will be treated as inf when processed. + if self.on_failure == 'ignore': + number_of_failed_models = len([g for g in self._running_models if g[1].status == ModelStatus.Failed]) + self._running_models = [g for g in self._running_models if g[1].status != ModelStatus.Failed] + if number_of_failed_models > 0: + _logger.info('%d failed models are ignored. Will retry.', number_of_failed_models) diff --git a/nni/retiarii/strategies/tpe_strategy.py b/nni/retiarii/strategy/tpe_strategy.py similarity index 91% rename from nni/retiarii/strategies/tpe_strategy.py rename to nni/retiarii/strategy/tpe_strategy.py index 9f0fcd2455..8d823bae11 100644 --- a/nni/retiarii/strategies/tpe_strategy.py +++ b/nni/retiarii/strategy/tpe_strategy.py @@ -4,7 +4,7 @@ from nni.algorithms.hpo.hyperopt_tuner import HyperoptTuner from .. 
import Sampler, submit_models, query_available_resources, is_stopped_exec -from .strategy import BaseStrategy +from .base import BaseStrategy _logger = logging.getLogger(__name__) @@ -50,16 +50,14 @@ def run(self, base_model, applied_mutators): sample_space.extend(recorded_candidates) self.tpe_sampler.update_sample_space(sample_space) - _logger.info('stargety start...') + _logger.info('TPE strategy has been started.') while True: avail_resource = query_available_resources() if avail_resource > 0: model = base_model - _logger.info('apply mutators...') - _logger.info('mutators: %s', str(applied_mutators)) + _logger.info('New model created. Applied mutators: %s', str(applied_mutators)) self.tpe_sampler.generate_samples(self.model_id) for mutator in applied_mutators: - _logger.info('mutate model...') mutator.bind_sampler(self.tpe_sampler) model = mutator.apply(model) # run models diff --git a/nni/retiarii/strategy/utils.py b/nni/retiarii/strategy/utils.py new file mode 100644 index 0000000000..c1055d1707 --- /dev/null +++ b/nni/retiarii/strategy/utils.py @@ -0,0 +1,29 @@ +import collections +from typing import Dict, Any, List +from ..graph import Model +from ..mutator import Mutator, Sampler + + +class _FixedSampler(Sampler): + def __init__(self, sample): + self.sample = sample + + def choice(self, candidates, mutator, model, index): + return self.sample[(mutator, index)] + + +def dry_run_for_search_space(model: Model, mutators: List[Mutator]) -> Dict[Any, List[Any]]: + search_space = collections.OrderedDict() + for mutator in mutators: + recorded_candidates, model = mutator.dry_run(model) + for i, candidates in enumerate(recorded_candidates): + search_space[(mutator, i)] = candidates + return search_space + + +def get_targeted_model(base_model: Model, mutators: List[Mutator], sample: dict) -> Model: + sampler = _FixedSampler(sample) + model = base_model + for mutator in mutators: + model = mutator.bind_sampler(sampler).apply(model) + return model diff --git 
a/test/retiarii_test/darts/test.py b/test/retiarii_test/darts/test.py index bf0fabdada..3c3d6fa37c 100644 --- a/test/retiarii_test/darts/test.py +++ b/test/retiarii_test/darts/test.py @@ -5,9 +5,9 @@ from pathlib import Path import nni.retiarii.trainer.pytorch.lightning as pl +import nni.retiarii.strategy as strategy from nni.retiarii import blackbox_module as bm from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig -from nni.retiarii.strategies import TPEStrategy, RandomStrategy from torchvision import transforms from torchvision.datasets import CIFAR10 @@ -33,9 +33,9 @@ val_dataloaders=pl.DataLoader(test_dataset, batch_size=100), max_epochs=1, limit_train_batches=0.2) - simple_startegy = RandomStrategy() + simple_strategy = strategy.Random() - exp = RetiariiExperiment(base_model, trainer, [], simple_startegy) + exp = RetiariiExperiment(base_model, trainer, [], simple_strategy) exp_config = RetiariiExeConfig('local') exp_config.experiment_name = 'darts_search' diff --git a/test/retiarii_test/darts/test_oneshot.py b/test/retiarii_test/darts/test_oneshot.py index 755f33600c..731d44742c 100644 --- a/test/retiarii_test/darts/test_oneshot.py +++ b/test/retiarii_test/darts/test_oneshot.py @@ -8,8 +8,7 @@ from torchvision import transforms from torchvision.datasets import CIFAR10 -from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig -from nni.retiarii.strategies import TPEStrategy +from nni.retiarii.experiment.pytorch import RetiariiExperiment from nni.retiarii.trainer.pytorch import DartsTrainer from darts_model import CNN diff --git a/test/retiarii_test/mnasnet/test.py b/test/retiarii_test/mnasnet/test.py index 1158d0ce08..8d07a8afb0 100644 --- a/test/retiarii_test/mnasnet/test.py +++ b/test/retiarii_test/mnasnet/test.py @@ -9,7 +9,7 @@ from nni.retiarii import blackbox_module as bm from base_mnasnet import MNASNet from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig -from 
nni.retiarii.strategies import TPEStrategy +from nni.retiarii.strategy import TPEStrategy from torchvision import transforms from torchvision.datasets import CIFAR10 @@ -46,9 +46,9 @@ BlockMutator('mutable_1') ] - simple_startegy = TPEStrategy() + simple_strategy = TPEStrategy() - exp = RetiariiExperiment(base_model, trainer, applied_mutators, simple_startegy) + exp = RetiariiExperiment(base_model, trainer, applied_mutators, simple_strategy) exp_config = RetiariiExeConfig('local') exp_config.experiment_name = 'mnasnet_search' diff --git a/test/retiarii_test/mnist/test.py b/test/retiarii_test/mnist/test.py index 42ca877db1..f215d84d8f 100644 --- a/test/retiarii_test/mnist/test.py +++ b/test/retiarii_test/mnist/test.py @@ -1,11 +1,11 @@ import random import nni.retiarii.nn.pytorch as nn +import nni.retiarii.strategy as strategy import nni.retiarii.trainer.pytorch.lightning as pl import torch.nn.functional as F from nni.retiarii import blackbox_module as bm from nni.retiarii.experiment.pytorch import RetiariiExeConfig, RetiariiExperiment -from nni.retiarii.strategies import RandomStrategy from torch.utils.data import DataLoader from torchvision import transforms from torchvision.datasets import MNIST @@ -42,9 +42,9 @@ def forward(self, x): val_dataloaders=pl.DataLoader(test_dataset, batch_size=100), max_epochs=2) - simple_startegy = RandomStrategy() + simple_strategy = strategy.Random() - exp = RetiariiExperiment(base_model, trainer, [], simple_startegy) + exp = RetiariiExperiment(base_model, trainer, [], simple_strategy) exp_config = RetiariiExeConfig('local') exp_config.experiment_name = 'mnist_search' diff --git a/test/ut/retiarii/test_strategy.py b/test/ut/retiarii/test_strategy.py new file mode 100644 index 0000000000..5f5fe42208 --- /dev/null +++ b/test/ut/retiarii/test_strategy.py @@ -0,0 +1,139 @@ +import random +import time +import threading +from typing import * + +import nni.retiarii.execution.api +import nni.retiarii.nn.pytorch as nn +import 
nni.retiarii.strategy as strategy +import torch +import torch.nn.functional as F +from nni.retiarii import Model +from nni.retiarii.converter import convert_to_graph +from nni.retiarii.execution import wait_models +from nni.retiarii.execution.interface import AbstractExecutionEngine, WorkerInfo, MetricData, AbstractGraphListener +from nni.retiarii.graph import DebugTraining, ModelStatus +from nni.retiarii.nn.pytorch.mutator import process_inline_mutation + + +class MockExecutionEngine(AbstractExecutionEngine): + def __init__(self, failure_prob=0.): + self.models = [] + self.failure_prob = failure_prob + self._resource_left = 4 + + def _model_complete(self, model: Model): + time.sleep(random.uniform(0, 1)) + if random.uniform(0, 1) < self.failure_prob: + model.status = ModelStatus.Failed + else: + model.metric = random.uniform(0, 1) + model.status = ModelStatus.Trained + self._resource_left += 1 + + def submit_models(self, *models: Model) -> None: + for model in models: + self.models.append(model) + self._resource_left -= 1 + threading.Thread(target=self._model_complete, args=(model, )).start() + + def query_available_resource(self) -> Union[List[WorkerInfo], int]: + return self._resource_left + + def register_graph_listener(self, listener: AbstractGraphListener) -> None: + pass + + def trial_execute_graph(cls) -> MetricData: + pass + + +def _reset_execution_engine(engine=None): + nni.retiarii.execution.api._execution_engine = engine + + +class Net(nn.Module): + def __init__(self, hidden_size=32): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 20, 5, 1) + self.conv2 = nn.Conv2d(20, 50, 5, 1) + self.fc1 = nn.LayerChoice([ + nn.Linear(4*4*50, hidden_size, bias=True), + nn.Linear(4*4*50, hidden_size, bias=False) + ]) + self.fc2 = nn.LayerChoice([ + nn.Linear(hidden_size, 10, bias=False), + nn.Linear(hidden_size, 10, bias=True) + ]) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = F.max_pool2d(x, 2, 2) + x = F.relu(self.conv2(x)) + x = 
F.max_pool2d(x, 2, 2) + x = x.view(-1, 4*4*50) + x = F.relu(self.fc1(x)) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + + +def _get_model_and_mutators(): + base_model = Net() + script_module = torch.jit.script(base_model) + base_model_ir = convert_to_graph(script_module, base_model) + base_model_ir.training_config = DebugTraining() + mutators = process_inline_mutation(base_model_ir) + return base_model_ir, mutators + + +def test_grid_search(): + gridsearch = strategy.GridSearch() + engine = MockExecutionEngine() + _reset_execution_engine(engine) + gridsearch.run(*_get_model_and_mutators()) + wait_models(*engine.models) + selection = set() + for model in engine.models: + selection.add(( + model.get_node_by_name('_model__fc1').operation.parameters['bias'], + model.get_node_by_name('_model__fc2').operation.parameters['bias'] + )) + assert len(selection) == 4 + _reset_execution_engine() + + +def test_random_search(): + random = strategy.Random() + engine = MockExecutionEngine() + _reset_execution_engine(engine) + random.run(*_get_model_and_mutators()) + wait_models(*engine.models) + selection = set() + for model in engine.models: + selection.add(( + model.get_node_by_name('_model__fc1').operation.parameters['bias'], + model.get_node_by_name('_model__fc2').operation.parameters['bias'] + )) + assert len(selection) == 4 + _reset_execution_engine() + + +def test_evolution(): + evolution = strategy.RegularizedEvolution(population_size=5, sample_size=3, cycles=10, mutation_prob=0.5, on_failure='ignore') + engine = MockExecutionEngine(failure_prob=0.2) + _reset_execution_engine(engine) + evolution.run(*_get_model_and_mutators()) + wait_models(*engine.models) + _reset_execution_engine() + + evolution = strategy.RegularizedEvolution(population_size=5, sample_size=3, cycles=10, mutation_prob=0.5, on_failure='worst') + engine = MockExecutionEngine(failure_prob=0.4) + _reset_execution_engine(engine) + evolution.run(*_get_model_and_mutators()) + wait_models(*engine.models) + 
_reset_execution_engine() + + +if __name__ == '__main__': + test_grid_search() + test_random_search() + test_evolution() diff --git a/ts/nni_manager/common/datastore.ts b/ts/nni_manager/common/datastore.ts index 41324d12e5..0c52512c63 100644 --- a/ts/nni_manager/common/datastore.ts +++ b/ts/nni_manager/common/datastore.ts @@ -23,6 +23,7 @@ interface TrialJobEventRecord { readonly data?: string; readonly logPath?: string; readonly sequenceId?: number; + readonly message?: string; } interface MetricData { diff --git a/ts/nni_manager/common/manager.ts b/ts/nni_manager/common/manager.ts index ea36d02943..1f3972ae43 100644 --- a/ts/nni_manager/common/manager.ts +++ b/ts/nni_manager/common/manager.ts @@ -105,7 +105,6 @@ abstract class Manager { public abstract getTrialLog(trialJobId: string, logType: LogType): Promise; public abstract getTrialJobStatistics(): Promise; - public abstract getTrialJobMessage(trialJobId: string): string | undefined; public abstract getStatus(): NNIManagerStatus; } diff --git a/ts/nni_manager/core/nniDataStore.ts b/ts/nni_manager/core/nniDataStore.ts index c3d49adcaa..c0fa8c54ef 100644 --- a/ts/nni_manager/core/nniDataStore.ts +++ b/ts/nni_manager/core/nniDataStore.ts @@ -87,7 +87,6 @@ class NNIDataStore implements DataStore { if (timestamp === undefined) { timestamp = Date.now(); } - return this.db.storeTrialJobEvent(event, trialJobId, timestamp, hyperParameter, jobDetail).catch( (err: Error) => { throw NNIError.FromError(err, 'Datastore error: '); @@ -381,6 +380,7 @@ class NNIDataStore implements DataStore { if (record.sequenceId !== undefined && jobInfo.sequenceId === undefined) { jobInfo.sequenceId = record.sequenceId; } + jobInfo.message = record.message; map.set(record.trialJobId, jobInfo); } diff --git a/ts/nni_manager/core/nnimanager.ts b/ts/nni_manager/core/nnimanager.ts index 9d703a2d5a..c7ea015611 100644 --- a/ts/nni_manager/core/nnimanager.ts +++ b/ts/nni_manager/core/nnimanager.ts @@ -353,14 +353,6 @@ class NNIManager implements 
Manager { return this.status; } - public getTrialJobMessage(trialJobId: string): string | undefined { - const trialJob = this.trialJobs.get(trialJobId); - if (trialJob !== undefined){ - return trialJob.message - } - return undefined - } - public async listTrialJobs(status?: TrialJobStatus): Promise { return this.dataStore.listTrialJobs(status); } diff --git a/ts/nni_manager/core/sqlDatabase.ts b/ts/nni_manager/core/sqlDatabase.ts index 485ba75a33..5ccb29c06d 100644 --- a/ts/nni_manager/core/sqlDatabase.ts +++ b/ts/nni_manager/core/sqlDatabase.ts @@ -22,7 +22,7 @@ import { TrialJobDetail } from '../common/trainingService'; const createTables: string = ` -create table TrialJobEvent (timestamp integer, trialJobId text, event text, data text, logPath text, sequenceId integer); +create table TrialJobEvent (timestamp integer, trialJobId text, event text, data text, logPath text, sequenceId integer, message text); create index TrialJobEvent_trialJobId on TrialJobEvent(trialJobId); create index TrialJobEvent_event on TrialJobEvent(event); @@ -62,7 +62,8 @@ function loadTrialJobEvent(row: any): TrialJobEventRecord { event: row.event, data: row.data === null ? undefined : row.data, logPath: row.logPath === null ? undefined : row.logPath, - sequenceId: row.sequenceId === null ? undefined : row.sequenceId + sequenceId: row.sequenceId === null ? undefined : row.sequenceId, + message: row.message === null ? undefined: row.message }; } @@ -163,10 +164,11 @@ class SqlDB implements Database { public storeTrialJobEvent( event: TrialJobEvent, trialJobId: string, timestamp: number, hyperParameter?: string, jobDetail?: TrialJobDetail): Promise { - const sql: string = 'insert into TrialJobEvent values (?,?,?,?,?,?)'; + const sql: string = 'insert into TrialJobEvent values (?,?,?,?,?,?,?)'; const logPath: string | undefined = jobDetail === undefined ? undefined : jobDetail.url; const sequenceId: number | undefined = jobDetail === undefined ? 
undefined : jobDetail.form.sequenceId; - const args: any[] = [timestamp, trialJobId, event, hyperParameter, logPath, sequenceId]; + const message: string | undefined = jobDetail === undefined ? undefined : jobDetail.message; + const args: any[] = [timestamp, trialJobId, event, hyperParameter, logPath, sequenceId, message]; this.log.trace(`storeTrialJobEvent: SQL: ${sql}, args: ${JSON.stringify(args)}`); const deferred: Deferred = new Deferred(); diff --git a/ts/nni_manager/rest_server/restHandler.ts b/ts/nni_manager/rest_server/restHandler.ts index d619e73d2c..b0b0bab53e 100644 --- a/ts/nni_manager/rest_server/restHandler.ts +++ b/ts/nni_manager/rest_server/restHandler.ts @@ -213,7 +213,6 @@ class NNIRestHandler { this.nniManager.listTrialJobs(req.query.status).then((jobInfos: TrialJobInfo[]) => { jobInfos.forEach((trialJob: TrialJobInfo) => { this.setErrorPathForFailedJob(trialJob); - this.setMessageforJob(trialJob); }); res.send(jobInfos); }).catch((err: Error) => { @@ -226,7 +225,6 @@ class NNIRestHandler { router.get('/trial-jobs/:id', (req: Request, res: Response) => { this.nniManager.getTrialJob(req.params.id).then((jobDetail: TrialJobInfo) => { const jobInfo: TrialJobInfo = this.setErrorPathForFailedJob(jobDetail); - this.setMessageforJob(jobInfo); res.send(jobInfo); }).catch((err: Error) => { this.handleError(err, res); @@ -327,14 +325,6 @@ class NNIRestHandler { return jobInfo; } - - private setMessageforJob(jobInfo: TrialJobInfo): TrialJobInfo { - if (jobInfo === undefined){ - return jobInfo - } - jobInfo.message = this.nniManager.getTrialJobMessage(jobInfo.trialJobId); - return jobInfo - } } export function createRestHandler(rs: NNIRestServer): Router { diff --git a/ts/nni_manager/rest_server/test/mockedNNIManager.ts b/ts/nni_manager/rest_server/test/mockedNNIManager.ts index d9e3e8b9b6..eb78a3615e 100644 --- a/ts/nni_manager/rest_server/test/mockedNNIManager.ts +++ b/ts/nni_manager/rest_server/test/mockedNNIManager.ts @@ -111,10 +111,6 @@ export class 
MockedNNIManager extends Manager { return deferred.promise; } - public getTrialJobMessage(trialJobId: string): string | undefined { - return "TEST-MESSAGE" - } - public stopExperiment(): Promise { throw new MethodNotImplementedError(); } diff --git a/ts/nni_manager/training_service/reusable/trial.ts b/ts/nni_manager/training_service/reusable/trial.ts index be1005c80f..8018139681 100644 --- a/ts/nni_manager/training_service/reusable/trial.ts +++ b/ts/nni_manager/training_service/reusable/trial.ts @@ -19,6 +19,7 @@ export class TrialDetail implements TrialJobDetail { public form: TrialJobApplicationForm; public isEarlyStopped?: boolean; public environment?: EnvironmentInformation; + public message?: string; // init settings of trial public settings = {}; diff --git a/ts/nni_manager/training_service/reusable/trialDispatcher.ts b/ts/nni_manager/training_service/reusable/trialDispatcher.ts index 00934200b9..1da8dc33bc 100644 --- a/ts/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/ts/nni_manager/training_service/reusable/trialDispatcher.ts @@ -435,7 +435,6 @@ class TrialDispatcher implements TrainingService { if (environment.environmentService === undefined) { throw new Error(`${environment.id} does not has environment service!`); } - trial.url = environment.trackingUrl; const environmentStatus = environment.status; @@ -704,6 +703,7 @@ class TrialDispatcher implements TrainingService { if (environment.environmentService === undefined) { throw new Error(`${environment.id} environmentService not initialized!`); } + trial.message = `Platform: ${environment.environmentService.getName}, environment: ${environment.id}`; if (environment.environmentService.hasStorageService) { const storageService = component.get(StorageService); trial.workingDirectory = storageService.joinPath('trials', trial.id);