Skip to content

Commit

Permalink
Add Optuna based suggestion service (#1613)
Browse files Browse the repository at this point in the history
* Implement Optuna service and cmd

* Update pkg/suggestion/v1beta1/optuna/service.py

Co-authored-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Update pkg/suggestion/v1beta1/optuna/service.py

Co-authored-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Update pkg/suggestion/v1beta1/optuna/service.py

Co-authored-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Update pkg/suggestion/v1beta1/optuna/service.py

Co-authored-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Merge the blocks of self.lock in OptunaService

* Remove Cython installation

* Update Python version for the Optuna suggestion service

* Add the example yaml of multivarite-tpe

* Fix the logic of handling unknown trials

* Use name and value instead of the string representation of assignment

* Turn on constant liar by default

Co-authored-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
  • Loading branch information
g-votte and andreyvelich authored Aug 16, 2021
1 parent ecb4686 commit 7439a37
Show file tree
Hide file tree
Showing 8 changed files with 505 additions and 0 deletions.
31 changes: 31 additions & 0 deletions cmd/suggestion/optuna/v1beta1/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
FROM python:3.9

ENV TARGET_DIR /opt/katib
ENV SUGGESTION_DIR cmd/suggestion/optuna/v1beta1

RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \
apt-get -y update && \
apt-get -y install gfortran libopenblas-dev liblapack-dev; \
fi

RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \
if [ "$(uname -m)" = "ppc64le" ]; then \
wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \
elif [ "$(uname -m)" = "aarch64" ]; then \
wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-arm64; \
else \
wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-amd64; \
fi && \
chmod +x /bin/grpc_health_probe

ADD ./pkg/ ${TARGET_DIR}/pkg/
ADD ./${SUGGESTION_DIR}/ ${TARGET_DIR}/${SUGGESTION_DIR}/
WORKDIR ${TARGET_DIR}/${SUGGESTION_DIR}
RUN pip install --no-cache-dir -r requirements.txt

RUN chgrp -R 0 ${TARGET_DIR} \
&& chmod -R g+rwX ${TARGET_DIR}

ENV PYTHONPATH ${TARGET_DIR}:${TARGET_DIR}/pkg/apis/manager/v1beta1/python:${TARGET_DIR}/pkg/apis/manager/health/python

ENTRYPOINT ["python", "main.py"]
42 changes: 42 additions & 0 deletions cmd/suggestion/optuna/v1beta1/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright 2021 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import grpc
import time
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
from pkg.apis.manager.health.python import health_pb2_grpc
from pkg.suggestion.v1beta1.optuna.service import OptunaService
from concurrent import futures

_ONE_DAY_IN_SECONDS = 60 * 60 * 24
DEFAULT_PORT = "0.0.0.0:6789"


def serve():
server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
service = OptunaService()
api_pb2_grpc.add_SuggestionServicer_to_server(service, server)
health_pb2_grpc.add_HealthServicer_to_server(service, server)
server.add_insecure_port(DEFAULT_PORT)
print("Listening...")
server.start()
try:
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
server.stop(0)


if __name__ == "__main__":
serve()
4 changes: 4 additions & 0 deletions cmd/suggestion/optuna/v1beta1/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
grpcio==1.39.0
protobuf==3.17.3
googleapis-common-protos==1.53.0
optuna>=2.8.0
65 changes: 65 additions & 0 deletions examples/v1beta1/multivariate-tpe-example.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
apiVersion: "kubeflow.org/v1beta1"
kind: Experiment
metadata:
namespace: kubeflow
name: multivariate-tpe-example
spec:
objective:
type: maximize
goal: 0.99
objectiveMetricName: Validation-accuracy
additionalMetricNames:
- Train-accuracy
algorithm:
algorithmName: multivariate-tpe
parallelTrialCount: 3
maxTrialCount: 12
maxFailedTrialCount: 3
parameters:
- name: lr
parameterType: double
feasibleSpace:
min: "0.01"
max: "0.03"
- name: num-layers
parameterType: int
feasibleSpace:
min: "2"
max: "5"
- name: optimizer
parameterType: categorical
feasibleSpace:
list:
- sgd
- adam
- ftrl
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
reference: lr
- name: numberLayers
description: Number of training model layers
reference: num-layers
- name: optimizer
description: Training model optimizer (sdg, adam or ftrl)
reference: optimizer
trialSpec:
apiVersion: batch/v1
kind: Job
spec:
template:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist:v1beta1-45c5727
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"
- "--batch-size=64"
- "--lr=${trialParameters.learningRate}"
- "--num-layers=${trialParameters.numberLayers}"
- "--optimizer=${trialParameters.optimizer}"
restartPolicy: Never

170 changes: 170 additions & 0 deletions pkg/suggestion/v1beta1/optuna/service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
# Copyright 2021 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import defaultdict
import threading

import optuna

from pkg.apis.manager.v1beta1.python import api_pb2
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
from pkg.suggestion.v1beta1.internal.constant import INTEGER, DOUBLE, CATEGORICAL, DISCRETE, MAX_GOAL
from pkg.suggestion.v1beta1.internal.search_space import HyperParameterSearchSpace
from pkg.suggestion.v1beta1.internal.trial import Trial, Assignment
from pkg.suggestion.v1beta1.internal.base_health_service import HealthServicer


class OptunaService(api_pb2_grpc.SuggestionServicer, HealthServicer):

def __init__(self):
super(OptunaService, self).__init__()
self.study = None
self.search_space = None
self.recorded_trial_names = set()
self.assignments_to_optuna_number = defaultdict(list)
self.lock = threading.Lock()

def GetSuggestions(self, request, context):
"""
Main function to provide suggestion.
"""
with self.lock:
if self.study is None:
self.search_space = HyperParameterSearchSpace.convert(request.experiment)
self.study = self._create_study(request.experiment.spec.algorithm, self.search_space)

trials = Trial.convert(request.trials)

if len(trials) != 0:
self._tell(trials)
list_of_assignments = self._ask(request.request_number)

return api_pb2.GetSuggestionsReply(
parameter_assignments=Assignment.generate(list_of_assignments)
)

def _create_study(self, algorithm_spec, search_space):
sampler = self._create_sampler(algorithm_spec)
direction = "maximize" if search_space.goal == MAX_GOAL else "minimize"

study = optuna.create_study(sampler=sampler, direction=direction)

return study

def _create_sampler(self, algorithm_spec):
name = algorithm_spec.algorithm_name
settings = {s.name:s.value for s in algorithm_spec.algorithm_settings}

if name == "tpe" or name == "multivariate-tpe":
kwargs = {}
for k, v in settings.items():
if k == "startup_trials":
kwargs["n_startup_trials"] = int(v)
elif k == "ei_candidates":
kwargs["n_ei_candidates"] = int(v)
elif k == "random_state":
kwargs["seed"] = int(v)
else:
raise ValueError("Unknown name for {}: {}".format(name, k))

kwargs["multivariate"] = name == "multivariate-tpe"
kwargs["constant_liar"] = True

sampler = optuna.samplers.TPESampler(**kwargs)

elif name == "cmaes":
kwargs = {}
for k, v in settings.items():
if k == "restart_strategy":
kwargs["restart_strategy"] = v
elif k == "sigma":
kwargs["sigma0"] = float(v)
elif k == "random_state":
kwargs["seed"] = int(v)
else:
raise ValueError("Unknown name for {}: {}".format(name, k))

sampler = optuna.samplers.CmaEsSampler(**kwargs)

elif name == "random":
kwargs = {}
for k, v in settings.items():
if k == "random_state":
kwargs["seed"] = int(v)
else:
raise ValueError("Unknown name for {}: {}".format(name, k))

sampler = optuna.samplers.RandomSampler(**kwargs)

else:
raise ValueError("Unknown algorithm name: {}".format(name))

return sampler

def _ask(self, request_number):
list_of_assignments = []
for _ in range(request_number):
optuna_trial = self.study.ask(fixed_distributions=self._get_optuna_search_space())

assignments = [Assignment(k,v) for k,v in optuna_trial.params.items()]
list_of_assignments.append(assignments)

assignments_key = self._get_assignments_key(assignments)
self.assignments_to_optuna_number[assignments_key].append(optuna_trial.number)

return list_of_assignments

def _tell(self, trials):
for trial in trials:
if trial.name not in self.recorded_trial_names:
self.recorded_trial_names.add(trial.name)

value = float(trial.target_metric.value)
assignments_key = self._get_assignments_key(trial.assignments)
optuna_trial_numbers = self.assignments_to_optuna_number[assignments_key]

if len(optuna_trial_numbers) != 0:
trial_number = optuna_trial_numbers.pop(0)
self.study.tell(trial_number, value)
else:
raise ValueError("An unknown trial has been passed in the GetSuggestion request.")

def _get_assignments_key(self, assignments):
assignments = sorted(assignments, key=lambda a: a.name)
assignments_str = [f"{a.name}:{a.value}" for a in assignments]
return ",".join(assignments_str)

def _get_optuna_search_space(self):
search_space = {}
for param in self.search_space.params:
if param.type == INTEGER:
search_space[param.name] = optuna.distributions.IntUniformDistribution(int(param.min), int(param.max))
elif param.type == DOUBLE:
search_space[param.name] = optuna.distributions.UniformDistribution(float(param.min), float(param.max))
elif param.type == CATEGORICAL or param.type == DISCRETE:
search_space[param.name] = optuna.distributions.CategoricalDistribution(param.list)
return search_space

def _get_casted_assignment_value(self, assignment):
for param in self.search_space.params:
if param.name == assignment.name:
if param.type == INTEGER:
return int(assignment.value)
elif param.type == DOUBLE:
return float(assignment.value)
elif param.type == CATEGORICAL or param.type == DISCRETE:
return assignment.value
else:
raise ValueError("Unknown parameter type: {}".format(param.type))
raise ValueError("Parameter not found in the search space: {}".format(param.name))
3 changes: 3 additions & 0 deletions scripts/v1beta1/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ docker build -t ${REGISTRY}/suggestion-skopt:${TAG} -f ${CMD_PREFIX}/suggestion/
echo -e "\nBuilding goptuna suggestion...\n"
docker build -t ${REGISTRY}/suggestion-goptuna:${TAG} -f ${CMD_PREFIX}/suggestion/goptuna/${VERSION}/Dockerfile .

echo -e "\nBuilding optuna suggestion...\n"
docker build -t ${REGISTRY}/suggestion-optuna:${TAG} -f ${CMD_PREFIX}/suggestion/optuna/${VERSION}/Dockerfile .

echo -e "\nBuilding ENAS suggestion...\n"
if [ $MACHINE_ARCH == "aarch64" ]; then
docker build -t ${REGISTRY}/suggestion-enas:${TAG} -f ${CMD_PREFIX}/suggestion/nas/enas/${VERSION}/Dockerfile.aarch64 .
Expand Down
1 change: 1 addition & 0 deletions test/scripts/v1beta1/python-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ pip install -r test/suggestion/v1beta1/test_requirements.txt
pip install -r cmd/suggestion/chocolate/v1beta1/requirements.txt
pip install -r cmd/suggestion/hyperopt/v1beta1/requirements.txt
pip install -r cmd/suggestion/skopt/v1beta1/requirements.txt
pip install -r cmd/suggestion/optuna/v1beta1/requirements.txt
pip install -r cmd/suggestion/nas/enas/v1beta1/requirements.txt
pip install -r cmd/suggestion/hyperband/v1beta1/requirements.txt
pip install -r cmd/suggestion/nas/darts/v1beta1/requirements.txt
Expand Down
Loading

0 comments on commit 7439a37

Please sign in to comment.