Skip to content

Commit

Permalink
hyperband suggestion service (#631)
Browse files Browse the repository at this point in the history
  • Loading branch information
hougangliu authored and k8s-ci-robot committed Jun 8, 2019
1 parent 2ef2bc8 commit da6dae1
Show file tree
Hide file tree
Showing 13 changed files with 428 additions and 8 deletions.
8 changes: 8 additions & 0 deletions cmd/suggestion/hyperband/v1alpha2/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Image for the Katib hyperband suggestion service (v1alpha2).
FROM python:3

# Copy the whole build context into the image. COPY is used instead of ADD
# because only a plain local copy is needed (no archive extraction or URL fetch).
# NOTE(review): the build context is expected to be the katib repository root,
# since WORKDIR below points at cmd/suggestion/hyperband/v1alpha2 beneath it.
COPY . /usr/src/app/github.com/kubeflow/katib
WORKDIR /usr/src/app/github.com/kubeflow/katib/cmd/suggestion/hyperband/v1alpha2
RUN pip install --no-cache-dir -r requirements.txt
# Make the repository root and pkg/api/v1alpha2/python importable by main.py.
ENV PYTHONPATH=/usr/src/app/github.com/kubeflow/katib:/usr/src/app/github.com/kubeflow/katib/pkg/api/v1alpha2/python

ENTRYPOINT ["python", "main.py"]
Empty file.
23 changes: 23 additions & 0 deletions cmd/suggestion/hyperband/v1alpha2/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import grpc
import time
from pkg.api.v1alpha2.python import api_pb2_grpc
from pkg.suggestion.v1alpha2.hyperband_service import HyperbandService
from concurrent import futures

# Length of each sleep in serve()'s keep-alive loop: one day, in seconds.
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# Address handed to server.add_insecure_port() in serve(). Despite the name,
# this is a full "host:port" string (0.0.0.0, port 6789), not just a port.
DEFAULT_PORT = "0.0.0.0:6789"

def serve():
    """Start the hyperband suggestion gRPC server and block until interrupted.

    Registers a HyperbandService instance on a gRPC server backed by a
    10-worker thread pool, binds it to DEFAULT_PORT without TLS, starts it,
    and then sleeps in a loop so the main thread stays alive. A
    KeyboardInterrupt stops the server.
    """
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    api_pb2_grpc.add_SuggestionServicer_to_server(HyperbandService(), server)
    server.add_insecure_port(DEFAULT_PORT)
    print("Listening...")
    server.start()
    try:
        # server.start() returns immediately, so keep the main thread alive.
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        # Grace period of 0: stop right away instead of draining active RPCs.
        server.stop(0)

if __name__ == "__main__":
    # Start the server only when this file is executed as a script.
    serve()
9 changes: 9 additions & 0 deletions cmd/suggestion/hyperband/v1alpha2/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
grpcio
duecredit
cloudpickle==0.5.6
numpy>=1.13.3
scikit-learn>=0.19.0
scipy>=0.19.1
forestci
protobuf
googleapis-common-protos
69 changes: 69 additions & 0 deletions examples/v1alpha2/hyperband-example.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
apiVersion: "kubeflow.org/v1alpha2"
kind: Experiment
metadata:
namespace: kubeflow
name: hyperband-example
spec:
objective:
type: maximize
goal: 0.99
objectiveMetricName: Validation-accuracy
additionalMetricNames:
- accuracy
algorithm:
algorithmName: hyperband
algorithmSettings:
- name: "resourceName"
value: "--num-epochs"
- name: "eta"
value: "3"
- name: "r_l"
value: "9"
maxFailedTrialCount: 3
parameters:
- name: --lr
parameterType: double
feasibleSpace:
min: "0.01"
max: "0.03"
- name: --num-layers
parameterType: int
feasibleSpace:
min: "2"
max: "5"
- name: --optimizer
parameterType: categorical
feasibleSpace:
list:
- sgd
- adam
- ftrl
- name: --num-epochs
parameterType: int
feasibleSpace:
min: "20"
max: "20"
trialTemplate:
goTemplate:
rawTemplate: |-
apiVersion: batch/v1
kind: Job
metadata:
name: {{.Trial}}
namespace: {{.NameSpace}}
spec:
template:
spec:
containers:
- name: {{.Trial}}
image: katib/mxnet-mnist-example
command:
- "python"
- "/mxnet/example/image-classification/train_mnist.py"
- "--batch-size=64"
{{- with .HyperParameters}}
{{- range .}}
- "{{.Name}}={{.Value}}"
{{- end}}
{{- end}}
restartPolicy: Never
24 changes: 24 additions & 0 deletions manifests/v1alpha2/suggestion/hyperband/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Deployment that runs a single replica of the Katib hyperband suggestion
# service and exposes its container port 6789 under the name "api".
# NOTE(review): extensions/v1beta1 Deployments are no longer served as of
# Kubernetes 1.16; moving to apps/v1 would also require adding spec.selector.
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
name: katib-suggestion-hyperband
namespace: kubeflow
labels:
app: katib
component: suggestion-hyperband
spec:
replicas: 1
template:
metadata:
name: katib-suggestion-hyperband
labels:
app: katib
component: suggestion-hyperband
spec:
containers:
- name: katib-suggestion-hyperband
image: katib/v1alpha2/suggestion-hyperband
imagePullPolicy: IfNotPresent
ports:
- name: api
containerPort: 6789
17 changes: 17 additions & 0 deletions manifests/v1alpha2/suggestion/hyperband/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# ClusterIP Service for the hyperband suggestion service: exposes TCP port
# 6789 (named "api") and selects pods labelled app=katib,
# component=suggestion-hyperband.
apiVersion: v1
kind: Service
metadata:
name: katib-suggestion-hyperband
namespace: kubeflow
labels:
app: katib
component: suggestion-hyperband
spec:
type: ClusterIP
ports:
- port: 6789
protocol: TCP
name: api
selector:
app: katib
component: suggestion-hyperband
4 changes: 4 additions & 0 deletions pkg/controller/v1alpha2/experiment/experiment_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,10 @@ func (r *ReconcileExperiment) createTrials(instance *experimentsv1alpha2.Experim
logger.Error(err, "Get suggestions error")
return err
}
if len(trials) == 0 {
// Some suggestion services, such as hyperband, stop generating new trials once a certain condition is satisfied.
util.UpdateExperimentStatusCondition(instance, false, true)
}
for _, trial := range trials {
if err = r.createTrialInstance(instance, trial); err != nil {
logger.Error(err, "Create trial instance error", "trial", trial)
Expand Down
11 changes: 9 additions & 2 deletions pkg/controller/v1alpha2/experiment/util/status_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func UpdateExperimentStatus(instance *experimentsv1alpha2.Experiment, trials *tr

isObjectiveGoalReached := updateTrialsSummary(instance, trials)

updateExperimentStatusCondition(instance, isObjectiveGoalReached)
UpdateExperimentStatusCondition(instance, isObjectiveGoalReached, false)
return nil

}
Expand Down Expand Up @@ -134,7 +134,7 @@ func getObjectiveMetricValue(trial trialsv1alpha2.Trial, objectiveMetricName str
return nil
}

func updateExperimentStatusCondition(instance *experimentsv1alpha2.Experiment, isObjectiveGoalReached bool) {
func UpdateExperimentStatusCondition(instance *experimentsv1alpha2.Experiment, isObjectiveGoalReached bool, getSuggestionDone bool) {

completedTrialsCount := instance.Status.TrialsSucceeded + instance.Status.TrialsFailed + instance.Status.TrialsKilled
failedTrialsCount := instance.Status.TrialsFailed
Expand All @@ -154,6 +154,13 @@ func updateExperimentStatusCondition(instance *experimentsv1alpha2.Experiment, i
return
}

if getSuggestionDone && (instance.Status.TrialsPending+instance.Status.TrialsRunning) == 0 {
msg := "Experiment has succeeded because suggestion service has reached the end"
instance.MarkExperimentStatusSucceeded(ExperimentSucceededReason, msg)
instance.Status.CompletionTime = &now
return
}

if (instance.Spec.MaxFailedTrialCount != nil) && (failedTrialsCount >= *instance.Spec.MaxFailedTrialCount) {
msg := "Experiment has failed because max failed count has reached"
instance.MarkExperimentStatusFailed(ExperimentFailedReason, msg)
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/v1alpha2/trial/trial_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ func (r *ReconcileTrial) reconcileTrial(instance *trialsv1alpha2.Trial) error {
return err
}
// Update Trial job status only if observation field is available.
// This will ensure that trial is set to be complete only if metric is collected atleast once
// This will ensure that trial is set to be complete only if metric is collected at least once
if isTrialObservationAvailable(instance) {
if err = r.UpdateTrialStatusCondition(instance, deployedJob); err != nil {
logger.Error(err, "Update trial status condition error")
Expand Down
Loading

0 comments on commit da6dae1

Please sign in to comment.