Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: bump katib-operators version 0.15.0 -> 0.16.0-rc.1 #123

Merged
merged 22 commits into from
Aug 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ __pycache__/
.coverage
.idea
.tox
.vscode
38 changes: 19 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,25 +72,25 @@ Katib controller comes with a set of preconfigured images that are used in Katib

```json
{
"default_trial_template": "docker.io/kubeflowkatib/mxnet-mnist:v0.15.0",
"early_stopping__medianstop": "docker.io/kubeflowkatib/earlystopping-medianstop:v0.15.0",
"enas_cpu_template": "docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:v0.15.0",
"metrics_collector_sidecar__stdout": "docker.io/kubeflowkatib/file-metrics-collector:v0.15.0",
"metrics_collector_sidecar__file": "docker.io/kubeflowkatib/file-metrics-collector:v0.15.0",
"metrics_collector_sidecar__tensorflow_event": "docker.io/kubeflowkatib/tfevent-metrics-collector:v0.15.0",
"pytorch_job_template__master": "docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.15.0",
"pytorch_job_template__worker": "docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.15.0",
"suggestion__random": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.15.0",
"suggestion__tpe": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.15.0",
"suggestion__grid": "docker.io/kubeflowkatib/suggestion-optuna:v0.15.0",
"suggestion__hyperband": "docker.io/kubeflowkatib/suggestion-hyperband:v0.15.0",
"suggestion__bayesianoptimization": "docker.io/kubeflowkatib/suggestion-skopt:v0.15.0",
"suggestion__cmaes": "docker.io/kubeflowkatib/suggestion-goptuna:v0.15.0",
"suggestion__sobol": "docker.io/kubeflowkatib/suggestion-goptuna:v0.15.0",
"suggestion__multivariate_tpe": "docker.io/kubeflowkatib/suggestion-optuna:v0.15.0",
"suggestion__enas": "docker.io/kubeflowkatib/suggestion-enas:v0.15.0",
"suggestion__darts": "docker.io/kubeflowkatib/suggestion-darts:v0.15.0",
"suggestion__pbt": "docker.io/kubeflowkatib/suggestion-pbt:v0.15.0",
"default_trial_template": "docker.io/kubeflowkatib/mxnet-mnist:v0.16.0-rc.1",
"early_stopping__medianstop": "docker.io/kubeflowkatib/earlystopping-medianstop:v0.16.0-rc.1",
"enas_cpu_template": "docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:v0.16.0-rc.1",
"metrics_collector_sidecar__stdout": "docker.io/kubeflowkatib/file-metrics-collector:v0.16.0-rc.1",
"metrics_collector_sidecar__file": "docker.io/kubeflowkatib/file-metrics-collector:v0.16.0-rc.1",
"metrics_collector_sidecar__tensorflow_event": "docker.io/kubeflowkatib/tfevent-metrics-collector:v0.16.0-rc.1",
"pytorch_job_template__master": "docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.16.0-rc.1",
"pytorch_job_template__worker": "docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.16.0-rc.1",
"suggestion__random": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0-rc.1",
"suggestion__tpe": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0-rc.1",
"suggestion__grid": "docker.io/kubeflowkatib/suggestion-optuna:v0.16.0-rc.1",
"suggestion__hyperband": "docker.io/kubeflowkatib/suggestion-hyperband:v0.16.0-rc.1",
"suggestion__bayesianoptimization": "docker.io/kubeflowkatib/suggestion-skopt:v0.16.0-rc.1",
"suggestion__cmaes": "docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0-rc.1",
"suggestion__sobol": "docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0-rc.1",
"suggestion__multivariate_tpe": "docker.io/kubeflowkatib/suggestion-optuna:v0.16.0-rc.1",
"suggestion__enas": "docker.io/kubeflowkatib/suggestion-enas:v0.16.0-rc.1",
"suggestion__darts": "docker.io/kubeflowkatib/suggestion-darts:v0.16.0-rc.1",
"suggestion__pbt": "docker.io/kubeflowkatib/suggestion-pbt:v0.16.0-rc.1"
}
```

Expand Down
2 changes: 1 addition & 1 deletion charms/katib-controller/metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ resources:
type: oci-image
description: Backing OCI image
auto-fetch: true
upstream-source: docker.io/kubeflowkatib/katib-controller:v0.15.0
upstream-source: docker.io/kubeflowkatib/katib-controller:v0.16.0-rc.1
provides:
metrics-endpoint:
interface: prometheus_scrape
Expand Down
80 changes: 47 additions & 33 deletions charms/katib-controller/src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,25 @@
from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus, WaitingStatus

DEFAULT_IMAGES = {
"default_trial_template": "docker.io/kubeflowkatib/mxnet-mnist:v0.15.0",
"early_stopping__medianstop": "docker.io/kubeflowkatib/earlystopping-medianstop:v0.15.0",
"enas_cpu_template": "docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:v0.15.0",
"metrics_collector_sidecar__stdout": "docker.io/kubeflowkatib/file-metrics-collector:v0.15.0",
"metrics_collector_sidecar__file": "docker.io/kubeflowkatib/file-metrics-collector:v0.15.0",
"metrics_collector_sidecar__tensorflow_event": "docker.io/kubeflowkatib/tfevent-metrics-collector:v0.15.0", # noqa: E501
"pytorch_job_template__master": "docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.15.0",
"pytorch_job_template__worker": "docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.15.0",
"suggestion__random": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.15.0",
"suggestion__tpe": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.15.0",
"suggestion__grid": "docker.io/kubeflowkatib/suggestion-optuna:v0.15.0",
"suggestion__hyperband": "docker.io/kubeflowkatib/suggestion-hyperband:v0.15.0",
"suggestion__bayesianoptimization": "docker.io/kubeflowkatib/suggestion-skopt:v0.15.0",
"suggestion__cmaes": "docker.io/kubeflowkatib/suggestion-goptuna:v0.15.0",
"suggestion__sobol": "docker.io/kubeflowkatib/suggestion-goptuna:v0.15.0",
"suggestion__multivariate_tpe": "docker.io/kubeflowkatib/suggestion-optuna:v0.15.0",
"suggestion__enas": "docker.io/kubeflowkatib/suggestion-enas:v0.15.0",
"suggestion__darts": "docker.io/kubeflowkatib/suggestion-darts:v0.15.0",
"suggestion__pbt": "docker.io/kubeflowkatib/suggestion-pbt:v0.15.0",
"default_trial_template": "docker.io/kubeflowkatib/mxnet-mnist:v0.16.0-rc.1",
"early_stopping__medianstop": "docker.io/kubeflowkatib/earlystopping-medianstop:v0.16.0-rc.1",
"enas_cpu_template": "docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:v0.16.0-rc.1",
"metrics_collector_sidecar__stdout": "docker.io/kubeflowkatib/file-metrics-collector:v0.16.0-rc.1", # noqa: E501
"metrics_collector_sidecar__file": "docker.io/kubeflowkatib/file-metrics-collector:v0.16.0-rc.1", # noqa: E501
"metrics_collector_sidecar__tensorflow_event": "docker.io/kubeflowkatib/tfevent-metrics-collector:v0.16.0-rc.1", # noqa: E501
"pytorch_job_template__master": "docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.16.0-rc.1",
"pytorch_job_template__worker": "docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.16.0-rc.1",
"suggestion__random": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0-rc.1",
"suggestion__tpe": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0-rc.1",
"suggestion__grid": "docker.io/kubeflowkatib/suggestion-optuna:v0.16.0-rc.1",
"suggestion__hyperband": "docker.io/kubeflowkatib/suggestion-hyperband:v0.16.0-rc.1",
"suggestion__bayesianoptimization": "docker.io/kubeflowkatib/suggestion-skopt:v0.16.0-rc.1",
"suggestion__cmaes": "docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0-rc.1",
"suggestion__sobol": "docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0-rc.1",
"suggestion__multivariate_tpe": "docker.io/kubeflowkatib/suggestion-optuna:v0.16.0-rc.1",
"suggestion__enas": "docker.io/kubeflowkatib/suggestion-enas:v0.16.0-rc.1",
"suggestion__darts": "docker.io/kubeflowkatib/suggestion-darts:v0.16.0-rc.1",
"suggestion__pbt": "docker.io/kubeflowkatib/suggestion-pbt:v0.16.0-rc.1",
}

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -173,6 +173,8 @@ def set_pod_spec(self, event):
self._check_leader()
self.custom_images = parse_images_config(self.model.config["custom_images"])
self.images_context = self.get_images(DEFAULT_IMAGES, self.custom_images)
self.katib_config_context = self.images_context
self.katib_config_context["webhookPort"] = self.model.config["webhook-port"]
image_details = self._check_image_details()
except CheckFailed as check_failed:
self.model.unit.status = check_failed.status
Expand Down Expand Up @@ -201,6 +203,7 @@ def set_pod_spec(self, event):
"pods",
"pods/log",
"pods/status",
"secrets",
],
"verbs": ["*"],
},
Expand Down Expand Up @@ -239,6 +242,14 @@ def set_pod_spec(self, event):
],
"verbs": ["*"],
},
{
"apiGroups": ["admissionregistration.k8s.io"],
"resources": [
"validatingwebhookconfigurations",
"mutatingwebhookconfigurations",
],
"verbs": ["get", "watch", "list", "patch"],
},
],
}
]
Expand All @@ -248,14 +259,7 @@ def set_pod_spec(self, event):
"name": "katib-controller",
"imageDetails": image_details,
"command": ["./katib-controller"],
"args": [
f"--webhook-port={self.model.config['webhook-port']}",
"--trial-resources=Job.v1.batch",
"--trial-resources=TFJob.v1.kubeflow.org",
"--trial-resources=PyTorchJob.v1.kubeflow.org",
"--trial-resources=MPIJob.v1.kubeflow.org",
"--trial-resources=PipelineRun.v1beta1.tekton.dev",
],
"args": ["--katib-config=/katib-config/katib-config.yaml"],
"ports": [
{
"name": "webhook",
Expand All @@ -276,7 +280,20 @@ def set_pod_spec(self, event):
{"path": "tls.crt", "content": self._stored.cert},
{"path": "tls.key", "content": self._stored.key},
],
}
},
{
"name": "katib-config",
"mountPath": "/katib-config",
"files": [
{
"path": "katib-config.yaml",
"content": render_template(
"src/templates/katib-config.yaml.j2",
self.katib_config_context,
),
}
],
},
],
}
],
Expand All @@ -302,11 +319,8 @@ def set_pod_spec(self, event):
},
"configMaps": {
"katib-config": {
f: render_template(f"src/templates/{f}.json.j2", self.images_context)
for f in (
"metrics-collector-sidecar",
"suggestion",
"early-stopping",
"katib-config.yaml": render_template(
"src/templates/katib-config.yaml.j2", self.katib_config_context
)
},
"trial-template": {
Expand Down

This file was deleted.

59 changes: 59 additions & 0 deletions charms/katib-controller/src/templates/katib-config.yaml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
---
# KatibConfig template rendered by the katib-controller charm.
# The charm passes one context holding every image key referenced below plus
# webhookPort, and ships the rendered result as katib-config.yaml.
apiVersion: config.kubeflow.org/v1beta1
kind: KatibConfig
init:
  controller:
    # Left unquoted on purpose so the value renders as a YAML integer.
    # NOTE(review): presumably the controller expects a number here; confirm.
    webhookPort: {{ webhookPort }}
    # Resource kinds (Kind.version.group) the controller accepts as trials.
    trialResources:
      - Job.v1.batch
      - TFJob.v1.kubeflow.org
      - PyTorchJob.v1.kubeflow.org
      - MPIJob.v1.kubeflow.org
      - XGBoostJob.v1.kubeflow.org
      - MXJob.v1.kubeflow.org
runtime:
  # Image values are quoted so an empty or odd-looking expansion stays a string.
  metricsCollectors:
    - kind: StdOut
      image: "{{ metrics_collector_sidecar__stdout }}"
    - kind: File
      image: "{{ metrics_collector_sidecar__file }}"
    - kind: TensorFlowEvent
      image: "{{ metrics_collector_sidecar__tensorflow_event }}"
      resources:
        limits:
          memory: 1Gi
  suggestions:
    - algorithmName: random
      image: "{{ suggestion__random }}"
    - algorithmName: tpe
      image: "{{ suggestion__tpe }}"
    - algorithmName: grid
      image: "{{ suggestion__grid }}"
    - algorithmName: hyperband
      image: "{{ suggestion__hyperband }}"
    - algorithmName: bayesianoptimization
      image: "{{ suggestion__bayesianoptimization }}"
    - algorithmName: cmaes
      image: "{{ suggestion__cmaes }}"
    - algorithmName: sobol
      image: "{{ suggestion__sobol }}"
    - algorithmName: multivariate-tpe
      image: "{{ suggestion__multivariate_tpe }}"
    - algorithmName: enas
      image: "{{ suggestion__enas }}"
      resources:
        limits:
          memory: 200Mi
    - algorithmName: darts
      image: "{{ suggestion__darts }}"
    - algorithmName: pbt
      image: "{{ suggestion__pbt }}"
      # Volume claim requested for the pbt suggestion.
      persistentVolumeClaimSpec:
        accessModes:
          - ReadWriteMany
        resources:
          requests:
            storage: 5Gi
  earlyStoppings:
    - algorithmName: medianstop
      image: "{{ early_stopping__medianstop }}"

This file was deleted.

50 changes: 0 additions & 50 deletions charms/katib-controller/src/templates/suggestion.json.j2

This file was deleted.

9 changes: 6 additions & 3 deletions charms/katib-controller/src/templates/webhooks.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ metadata:
webhooks:
- name: validator.experiment.katib.kubeflow.org
sideEffects: None
failurePolicy: Fail
admissionReviewVersions:
- v1
clientConfig:
Expand All @@ -33,7 +32,6 @@ metadata:
webhooks:
- name: defaulter.experiment.katib.kubeflow.org
sideEffects: None
failurePolicy: Fail
admissionReviewVersions:
- v1
clientConfig:
Expand All @@ -54,7 +52,6 @@ webhooks:
- experiments
- name: mutator.pod.katib.kubeflow.org
sideEffects: None
failurePolicy: Fail
admissionReviewVersions:
- v1
clientConfig:
Expand All @@ -66,6 +63,12 @@ webhooks:
namespaceSelector:
matchLabels:
katib.kubeflow.org/metrics-collector-injection: enabled
objectSelector:
matchExpressions:
- key: katib.kubeflow.org/metrics-collector-injection
operator: NotIn
values:
- disabled
rules:
- apiGroups:
- ""
Expand Down
Loading
Loading