From 5d2b568f23c97d3d15ff5b057ee84e8fa273694e Mon Sep 17 00:00:00 2001 From: Daniela Plascencia Date: Wed, 16 Aug 2023 17:04:36 +0200 Subject: [PATCH 1/4] Update kubeflow/katib manifests from v0.16.0-rc.1 --- README.md | 2 +- .../cert-generator/cert-generator.yaml | 27 ------- .../cert-generator/kustomization.yaml | 7 -- .../components/cert-generator/rbac.yaml | 48 ----------- .../components/controller/controller.yaml | 16 ++-- .../components/controller/katib-config.yaml | 81 ------------------- .../components/controller/kustomization.yaml | 1 - .../upstream/components/controller/rbac.yaml | 19 +++++ .../controller/trial-templates.yaml | 8 +- .../components/db-manager/db-manager.yaml | 4 +- .../upstream/components/webhook/webhooks.yaml | 13 ++- .../katib-cert-manager/katib-config.yaml | 59 ++++++++++++++ .../katib-cert-manager/kustomization.yaml | 14 +++- .../katib-external-db/katib-config.yaml | 61 ++++++++++++++ .../katib-external-db/kustomization.yaml | 22 +++-- .../katib-leader-election/katib-config.yaml | 62 ++++++++++++++ .../katib-leader-election/kustomization.yaml | 14 ++-- .../patches/controller.yaml | 4 - .../katib-openshift/katib-config.yaml | 59 ++++++++++++++ .../katib-openshift/kustomization.yaml | 14 +++- .../katib-config.yaml | 61 ++++++++++++++ .../kustomization.yaml | 23 ++++-- .../katib-standalone/katib-config.yaml | 61 ++++++++++++++ .../katib-standalone/kustomization.yaml | 23 ++++-- .../katib-with-kubeflow/kustomization.yaml | 6 +- 25 files changed, 484 insertions(+), 225 deletions(-) delete mode 100644 apps/katib/upstream/components/cert-generator/cert-generator.yaml delete mode 100644 apps/katib/upstream/components/cert-generator/kustomization.yaml delete mode 100644 apps/katib/upstream/components/cert-generator/rbac.yaml delete mode 100644 apps/katib/upstream/components/controller/katib-config.yaml create mode 100644 apps/katib/upstream/installs/katib-cert-manager/katib-config.yaml create mode 100644 apps/katib/upstream/installs/katib-external-db/katib-config.yaml create mode 100644 apps/katib/upstream/installs/katib-leader-election/katib-config.yaml delete mode 100644 apps/katib/upstream/installs/katib-leader-election/patches/controller.yaml create mode 100644 apps/katib/upstream/installs/katib-openshift/katib-config.yaml create mode 100644 apps/katib/upstream/installs/katib-standalone-postgres/katib-config.yaml create mode 100644 apps/katib/upstream/installs/katib-standalone/katib-config.yaml diff --git a/README.md b/README.md index ec29040422..fa1faeca8d 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ This repo periodically syncs all official Kubeflow components from their respect | Jupyter Web App | apps/jupyter/jupyter-web-app/upstream | [v1.7.0-rc.0](https://github.com/kubeflow/kubeflow/tree/v1.7.0-rc.0/components/crud-web-apps/jupyter/manifests) | | Tensorboards Web App | apps/tensorboard/tensorboards-web-app/upstream | [v1.7.0-rc.0](https://github.com/kubeflow/kubeflow/tree/v1.7.0-rc.0/components/crud-web-apps/tensorboards/manifests) | | Volumes Web App | apps/volumes-web-app/upstream | [v1.7.0-rc.0](https://github.com/kubeflow/kubeflow/tree/v1.7.0-rc.0/components/crud-web-apps/volumes/manifests) | -| Katib | apps/katib/upstream | [v0.15.0-rc.0](https://github.com/kubeflow/katib/tree/v0.15.0-rc.0/manifests/v1beta1) | +| Katib | apps/katib/upstream | [v0.16.0-rc.1](https://github.com/kubeflow/katib/tree/v0.16.0-rc.1/manifests/v1beta1) | | KServe | contrib/kserve/kserve | [v0.10.0](https://github.com/kserve/kserve/tree/v0.10.0/install/v0.10.0) | | KServe Models Web App | contrib/kserve/models-web-app | [v0.10.0](https://github.com/kserve/models-web-app/tree/v0.10.0/config) | | Kubeflow Pipelines | apps/pipeline/upstream | [2.0.0-alpha.7](https://github.com/kubeflow/pipelines/tree/2.0.0-alpha.7/manifests/kustomize) | diff --git a/apps/katib/upstream/components/cert-generator/cert-generator.yaml b/apps/katib/upstream/components/cert-generator/cert-generator.yaml deleted file mode 100644 index 3f06b26d9d..0000000000 --- a/apps/katib/upstream/components/cert-generator/cert-generator.yaml +++ /dev/null @@ -1,27 +0,0 @@ ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: katib-cert-generator - namespace: kubeflow - labels: - katib.kubeflow.org/component: cert-generator -spec: - template: - metadata: - annotations: - sidecar.istio.io/inject: "false" - spec: - serviceAccountName: katib-cert-generator - containers: - - name: cert-generator - image: docker.io/kubeflowkatib/cert-generator - command: ["./katib-cert-generator"] - args: ["generate", "--namespace=$(KATIB_CORE_NAMESPACE)"] - env: - - name: KATIB_CORE_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - restartPolicy: Never - backoffLimit: 4 diff --git a/apps/katib/upstream/components/cert-generator/kustomization.yaml b/apps/katib/upstream/components/cert-generator/kustomization.yaml deleted file mode 100644 index f1536e8071..0000000000 --- a/apps/katib/upstream/components/cert-generator/kustomization.yaml +++ /dev/null @@ -1,7 +0,0 @@ ---- -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -resources: - - cert-generator.yaml - - rbac.yaml diff --git a/apps/katib/upstream/components/cert-generator/rbac.yaml b/apps/katib/upstream/components/cert-generator/rbac.yaml deleted file mode 100644 index d53c8609a2..0000000000 --- a/apps/katib/upstream/components/cert-generator/rbac.yaml +++ /dev/null @@ -1,48 +0,0 @@ ---- -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: katib-cert-generator -rules: - - apiGroups: - - "" - resources: - - secrets - - services - verbs: - - get - - create - - delete - - apiGroups: - - batch - resources: - - jobs - verbs: - - get - - apiGroups: - - admissionregistration.k8s.io - resources: - - validatingwebhookconfigurations - - mutatingwebhookconfigurations - verbs: - - get - - patch ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: katib-cert-generator - namespace: kubeflow ---- -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: katib-cert-generator -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: katib-cert-generator -subjects: - - kind: ServiceAccount - name: katib-cert-generator - namespace: kubeflow diff --git a/apps/katib/upstream/components/controller/controller.yaml b/apps/katib/upstream/components/controller/controller.yaml index 51487d1631..06579ff08c 100644 --- a/apps/katib/upstream/components/controller/controller.yaml +++ b/apps/katib/upstream/components/controller/controller.yaml @@ -15,6 +15,7 @@ spec: metadata: labels: katib.kubeflow.org/component: controller + katib.kubeflow.org/metrics-collector-injection: disabled annotations: prometheus.io/scrape: "true" prometheus.io/port: "8080" @@ -26,13 +27,7 @@ spec: image: docker.io/kubeflowkatib/katib-controller command: ["./katib-controller"] args: - - "--webhook-port=8443" - - "--trial-resources=Job.v1.batch" - - "--trial-resources=TFJob.v1.kubeflow.org" - - "--trial-resources=PyTorchJob.v1.kubeflow.org" - - "--trial-resources=MPIJob.v1.kubeflow.org" - - "--trial-resources=XGBoostJob.v1.kubeflow.org" - - "--trial-resources=MXJob.v1.kubeflow.org" + - --katib-config=/katib-config.yaml ports: - containerPort: 8443 name: webhook @@ -60,8 +55,15 @@ spec: - mountPath: /tmp/cert name: cert readOnly: true + - mountPath: /katib-config.yaml + name: katib-config + subPath: katib-config.yaml + readOnly: true volumes: - name: cert secret: defaultMode: 420 secretName: katib-webhook-cert + - name: katib-config + configMap: + name: katib-config diff --git a/apps/katib/upstream/components/controller/katib-config.yaml b/apps/katib/upstream/components/controller/katib-config.yaml deleted file mode 100644 index 7fa27c9625..0000000000 --- a/apps/katib/upstream/components/controller/katib-config.yaml +++ /dev/null @@ -1,81 +0,0 @@ ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: katib-config - namespace: kubeflow -data: - metrics-collector-sidecar: |- - { - "StdOut": { - "image": "docker.io/kubeflowkatib/file-metrics-collector:v0.15.0-rc.0" - }, - "File": { - "image": "docker.io/kubeflowkatib/file-metrics-collector:v0.15.0-rc.0" - }, - "TensorFlowEvent": { - "image": "docker.io/kubeflowkatib/tfevent-metrics-collector:v0.15.0-rc.0", - "resources": { - "limits": { - "memory": "1Gi" - } - } - } - } - suggestion: |- - { - "random": { - "image": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.15.0-rc.0" - }, - "tpe": { - "image": "docker.io/kubeflowkatib/suggestion-hyperopt:v0.15.0-rc.0" - }, - "grid": { - "image": "docker.io/kubeflowkatib/suggestion-optuna:v0.15.0-rc.0" - }, - "hyperband": { - "image": "docker.io/kubeflowkatib/suggestion-hyperband:v0.15.0-rc.0" - }, - "bayesianoptimization": { - "image": "docker.io/kubeflowkatib/suggestion-skopt:v0.15.0-rc.0" - }, - "cmaes": { - "image": "docker.io/kubeflowkatib/suggestion-goptuna:v0.15.0-rc.0" - }, - "sobol": { - "image": "docker.io/kubeflowkatib/suggestion-goptuna:v0.15.0-rc.0" - }, - "multivariate-tpe": { - "image": "docker.io/kubeflowkatib/suggestion-optuna:v0.15.0-rc.0" - }, - "enas": { - "image": "docker.io/kubeflowkatib/suggestion-enas:v0.15.0-rc.0", - "resources": { - "limits": { - "memory": "200Mi" - } - } - }, - "darts": { - "image": "docker.io/kubeflowkatib/suggestion-darts:v0.15.0-rc.0" - }, - "pbt": { - "image": "docker.io/kubeflowkatib/suggestion-pbt:v0.15.0-rc.0", - "persistentVolumeClaimSpec": { - "accessModes": [ - "ReadWriteMany" - ], - "resources": { - "requests": { - "storage": "5Gi" - } - } - } - } - } - early-stopping: |- - { - "medianstop": { - "image": "docker.io/kubeflowkatib/earlystopping-medianstop:v0.15.0-rc.0" - } - } diff --git a/apps/katib/upstream/components/controller/kustomization.yaml b/apps/katib/upstream/components/controller/kustomization.yaml index 9d410a6fcf..18979ddba4 100644 --- a/apps/katib/upstream/components/controller/kustomization.yaml +++ b/apps/katib/upstream/components/controller/kustomization.yaml @@ -4,7 +4,6 @@ kind: Kustomization resources: - controller.yaml - - katib-config.yaml - rbac.yaml - service.yaml - trial-templates.yaml diff --git a/apps/katib/upstream/components/controller/rbac.yaml b/apps/katib/upstream/components/controller/rbac.yaml index 68db66b558..3118ea7ded 100644 --- a/apps/katib/upstream/components/controller/rbac.yaml +++ b/apps/katib/upstream/components/controller/rbac.yaml @@ -49,6 +49,15 @@ rules: - pods/status verbs: - "get" + - apiGroups: + - "" + resources: + - secrets + verbs: + - "get" + - "list" + - "watch" + - "patch" - apiGroups: - apps resources: @@ -108,6 +117,16 @@ rules: - suggestions/finalizers verbs: - "*" + - apiGroups: + - admissionregistration.k8s.io + resources: + - validatingwebhookconfigurations + - mutatingwebhookconfigurations + verbs: + - "get" + - "watch" + - "list" + - "patch" --- apiVersion: v1 kind: ServiceAccount diff --git a/apps/katib/upstream/components/controller/trial-templates.yaml b/apps/katib/upstream/components/controller/trial-templates.yaml index 4d290054cd..0b476228fe 100644 --- a/apps/katib/upstream/components/controller/trial-templates.yaml +++ b/apps/katib/upstream/components/controller/trial-templates.yaml @@ -15,7 +15,7 @@ data: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist:v0.15.0-rc.0 + image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0-rc.1 command: - "python3" - "/opt/mxnet-mnist/mnist.py" @@ -33,7 +33,7 @@ data: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:v0.15.0-rc.0 + image: docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:v0.16.0-rc.1 command: - python3 - -u @@ -54,7 +54,7 @@ data: spec: containers: - name: pytorch - image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.15.0-rc.0 + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.16.0-rc.1 command: - "python3" - "/opt/pytorch-mnist/mnist.py" @@ -68,7 +68,7 @@ data: spec: containers: - name: pytorch - image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.15.0-rc.0 + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.16.0-rc.1 command: - "python3" - "/opt/pytorch-mnist/mnist.py" diff --git a/apps/katib/upstream/components/db-manager/db-manager.yaml b/apps/katib/upstream/components/db-manager/db-manager.yaml index 41b8b3f930..f4e6e455b0 100644 --- a/apps/katib/upstream/components/db-manager/db-manager.yaml +++ b/apps/katib/upstream/components/db-manager/db-manager.yaml @@ -35,8 +35,8 @@ spec: - name: api containerPort: 6789 livenessProbe: - exec: - command: ["/bin/grpc_health_probe", "-addr=:6789"] + grpc: + port: 6789 initialDelaySeconds: 10 periodSeconds: 60 failureThreshold: 5 diff --git a/apps/katib/upstream/components/webhook/webhooks.yaml b/apps/katib/upstream/components/webhook/webhooks.yaml index d9297340c7..bb3f291daf 100644 --- a/apps/katib/upstream/components/webhook/webhooks.yaml +++ b/apps/katib/upstream/components/webhook/webhooks.yaml @@ -6,7 +6,6 @@ metadata: webhooks: - name: validator.experiment.katib.kubeflow.org sideEffects: None - failurePolicy: Ignore admissionReviewVersions: - v1 clientConfig: @@ -33,7 +32,6 @@ metadata: webhooks: - name: defaulter.experiment.katib.kubeflow.org sideEffects: None - failurePolicy: Ignore admissionReviewVersions: - v1 clientConfig: @@ -54,7 +52,6 @@ webhooks: - experiments - name: mutator.pod.katib.kubeflow.org sideEffects: None - failurePolicy: Ignore admissionReviewVersions: - v1 clientConfig: @@ -66,6 +63,16 @@ webhooks: namespaceSelector: matchLabels: katib.kubeflow.org/metrics-collector-injection: enabled + # Once the AdmissionWebhookMatchConditions feature gate is enabled by default, we should switch to control based on userInfo. + # REF: + # - AdmissionWebhookMatchConditions: https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#matching-requests-matchconditions + # - Tracking issue: https://github.com/kubeflow/katib/issues/2206 + objectSelector: + matchExpressions: + - key: katib.kubeflow.org/metrics-collector-injection + operator: NotIn + values: + - disabled rules: - apiGroups: - "" diff --git a/apps/katib/upstream/installs/katib-cert-manager/katib-config.yaml b/apps/katib/upstream/installs/katib-cert-manager/katib-config.yaml new file mode 100644 index 0000000000..ff45d96c9e --- /dev/null +++ b/apps/katib/upstream/installs/katib-cert-manager/katib-config.yaml @@ -0,0 +1,59 @@ +--- +apiVersion: config.kubeflow.org/v1beta1 +kind: KatibConfig +init: + controller: + webhookPort: 8443 + trialResources: + - Job.v1.batch + - TFJob.v1.kubeflow.org + - PyTorchJob.v1.kubeflow.org + - MPIJob.v1.kubeflow.org + - XGBoostJob.v1.kubeflow.org + - MXJob.v1.kubeflow.org +runtime: + metricsCollectors: + - kind: StdOut + image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0-rc.1 + - kind: File + image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0-rc.1 + - kind: TensorFlowEvent + image: docker.io/kubeflowkatib/tfevent-metrics-collector:v0.16.0-rc.1 + resources: + limits: + memory: 1Gi + suggestions: + - algorithmName: random + image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0-rc.1 + - algorithmName: tpe + image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0-rc.1 + - algorithmName: grid + image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0-rc.1 + - algorithmName: hyperband + image: docker.io/kubeflowkatib/suggestion-hyperband:v0.16.0-rc.1 + - algorithmName: bayesianoptimization + image: docker.io/kubeflowkatib/suggestion-skopt:v0.16.0-rc.1 + - algorithmName: cmaes + image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0-rc.1 + - algorithmName: sobol + image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0-rc.1 + - algorithmName: multivariate-tpe + image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0-rc.1 + - algorithmName: enas + image: docker.io/kubeflowkatib/suggestion-enas:v0.16.0-rc.1 + resources: + limits: + memory: 200Mi + - algorithmName: darts + image: docker.io/kubeflowkatib/suggestion-darts:v0.16.0-rc.1 + - algorithmName: pbt + image: docker.io/kubeflowkatib/suggestion-pbt:v0.16.0-rc.1 + persistentVolumeClaimSpec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 5Gi + earlyStoppings: + - algorithmName: medianstop + image: docker.io/kubeflowkatib/earlystopping-medianstop:v0.16.0-rc.1 diff --git a/apps/katib/upstream/installs/katib-cert-manager/kustomization.yaml b/apps/katib/upstream/installs/katib-cert-manager/kustomization.yaml index 32b8cb230a..d021cbb000 100644 --- a/apps/katib/upstream/installs/katib-cert-manager/kustomization.yaml +++ b/apps/katib/upstream/installs/katib-cert-manager/kustomization.yaml @@ -22,13 +22,13 @@ resources: images: - name: docker.io/kubeflowkatib/katib-controller newName: docker.io/kubeflowkatib/katib-controller - newTag: v0.15.0-rc.0 + newTag: v0.16.0-rc.1 - name: docker.io/kubeflowkatib/katib-db-manager newName: docker.io/kubeflowkatib/katib-db-manager - newTag: v0.15.0-rc.0 + newTag: v0.16.0-rc.1 - name: docker.io/kubeflowkatib/katib-ui newName: docker.io/kubeflowkatib/katib-ui - newTag: v0.15.0-rc.0 + newTag: v0.16.0-rc.1 patchesStrategicMerge: - patches/katib-cert-injection.yaml @@ -59,3 +59,11 @@ vars: configurations: - params.yaml + +configMapGenerator: + - name: katib-config + behavior: create + files: + - katib-config.yaml + options: + disableNameSuffixHash: true diff --git a/apps/katib/upstream/installs/katib-external-db/katib-config.yaml b/apps/katib/upstream/installs/katib-external-db/katib-config.yaml new file mode 100644 index 0000000000..f59bdd07bb --- /dev/null +++ b/apps/katib/upstream/installs/katib-external-db/katib-config.yaml @@ -0,0 +1,61 @@ +--- +apiVersion: config.kubeflow.org/v1beta1 +kind: KatibConfig +init: + certGenerator: + enable: true + controller: + webhookPort: 8443 + trialResources: + - Job.v1.batch + - TFJob.v1.kubeflow.org + - PyTorchJob.v1.kubeflow.org + - MPIJob.v1.kubeflow.org + - XGBoostJob.v1.kubeflow.org + - MXJob.v1.kubeflow.org +runtime: + metricsCollectors: + - kind: StdOut + image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0-rc.1 + - kind: File + image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0-rc.1 + - kind: TensorFlowEvent + image: docker.io/kubeflowkatib/tfevent-metrics-collector:v0.16.0-rc.1 + resources: + limits: + memory: 1Gi + suggestions: + - algorithmName: random + image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0-rc.1 + - algorithmName: tpe + image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0-rc.1 + - algorithmName: grid + image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0-rc.1 + - algorithmName: hyperband + image: docker.io/kubeflowkatib/suggestion-hyperband:v0.16.0-rc.1 + - algorithmName: bayesianoptimization + image: docker.io/kubeflowkatib/suggestion-skopt:v0.16.0-rc.1 + - algorithmName: cmaes + image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0-rc.1 + - algorithmName: sobol + image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0-rc.1 + - algorithmName: multivariate-tpe + image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0-rc.1 + - algorithmName: enas + image: docker.io/kubeflowkatib/suggestion-enas:v0.16.0-rc.1 + resources: + limits: + memory: 200Mi + - algorithmName: darts + image: docker.io/kubeflowkatib/suggestion-darts:v0.16.0-rc.1 + - algorithmName: pbt + image: docker.io/kubeflowkatib/suggestion-pbt:v0.16.0-rc.1 + persistentVolumeClaimSpec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 5Gi + earlyStoppings: + - algorithmName: medianstop + image: docker.io/kubeflowkatib/earlystopping-medianstop:v0.16.0-rc.1 diff --git a/apps/katib/upstream/installs/katib-external-db/kustomization.yaml b/apps/katib/upstream/installs/katib-external-db/kustomization.yaml index 9b56980fc9..d585552f14 100644 --- a/apps/katib/upstream/installs/katib-external-db/kustomization.yaml +++ b/apps/katib/upstream/installs/katib-external-db/kustomization.yaml @@ -13,23 +13,18 @@ resources: - ../../components/db-manager/ # Katib UI. - ../../components/ui/ - # Katib Cert Generator - - ../../components/cert-generator/ # Katib webhooks. - ../../components/webhook/ images: - name: docker.io/kubeflowkatib/katib-controller newName: docker.io/kubeflowkatib/katib-controller - newTag: v0.15.0-rc.0 + newTag: v0.16.0-rc.1 - name: docker.io/kubeflowkatib/katib-db-manager newName: docker.io/kubeflowkatib/katib-db-manager - newTag: v0.15.0-rc.0 + newTag: v0.16.0-rc.1 - name: docker.io/kubeflowkatib/katib-ui newName: docker.io/kubeflowkatib/katib-ui - newTag: v0.15.0-rc.0 - - name: docker.io/kubeflowkatib/cert-generator - newName: docker.io/kubeflowkatib/cert-generator - newTag: v0.15.0-rc.0 + newTag: v0.16.0-rc.1 patchesStrategicMerge: - patches/db-manager.yaml # Modify katib-mysql-secrets with parameters for the DB. @@ -37,3 +32,14 @@ secretGenerator: - name: katib-mysql-secrets envs: - secrets.env + # Secret for webhooks certs. + - name: katib-webhook-cert + options: + disableNameSuffixHash: true +configMapGenerator: + - name: katib-config + behavior: create + files: + - katib-config.yaml + options: + disableNameSuffixHash: true diff --git a/apps/katib/upstream/installs/katib-leader-election/katib-config.yaml b/apps/katib/upstream/installs/katib-leader-election/katib-config.yaml new file mode 100644 index 0000000000..6af98b833e --- /dev/null +++ b/apps/katib/upstream/installs/katib-leader-election/katib-config.yaml @@ -0,0 +1,62 @@ +--- +apiVersion: config.kubeflow.org/v1beta1 +kind: KatibConfig +init: + certGenerator: + enable: true + controller: + webhookPort: 8443 + enableLeaderElection: true + trialResources: + - Job.v1.batch + - TFJob.v1.kubeflow.org + - PyTorchJob.v1.kubeflow.org + - MPIJob.v1.kubeflow.org + - XGBoostJob.v1.kubeflow.org + - MXJob.v1.kubeflow.org +runtime: + metricsCollectors: + - kind: StdOut + image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0-rc.1 + - kind: File + image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0-rc.1 + - kind: TensorFlowEvent + image: docker.io/kubeflowkatib/tfevent-metrics-collector:v0.16.0-rc.1 + resources: + limits: + memory: 1Gi + suggestions: + - algorithmName: random + image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0-rc.1 + - algorithmName: tpe + image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0-rc.1 + - algorithmName: grid + image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0-rc.1 + - algorithmName: hyperband + image: docker.io/kubeflowkatib/suggestion-hyperband:v0.16.0-rc.1 + - algorithmName: bayesianoptimization + image: docker.io/kubeflowkatib/suggestion-skopt:v0.16.0-rc.1 + - algorithmName: cmaes + image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0-rc.1 + - algorithmName: sobol + image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0-rc.1 + - algorithmName: multivariate-tpe + image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0-rc.1 + - algorithmName: enas + image: docker.io/kubeflowkatib/suggestion-enas:v0.16.0-rc.1 + resources: + limits: + memory: 200Mi + - algorithmName: darts + image: docker.io/kubeflowkatib/suggestion-darts:v0.16.0-rc.1 + - algorithmName: pbt + image: docker.io/kubeflowkatib/suggestion-pbt:v0.16.0-rc.1 + persistentVolumeClaimSpec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 5Gi + earlyStoppings: + - algorithmName: medianstop + image: docker.io/kubeflowkatib/earlystopping-medianstop:v0.16.0-rc.1 diff --git a/apps/katib/upstream/installs/katib-leader-election/kustomization.yaml b/apps/katib/upstream/installs/katib-leader-election/kustomization.yaml index ce21834cc5..64b8a1554b 100644 --- a/apps/katib/upstream/installs/katib-leader-election/kustomization.yaml +++ b/apps/katib/upstream/installs/katib-leader-election/kustomization.yaml @@ -9,10 +9,10 @@ resources: replicas: - name: katib-controller count: 2 -patchesJson6902: - - target: - group: apps - version: v1 - kind: Deployment - name: katib-controller - path: ./patches/controller.yaml +configMapGenerator: + - name: katib-config + behavior: replace + files: + - katib-config.yaml + options: + disableNameSuffixHash: true diff --git a/apps/katib/upstream/installs/katib-leader-election/patches/controller.yaml b/apps/katib/upstream/installs/katib-leader-election/patches/controller.yaml deleted file mode 100644 index 4e5839e297..0000000000 --- a/apps/katib/upstream/installs/katib-leader-election/patches/controller.yaml +++ /dev/null @@ -1,4 +0,0 @@ ---- -- op: add - path: /spec/template/spec/containers/0/args/- - value: "--enable-leader-election" diff --git a/apps/katib/upstream/installs/katib-openshift/katib-config.yaml b/apps/katib/upstream/installs/katib-openshift/katib-config.yaml new file mode 100644 index 0000000000..ff45d96c9e --- /dev/null +++ b/apps/katib/upstream/installs/katib-openshift/katib-config.yaml @@ -0,0 +1,59 @@ +--- +apiVersion: config.kubeflow.org/v1beta1 +kind: KatibConfig +init: + controller: + webhookPort: 8443 + trialResources: + - Job.v1.batch + - TFJob.v1.kubeflow.org + - PyTorchJob.v1.kubeflow.org + - MPIJob.v1.kubeflow.org + - XGBoostJob.v1.kubeflow.org + - MXJob.v1.kubeflow.org +runtime: + metricsCollectors: + - kind: StdOut + image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0-rc.1 + - kind: File + image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0-rc.1 + - kind: TensorFlowEvent + image: docker.io/kubeflowkatib/tfevent-metrics-collector:v0.16.0-rc.1 + resources: + limits: + memory: 1Gi + suggestions: + - algorithmName: random + image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0-rc.1 + - algorithmName: tpe + image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0-rc.1 + - algorithmName: grid + image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0-rc.1 + - algorithmName: hyperband + image: docker.io/kubeflowkatib/suggestion-hyperband:v0.16.0-rc.1 + - algorithmName: bayesianoptimization + image: docker.io/kubeflowkatib/suggestion-skopt:v0.16.0-rc.1 + - algorithmName: cmaes + image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0-rc.1 + - algorithmName: sobol + image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0-rc.1 + - algorithmName: multivariate-tpe + image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0-rc.1 + - algorithmName: enas + image: docker.io/kubeflowkatib/suggestion-enas:v0.16.0-rc.1 + resources: + limits: + memory: 200Mi + - algorithmName: darts + image: docker.io/kubeflowkatib/suggestion-darts:v0.16.0-rc.1 + - algorithmName: pbt + image: docker.io/kubeflowkatib/suggestion-pbt:v0.16.0-rc.1 + persistentVolumeClaimSpec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 5Gi + earlyStoppings: + - algorithmName: medianstop + image: docker.io/kubeflowkatib/earlystopping-medianstop:v0.16.0-rc.1 diff --git a/apps/katib/upstream/installs/katib-openshift/kustomization.yaml b/apps/katib/upstream/installs/katib-openshift/kustomization.yaml index 9013131c32..6f999901fd 100644 --- a/apps/katib/upstream/installs/katib-openshift/kustomization.yaml +++ b/apps/katib/upstream/installs/katib-openshift/kustomization.yaml @@ -30,13 +30,13 @@ resources: images: - name: docker.io/kubeflowkatib/katib-controller newName: docker.io/kubeflowkatib/katib-controller - newTag: v0.15.0-rc.0 + newTag: v0.16.0-rc.1 - name: docker.io/kubeflowkatib/katib-db-manager newName: docker.io/kubeflowkatib/katib-db-manager - newTag: v0.15.0-rc.0 + newTag: v0.16.0-rc.1 - name: docker.io/kubeflowkatib/katib-ui newName: docker.io/kubeflowkatib/katib-ui - newTag: v0.15.0-rc.0 + newTag: v0.16.0-rc.1 patchesJson6902: # Annotate Service to delegate TLS-secret generation to OpenShift service controller @@ -62,3 +62,11 @@ patchesJson6902: kind: MutatingWebhookConfiguration name: katib.kubeflow.org path: patches/webhook-inject-cabundle.yaml + +configMapGenerator: + - name: katib-config + behavior: create + files: + - katib-config.yaml + options: + disableNameSuffixHash: true diff --git a/apps/katib/upstream/installs/katib-standalone-postgres/katib-config.yaml b/apps/katib/upstream/installs/katib-standalone-postgres/katib-config.yaml new file mode 100644 index 0000000000..f59bdd07bb --- /dev/null +++ b/apps/katib/upstream/installs/katib-standalone-postgres/katib-config.yaml @@ -0,0 +1,61 @@ +--- +apiVersion: config.kubeflow.org/v1beta1 +kind: KatibConfig +init: + certGenerator: + enable: true + controller: + webhookPort: 8443 + trialResources: + - Job.v1.batch + - TFJob.v1.kubeflow.org + - PyTorchJob.v1.kubeflow.org + - MPIJob.v1.kubeflow.org + - XGBoostJob.v1.kubeflow.org + - MXJob.v1.kubeflow.org +runtime: + metricsCollectors: + - kind: StdOut + image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0-rc.1 + - kind: File + image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0-rc.1 + - kind: TensorFlowEvent + image: docker.io/kubeflowkatib/tfevent-metrics-collector:v0.16.0-rc.1 + resources: + limits: + memory: 1Gi + suggestions: + - algorithmName: random + image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0-rc.1 + - algorithmName: tpe + image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0-rc.1 + - algorithmName: grid + image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0-rc.1 + - algorithmName: hyperband + image: docker.io/kubeflowkatib/suggestion-hyperband:v0.16.0-rc.1 + - algorithmName: bayesianoptimization + image: docker.io/kubeflowkatib/suggestion-skopt:v0.16.0-rc.1 + - algorithmName: cmaes + image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0-rc.1 + - algorithmName: sobol + image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0-rc.1 + - algorithmName: multivariate-tpe + image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0-rc.1 + - algorithmName: enas + image: docker.io/kubeflowkatib/suggestion-enas:v0.16.0-rc.1 + resources: + limits: + memory: 200Mi + - algorithmName: darts + image: docker.io/kubeflowkatib/suggestion-darts:v0.16.0-rc.1 + - algorithmName: pbt + image: docker.io/kubeflowkatib/suggestion-pbt:v0.16.0-rc.1 + persistentVolumeClaimSpec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 5Gi + earlyStoppings: + - algorithmName: medianstop + image: docker.io/kubeflowkatib/earlystopping-medianstop:v0.16.0-rc.1 diff --git a/apps/katib/upstream/installs/katib-standalone-postgres/kustomization.yaml b/apps/katib/upstream/installs/katib-standalone-postgres/kustomization.yaml index 5aeeeb6776..85df4a14bd 100644 --- a/apps/katib/upstream/installs/katib-standalone-postgres/kustomization.yaml +++ b/apps/katib/upstream/installs/katib-standalone-postgres/kustomization.yaml @@ -15,23 +15,18 @@ resources: - ../../components/postgres/ # Katib UI. - ../../components/ui/ - # Katib Cert Generator - - ../../components/cert-generator/ # Katib webhooks. - ../../components/webhook/ images: - name: docker.io/kubeflowkatib/katib-controller newName: docker.io/kubeflowkatib/katib-controller - newTag: v0.15.0-rc.0 + newTag: v0.16.0-rc.1 - name: docker.io/kubeflowkatib/katib-db-manager newName: docker.io/kubeflowkatib/katib-db-manager - newTag: v0.15.0-rc.0 + newTag: v0.16.0-rc.1 - name: docker.io/kubeflowkatib/katib-ui newName: docker.io/kubeflowkatib/katib-ui - newTag: v0.15.0-rc.0 - - name: docker.io/kubeflowkatib/cert-generator - newName: docker.io/kubeflowkatib/cert-generator - newTag: v0.15.0-rc.0 + newTag: v0.16.0-rc.1 patchesJson6902: - target: group: apps @@ -39,3 +34,15 @@ patchesJson6902: kind: Deployment name: katib-db-manager path: ./patches/db-manager.yaml +configMapGenerator: + - name: katib-config + behavior: create + files: + - katib-config.yaml + options: + disableNameSuffixHash: true +# Secret for webhooks certs. +secretGenerator: + - name: katib-webhook-cert + options: + disableNameSuffixHash: true diff --git a/apps/katib/upstream/installs/katib-standalone/katib-config.yaml b/apps/katib/upstream/installs/katib-standalone/katib-config.yaml new file mode 100644 index 0000000000..f59bdd07bb --- /dev/null +++ b/apps/katib/upstream/installs/katib-standalone/katib-config.yaml @@ -0,0 +1,61 @@ +--- +apiVersion: config.kubeflow.org/v1beta1 +kind: KatibConfig +init: + certGenerator: + enable: true + controller: + webhookPort: 8443 + trialResources: + - Job.v1.batch + - TFJob.v1.kubeflow.org + - PyTorchJob.v1.kubeflow.org + - MPIJob.v1.kubeflow.org + - XGBoostJob.v1.kubeflow.org + - MXJob.v1.kubeflow.org +runtime: + metricsCollectors: + - kind: StdOut + image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0-rc.1 + - kind: File + image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0-rc.1 + - kind: TensorFlowEvent + image: docker.io/kubeflowkatib/tfevent-metrics-collector:v0.16.0-rc.1 + resources: + limits: + memory: 1Gi + suggestions: + - algorithmName: random + image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0-rc.1 + - algorithmName: tpe + image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0-rc.1 + - algorithmName: grid + image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0-rc.1 + - algorithmName: hyperband + image: docker.io/kubeflowkatib/suggestion-hyperband:v0.16.0-rc.1 + - algorithmName: bayesianoptimization + image: docker.io/kubeflowkatib/suggestion-skopt:v0.16.0-rc.1 + - algorithmName: cmaes + image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0-rc.1 + - algorithmName: sobol + image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0-rc.1 + - algorithmName: multivariate-tpe + image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0-rc.1 + - algorithmName: enas + image: docker.io/kubeflowkatib/suggestion-enas:v0.16.0-rc.1 + resources: + limits: + memory: 200Mi + - algorithmName: darts + image: docker.io/kubeflowkatib/suggestion-darts:v0.16.0-rc.1 + - algorithmName: pbt + image: docker.io/kubeflowkatib/suggestion-pbt:v0.16.0-rc.1 + persistentVolumeClaimSpec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 5Gi + earlyStoppings: + - algorithmName: medianstop + image: docker.io/kubeflowkatib/earlystopping-medianstop:v0.16.0-rc.1 diff --git a/apps/katib/upstream/installs/katib-standalone/kustomization.yaml b/apps/katib/upstream/installs/katib-standalone/kustomization.yaml index 147df4d163..fb67984948 100644 --- a/apps/katib/upstream/installs/katib-standalone/kustomization.yaml +++ b/apps/katib/upstream/installs/katib-standalone/kustomization.yaml @@ -15,20 +15,27 @@ resources: - ../../components/mysql/ # Katib UI. - ../../components/ui/ - # Katib Cert Generator - - ../../components/cert-generator/ # Katib webhooks. - ../../components/webhook/ images: - name: docker.io/kubeflowkatib/katib-controller newName: docker.io/kubeflowkatib/katib-controller - newTag: v0.15.0-rc.0 + newTag: v0.16.0-rc.1 - name: docker.io/kubeflowkatib/katib-db-manager newName: docker.io/kubeflowkatib/katib-db-manager - newTag: v0.15.0-rc.0 + newTag: v0.16.0-rc.1 - name: docker.io/kubeflowkatib/katib-ui newName: docker.io/kubeflowkatib/katib-ui - newTag: v0.15.0-rc.0 - - name: docker.io/kubeflowkatib/cert-generator - newName: docker.io/kubeflowkatib/cert-generator - newTag: v0.15.0-rc.0 + newTag: v0.16.0-rc.1 +configMapGenerator: + - name: katib-config + behavior: create + files: + - katib-config.yaml + options: + disableNameSuffixHash: true +# Secret for webhooks certs. +secretGenerator: + - name: katib-webhook-cert + options: + disableNameSuffixHash: true diff --git a/apps/katib/upstream/installs/katib-with-kubeflow/kustomization.yaml b/apps/katib/upstream/installs/katib-with-kubeflow/kustomization.yaml index a77a3b3649..f57f647d84 100644 --- a/apps/katib/upstream/installs/katib-with-kubeflow/kustomization.yaml +++ b/apps/katib/upstream/installs/katib-with-kubeflow/kustomization.yaml @@ -11,13 +11,13 @@ resources: images: - name: docker.io/kubeflowkatib/katib-controller newName: docker.io/kubeflowkatib/katib-controller - newTag: v0.15.0-rc.0 + newTag: v0.16.0-rc.1 - name: docker.io/kubeflowkatib/katib-db-manager newName: docker.io/kubeflowkatib/katib-db-manager - newTag: v0.15.0-rc.0 + newTag: v0.16.0-rc.1 - name: docker.io/kubeflowkatib/katib-ui newName: docker.io/kubeflowkatib/katib-ui - newTag: v0.15.0-rc.0 + newTag: v0.16.0-rc.1 patchesStrategicMerge: - patches/remove-namespace.yaml From 4278133f30c24a6ada4c3cfbf6a99fa2d71bddae Mon Sep 17 00:00:00 2001 From: Daniela Plascencia Date: Wed, 16 Aug 2023 18:29:15 +0200 Subject: [PATCH 2/4] ci: add debug logs --- .github/workflows/katib_kind_test.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/katib_kind_test.yaml b/.github/workflows/katib_kind_test.yaml index 29eb83e17d..f0d07df652 100644 --- a/.github/workflows/katib_kind_test.yaml +++ b/.github/workflows/katib_kind_test.yaml @@ -56,3 +56,21 @@ jobs: echo "Waiting for the Experiment to become Succeeded..." kubectl wait --for=condition=Succeeded experiments.kubeflow.org -n kubeflow-user --all --timeout 300s + + - name: Save relevant debug artifacts + if: failure() + run: | + mkdir ~/katib-debug-logs + bash -c "df -h | tee ~/katib-debug-logs/df.log" + bash -c "free -h | tee ~/katib-debug-logs/free.log" + kubectl describe deployments -A | tee ~/katib-debug-logs/deployments.log + kubectl describe nodes | tee ~/katib-debug-logs/nodes.log + kubectl describe pods -A | tee ~/katib-debug-logs/describe-pods.log + kubectl get pods -A | tee ~/katib-debug-logs/get-pods.log + + - name: Upload debug artifacts + if: failure() + uses: actions/upload-artifact@v3 + with: + name: katib-debug-logs + path: ~/katib-debug-logs From cfa8660345752f74332f6a3cdbb2d92c8fbef4fd Mon Sep 17 00:00:00 2001 From: Daniela Plascencia Date: Wed, 16 Aug 2023 18:33:18 +0200 Subject: [PATCH 3/4] ci: fix indentation --- .github/workflows/katib_kind_test.yaml | 32 +++++++++++++------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/katib_kind_test.yaml b/.github/workflows/katib_kind_test.yaml index f0d07df652..7a4085d90d 100644 --- a/.github/workflows/katib_kind_test.yaml +++ b/.github/workflows/katib_kind_test.yaml @@ -57,20 +57,20 @@ jobs: echo "Waiting for the Experiment to become Succeeded..." kubectl wait --for=condition=Succeeded experiments.kubeflow.org -n kubeflow-user --all --timeout 300s - - name: Save relevant debug artifacts - if: failure() - run: | - mkdir ~/katib-debug-logs - bash -c "df -h | tee ~/katib-debug-logs/df.log" - bash -c "free -h | tee ~/katib-debug-logs/free.log" - kubectl describe deployments -A | tee ~/katib-debug-logs/deployments.log - kubectl describe nodes | tee ~/katib-debug-logs/nodes.log - kubectl describe pods -A | tee ~/katib-debug-logs/describe-pods.log - kubectl get pods -A | tee ~/katib-debug-logs/get-pods.log + - name: Save relevant debug artifacts + if: failure() + run: | + mkdir ~/katib-debug-logs + bash -c "df -h | tee ~/katib-debug-logs/df.log" + bash -c "free -h | tee ~/katib-debug-logs/free.log" + kubectl describe deployments -A | tee ~/katib-debug-logs/deployments.log + kubectl describe nodes | tee ~/katib-debug-logs/nodes.log + kubectl describe pods -A | tee ~/katib-debug-logs/describe-pods.log + kubectl get pods -A | tee ~/katib-debug-logs/get-pods.log - - name: Upload debug artifacts - if: failure() - uses: actions/upload-artifact@v3 - with: - name: katib-debug-logs - path: ~/katib-debug-logs + - name: Upload debug artifacts + if: failure() + uses: actions/upload-artifact@v3 + with: + name: katib-debug-logs + path: ~/katib-debug-logs From 6518754d825a2d858c725d3abf5b78dba684f9d0 Mon Sep 17 00:00:00 2001 From: Daniela Plascencia Date: Wed, 16 Aug 2023 23:28:05 +0200 Subject: [PATCH 4/4] ci: remove debug artefacts --- .github/workflows/katib_kind_test.yaml | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/.github/workflows/katib_kind_test.yaml b/.github/workflows/katib_kind_test.yaml index 7a4085d90d..29eb83e17d 100644 --- a/.github/workflows/katib_kind_test.yaml +++ b/.github/workflows/katib_kind_test.yaml @@ -56,21 +56,3 @@ jobs: echo "Waiting for the Experiment to become Succeeded..." kubectl wait --for=condition=Succeeded experiments.kubeflow.org -n kubeflow-user --all --timeout 300s - - - name: Save relevant debug artifacts - if: failure() - run: | - mkdir ~/katib-debug-logs - bash -c "df -h | tee ~/katib-debug-logs/df.log" - bash -c "free -h | tee ~/katib-debug-logs/free.log" - kubectl describe deployments -A | tee ~/katib-debug-logs/deployments.log - kubectl describe nodes | tee ~/katib-debug-logs/nodes.log - kubectl describe pods -A | tee ~/katib-debug-logs/describe-pods.log - kubectl get pods -A | tee ~/katib-debug-logs/get-pods.log - - - name: Upload debug artifacts - if: failure() - uses: actions/upload-artifact@v3 - with: - name: katib-debug-logs - path: ~/katib-debug-logs