From 518d7e7f19706d7b1b6ebad2062bb705067c7fcb Mon Sep 17 00:00:00 2001 From: hougang liu Date: Tue, 22 Jan 2019 09:12:26 +0800 Subject: [PATCH 1/4] ignore tfjob/pytorch job if corresponding CRD not created --- pkg/controller/studyjob/katib_api_util.go | 4 ++ pkg/controller/studyjob/manifest_parser.go | 44 ---------------- .../studyjob/studyjob_controller.go | 10 +++- pkg/controller/studyjob/utils.go | 52 +++++++++++++++++++ 4 files changed, 64 insertions(+), 46 deletions(-) diff --git a/pkg/controller/studyjob/katib_api_util.go b/pkg/controller/studyjob/katib_api_util.go index 87e3fc14947..1b309d6c8e1 100644 --- a/pkg/controller/studyjob/katib_api_util.go +++ b/pkg/controller/studyjob/katib_api_util.go @@ -138,6 +138,10 @@ func deleteStudy(instance *katibv1alpha1.StudyJob) error { c := katibapi.NewManagerClient(conn) ctx := context.Background() studyID := instance.Status.StudyID + if studyID == "" { + // in case that information for a studyjob is not created in DB + return nil + } deleteStudyreq := &katibapi.DeleteStudyRequest{ StudyId: studyID, } diff --git a/pkg/controller/studyjob/manifest_parser.go b/pkg/controller/studyjob/manifest_parser.go index 2334d5d40a2..4635c55e512 100644 --- a/pkg/controller/studyjob/manifest_parser.go +++ b/pkg/controller/studyjob/manifest_parser.go @@ -18,7 +18,6 @@ import ( "bytes" "context" "fmt" - "log" "text/template" katibapi "github.com/kubeflow/katib/pkg/api" @@ -27,51 +26,8 @@ import ( "github.com/kubeflow/katib/pkg/manager/studyjobclient" "k8s.io/apimachinery/pkg/util/uuid" - k8syaml "k8s.io/apimachinery/pkg/util/yaml" ) -func getWorkerKind(workerSpec *katibv1alpha1.WorkerSpec) (string, error) { - var typeChecker interface{} - BUFSIZE := 1024 - _, m, err := getWorkerManifest( - nil, - "validation", - &katibapi.Trial{ - TrialId: "validation", - ParameterSet: []*katibapi.Parameter{}, - }, - workerSpec, - "", - "", - true, - ) - if err != nil { - return "", err - } - if err := k8syaml.NewYAMLOrJSONDecoder(m, BUFSIZE).Decode(&typeChecker); err != nil { - log.Printf("Yaml decode validation error %v", err) - return "", err - } - tcMap, ok := typeChecker.(map[string]interface{}) - if !ok { - return "", fmt.Errorf("Cannot get kind of worker %v", typeChecker) - } - wkind, ok := tcMap["kind"] - if !ok { - return "", fmt.Errorf("Cannot get kind of worker %v", typeChecker) - } - wkindS, ok := wkind.(string) - if !ok { - return "", fmt.Errorf("Cannot get kind of worker %v", typeChecker) - } - for _, kind := range ValidWorkerKindList { - if kind == wkindS { - return wkindS, nil - } - } - return "", fmt.Errorf("Invalid kind of worker %v", typeChecker) -} - func getWorkerManifest(c katibapi.ManagerClient, studyID string, trial *katibapi.Trial, workerSpec *katibv1alpha1.WorkerSpec, kind string, ns string, dryrun bool) (string, *bytes.Buffer, error) { var wtp *template.Template = nil var err error diff --git a/pkg/controller/studyjob/studyjob_controller.go b/pkg/controller/studyjob/studyjob_controller.go index 0fdaf74d094..42050e39b88 100644 --- a/pkg/controller/studyjob/studyjob_controller.go +++ b/pkg/controller/studyjob/studyjob_controller.go @@ -51,6 +51,10 @@ const ( cleanDataFinalizer = "clean-studyjob-data" ) +var ( + invalidCRDResources [] string +) + /** * USER ACTION REQUIRED: This is a scaffold file intended for the user to modify with their own Controller * business logic. Delete these comments after modifying this file.* @@ -117,7 +121,8 @@ func add(mgr manager.Manager, r reconcile.Reconciler) error { OwnerType: &katibv1alpha1.StudyJob{}, }) if err != nil { - return err + invalidCRDResources = append(invalidCRDResources, TFJobWorker) + log.Printf("Fail to watch TFJob resource: %v", err) } err = c.Watch( @@ -127,7 +132,8 @@ func add(mgr manager.Manager, r reconcile.Reconciler) error { OwnerType: &katibv1alpha1.StudyJob{}, }) if err != nil { - return err + invalidCRDResources = append(invalidCRDResources, PyTorchJobWorker) + log.Printf("Fail to watch PyTorchJob resource: %v", err) } return nil diff --git a/pkg/controller/studyjob/utils.go b/pkg/controller/studyjob/utils.go index a8595ac7ded..bf000eca9c8 100644 --- a/pkg/controller/studyjob/utils.go +++ b/pkg/controller/studyjob/utils.go @@ -39,6 +39,58 @@ func createWorkerJobObj(kind string) runtime.Object { return nil } +func validateWorkerResource(wkind string) error { + for _, crd := range invalidCRDResources { + if crd == wkind { + return fmt.Errorf("Cannot support %s; If CRD for it installed, please restart studyjob-controller to take effect", wkind) + } + } + return nil +} + + +func getWorkerKind(workerSpec *katibv1alpha1.WorkerSpec) (string, error) { + var typeChecker interface{} + BUFSIZE := 1024 + _, m, err := getWorkerManifest( + nil, + "validation", + &katibapi.Trial{ + TrialId: "validation", + ParameterSet: []*katibapi.Parameter{}, + }, + workerSpec, + "", + "", + true, + ) + if err != nil { + return "", err + } + if err := k8syaml.NewYAMLOrJSONDecoder(m, BUFSIZE).Decode(&typeChecker); err != nil { + log.Printf("Yaml decode validation error %v", err) + return "", err + } + tcMap, ok := typeChecker.(map[string]interface{}) + if !ok { + return "", fmt.Errorf("Cannot get kind of worker %v", typeChecker) + } + wkind, ok := tcMap["kind"] + if !ok { + return "", fmt.Errorf("Cannot get kind of worker %v", typeChecker) + } + wkindS, ok := wkind.(string) + if !ok { + return "", fmt.Errorf("Cannot get kind of worker %v", typeChecker) + } + for _, kind := range ValidWorkerKindList { + if kind == wkindS { + return wkindS, validateWorkerResource(kind) + } + } + return "", fmt.Errorf("Invalid kind of worker %v", typeChecker) +} + func validateStudy(instance *katibv1alpha1.StudyJob, namespace string) error { if instance.Spec.SuggestionSpec == nil { return fmt.Errorf("No Spec.SuggestionSpec specified.") From 3aa21425bd0c7a3851fc6efe20e0175b80a870b8 Mon Sep 17 00:00:00 2001 From: hougang liu Date: Wed, 23 Jan 2019 13:15:01 +0800 Subject: [PATCH 2/4] update log message --- pkg/controller/studyjob/utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/controller/studyjob/utils.go b/pkg/controller/studyjob/utils.go index bf000eca9c8..dded45909d8 100644 --- a/pkg/controller/studyjob/utils.go +++ b/pkg/controller/studyjob/utils.go @@ -42,7 +42,7 @@ func createWorkerJobObj(kind string) runtime.Object { func validateWorkerResource(wkind string) error { for _, crd := range invalidCRDResources { if crd == wkind { - return fmt.Errorf("Cannot support %s; If CRD for it installed, please restart studyjob-controller to take effect", wkind) + return fmt.Errorf("Cannot support %s; Please install the CRD and restart studyjob-controller", wkind) } } return nil From 7e3474932c9d116d0691dbf09d471f981e5773bb Mon Sep 17 00:00:00 2001 From: hougang liu Date: Wed, 23 Jan 2019 15:07:13 +0800 Subject: [PATCH 3/4] only ignore NoMatchError when watch CRD --- pkg/controller/studyjob/studyjob_controller.go | 10 ++++------ pkg/controller/studyjob/utils.go | 13 +++++++++++++ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/pkg/controller/studyjob/studyjob_controller.go b/pkg/controller/studyjob/studyjob_controller.go index 42050e39b88..eaba63501e2 100644 --- a/pkg/controller/studyjob/studyjob_controller.go +++ b/pkg/controller/studyjob/studyjob_controller.go @@ -120,9 +120,8 @@ func add(mgr manager.Manager, r reconcile.Reconciler) error { IsController: true, OwnerType: &katibv1alpha1.StudyJob{}, }) - if err != nil { - invalidCRDResources = append(invalidCRDResources, TFJobWorker) - log.Printf("Fail to watch TFJob resource: %v", err) + if !ignoreWatchError(err, TFJobWorker) { + return err } err = c.Watch( @@ -131,9 +130,8 @@ func add(mgr manager.Manager, r reconcile.Reconciler) error { IsController: true, OwnerType: &katibv1alpha1.StudyJob{}, }) - if err != nil { - invalidCRDResources = append(invalidCRDResources, PyTorchJobWorker) - log.Printf("Fail to watch PyTorchJob resource: %v", err) + if !ignoreWatchError(err, PyTorchJobWorker) { + return err } return nil diff --git a/pkg/controller/studyjob/utils.go b/pkg/controller/studyjob/utils.go index dded45909d8..db31ff6e1f2 100644 --- a/pkg/controller/studyjob/utils.go +++ b/pkg/controller/studyjob/utils.go @@ -22,6 +22,7 @@ import ( batchv1 "k8s.io/api/batch/v1" batchv1beta "k8s.io/api/batch/v1beta1" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" k8syaml "k8s.io/apimachinery/pkg/util/yaml" @@ -48,6 +49,18 @@ func validateWorkerResource(wkind string) error { return nil } +func ignoreWatchError(err error, job string) bool { + if err == nil { + return true + } + if meta.IsNoMatchError(err) { + invalidCRDResources = append(invalidCRDResources, job) + log.Printf("Fail to watch CRD: %v; Please install the CRD and restart studyjob-controller to support %s worker", err, job) + return true + } else { + return false + } +} func getWorkerKind(workerSpec *katibv1alpha1.WorkerSpec) (string, error) { var typeChecker interface{} From 3ef89a09aa4f5f1953a97c0e673b1912c38f541c Mon Sep 17 00:00:00 2001 From: hougang liu Date: Fri, 25 Jan 2019 07:44:46 +0800 Subject: [PATCH 4/4] refactor func name for watch error --- pkg/controller/studyjob/studyjob_controller.go | 4 ++-- pkg/controller/studyjob/utils.go | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/controller/studyjob/studyjob_controller.go b/pkg/controller/studyjob/studyjob_controller.go index eaba63501e2..245d1fc4742 100644 --- a/pkg/controller/studyjob/studyjob_controller.go +++ b/pkg/controller/studyjob/studyjob_controller.go @@ -120,7 +120,7 @@ func add(mgr manager.Manager, r reconcile.Reconciler) error { IsController: true, OwnerType: &katibv1alpha1.StudyJob{}, }) - if !ignoreWatchError(err, TFJobWorker) { + if isFatalWatchError(err, TFJobWorker) { return err } @@ -130,7 +130,7 @@ func add(mgr manager.Manager, r reconcile.Reconciler) error { IsController: true, OwnerType: &katibv1alpha1.StudyJob{}, }) - if !ignoreWatchError(err, PyTorchJobWorker) { + if isFatalWatchError(err, PyTorchJobWorker) { return err } diff --git a/pkg/controller/studyjob/utils.go b/pkg/controller/studyjob/utils.go index db31ff6e1f2..237ca102c33 100644 --- a/pkg/controller/studyjob/utils.go +++ b/pkg/controller/studyjob/utils.go @@ -49,16 +49,16 @@ func validateWorkerResource(wkind string) error { return nil } -func ignoreWatchError(err error, job string) bool { +func isFatalWatchError(err error, job string) bool { if err == nil { - return true + return false } if meta.IsNoMatchError(err) { invalidCRDResources = append(invalidCRDResources, job) log.Printf("Fail to watch CRD: %v; Please install the CRD and restart studyjob-controller to support %s worker", err, job) - return true - } else { return false + } else { + return true } }