Skip to content

Commit

Permalink
Add ownerReference for configmap and tensorboard (#947)
Browse files Browse the repository at this point in the history
* Add ownerReference for configmap and tensorboard and fix etjob gitImage

Signed-off-by: Syulin7 <735122171@qq.com>

* Update et-operator image

Signed-off-by: Syulin7 <735122171@qq.com>

---------

Signed-off-by: Syulin7 <735122171@qq.com>
  • Loading branch information
Syulin7 authored Apr 3, 2023
1 parent 09a5715 commit 0c2d171
Show file tree
Hide file tree
Showing 15 changed files with 115 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ metadata:
controller-gen.kubebuilder.io/version: v0.6.0
git-repo: https://github.com/AliyunContainerService/et-operator.git
git-branch: master
git-commit: "1499985"
git-commit: "8aff240"
creationTimestamp: null
name: trainingjobs.kai.alibabacloud.com
spec:
Expand Down
2 changes: 1 addition & 1 deletion arena-artifacts/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ pytorch:
et:
enabled: true
image: acs/et-operator
tag: v0.1.3-aliyun-0d6b825
tag: v0.1.3-aliyun-8aff240
imagePullPolicy: IfNotPresent
resources:
limits:
Expand Down
4 changes: 4 additions & 0 deletions charts/etjob/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,7 @@

* change image repo from kube-ai to acs

### 0.3.1

* update gitImage

2 changes: 1 addition & 1 deletion charts/etjob/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ apiVersion: v1
appVersion: "1.0"
description: A Helm chart for ETJob
name: etjob
version: 0.3.0
version: 0.3.1
2 changes: 1 addition & 1 deletion charts/etjob/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ gpuCount: 0 # user define
# rsync image
rsyncImage: registry.cn-zhangjiakou.aliyuncs.com/acs/rsync:v3.1.0-aliyun
# git sync image
gitImage: registry.cn-zhangjiakou.aliyuncs.com/acs/git-sync:v3.1.1
gitImage: registry.cn-zhangjiakou.aliyuncs.com/acs/git-sync:v3.3.5

shmSize: 2Gi
privileged: false
Expand Down
4 changes: 4 additions & 0 deletions charts/pytorchjob/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,7 @@
### 0.5.0

* Support ActiveDeadlineSeconds,TTLSecondsAfterFinished

### 0.5.1

* Add service labels
2 changes: 1 addition & 1 deletion charts/pytorchjob/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ apiVersion: v1
appVersion: "1.0"
description: A Helm chart for PyTorchJob
name: pytorchjob
version: 0.5.0
version: 0.5.1
4 changes: 4 additions & 0 deletions charts/pytorchjob/templates/ingress.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ metadata:
release: {{ .Release.Name }}
heritage: {{ .Release.Service }}
createdBy: "PyTorchJob"
controller-name: pytorch-operator
group-name: kubeflow.org
job-name: { { .Release.Name } }
pytorch-job-name: { { .Release.Name } }
spec:
rules:
- http:
Expand Down
4 changes: 4 additions & 0 deletions charts/pytorchjob/templates/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ metadata:
heritage: {{ .Release.Service }}
role: tensorboard
createdBy: "PyTorchJob"
controller-name: pytorch-operator
group-name: kubeflow.org
job-name: {{ .Release.Name }}
pytorch-job-name: {{ .Release.Name }}
spec:
type: {{ .Values.tensorboardServiceType }}
ports:
Expand Down
4 changes: 4 additions & 0 deletions charts/tfjob/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,7 @@
### 0.39.0

* Support TTLSecondsAfterFinished

### 0.39.1

* Add service labels
2 changes: 1 addition & 1 deletion charts/tfjob/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ apiVersion: v1
appVersion: "1.0"
description: A Helm chart for TFJob
name: tfjob
version: 0.39.0
version: 0.39.1
2 changes: 2 additions & 0 deletions charts/tfjob/templates/ingress.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ metadata:
heritage: {{ .Release.Service }}
role: tensorboard
createdBy: "TFJob"
group-name: kubeflow.org
tf-job-name: { { .Release.Name } }
spec:
rules:
- http:
Expand Down
2 changes: 2 additions & 0 deletions charts/tfjob/templates/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ metadata:
heritage: {{ .Release.Service }}
role: tensorboard
createdBy: "TFJob"
group-name: kubeflow.org
tf-job-name: {{ .Release.Name }}
spec:
type: {{ .Values.tensorboardServiceType }}
ports:
Expand Down
79 changes: 72 additions & 7 deletions pkg/util/kubectl/kubectl.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,21 @@ package kubectl

import (
"context"
"encoding/json"
"fmt"
"github.com/kubeflow/arena/pkg/apis/config"
"io/ioutil"
v1 "k8s.io/api/apps/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"os"
"os/exec"
"strings"
"sync"

log "github.com/sirupsen/logrus"
v1 "k8s.io/api/apps/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/client-go/kubernetes"

"github.com/kubeflow/arena/pkg/apis/config"
)

var kubectlCmd = []string{"arena-kubectl"}
Expand Down Expand Up @@ -230,14 +233,15 @@ func DeleteAppConfigMap(name, namespace string) (err error) {
args := []string{"delete", "configmap", name, "--namespace", namespace}
out, err := kubectl(args)

if err != nil {
if err != nil && !strings.Contains(string(out), "Error from server (NotFound): ") {
log.Debugf("Failed to execute %s, %v with %v", "kubectl", args, err)
log.Debugf("%s", string(out))
return err
} else {
fmt.Printf("%s", string(out))
log.Debugf("configmap %s has been deleted successfully", name)
}

return err
return nil
}

/**
Expand Down Expand Up @@ -354,3 +358,64 @@ func UpdateDeployment(deploy *v1.Deployment) error {
_, err := client.AppsV1().Deployments(deploy.Namespace).Update(context.TODO(), deploy, metav1.UpdateOptions{})
return err
}

// PatchOwnerReferenceWithAppInfoFile patch tfjob / pytorchjob ownerReference
func PatchOwnerReferenceWithAppInfoFile(name, trainingType, appInfoFile, namespace string) error {
binary, err := exec.LookPath(kubectlCmd[0])
if err != nil {
return fmt.Errorf("failed to locate kubectl binary: %v", err)
}
errs := []string{}

// get training job
args := []string{binary, "get", trainingType, name, "--namespace", namespace, "-o json"}
cmd := exec.Command("bash", "-c", strings.Join(args, " "))
out, err := cmd.CombinedOutput()
if err != nil {
return fmt.Errorf("failed to get training job: %v", err)
}

obj := &unstructured.Unstructured{}
err = json.Unmarshal(out, obj)
if err != nil {
return fmt.Errorf("failed to unmarshal training job: %v", err)
}

patch := fmt.Sprintf(`-p='[{"op": "add", "path": "/metadata/ownerReferences", `+
`"value": [{"apiVersion": "%s","kind": "%s","name": "%s","uid": "%s","blockOwnerDeletion": true,"controller": true}]}]'`,
obj.GetAPIVersion(), obj.GetKind(), name, obj.GetUID())

data, err := ioutil.ReadFile(appInfoFile)
if err != nil {
return err
}
resources := strings.Split(string(data), "\n")
// add configmap
configmapName := fmt.Sprintf("%v-%v", name, trainingType)
resources = append(resources, "configmap/"+configmapName)

for _, resource := range resources {
// skip tfjob / pytorchjob.
if resource == "tfjob.kubeflow.org/"+name ||
resource == "pytorchjob.kubeflow.org/"+name {
continue
}

// patch ownerReferences
args := []string{binary, "patch", resource, "--namespace", namespace, "--type=json", patch}
log.Debugf("Exec bash -c %v", args)
cmd := exec.Command("bash", "-c", strings.Join(args, " "))
out, err := cmd.CombinedOutput()
log.Debugf("%s", string(out))
if err != nil {
errs = append(errs, string(out))
}
}

if len(errs) != 0 {
log.Debugf("Failed to patch ownerReference,reason: %v", errs)
return fmt.Errorf("%v", strings.Join(errs, "\n"))
}

return nil
}
14 changes: 13 additions & 1 deletion pkg/workflow/workflow.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ import (
"github.com/kubeflow/arena/pkg/util/kubectl"
log "github.com/sirupsen/logrus"
k8serrors "k8s.io/apimachinery/pkg/api/errors"

"github.com/kubeflow/arena/pkg/apis/types"
)

/**
Expand Down Expand Up @@ -164,7 +166,17 @@ func SubmitJob(name string, trainingType string, namespace string, values interf
return err
}

// 6. Clean up the template file
// 6. Patch OwnerReference for tfjob / pytorchjob
if trainingType == string(types.TFTrainingJob) ||
trainingType == string(types.PytorchTrainingJob) {
err := kubectl.PatchOwnerReferenceWithAppInfoFile(name, trainingType, appInfoFileName, namespace)
if err != nil {
log.Warnf("Failed to patch ownerReference %s due to %v`", name, err)
return err
}
}

// 7. Clean up the template file
if log.GetLevel() != log.DebugLevel {
err = os.Remove(valueFileName)
if err != nil {
Expand Down

0 comments on commit 0c2d171

Please sign in to comment.