-
Notifications
You must be signed in to change notification settings - Fork 448
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add pod level inject webhook #716
Merged
Merged
Changes from 8 commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
aabeba0
Add pod level inject webhook.
wuchunghsuan 8cccde6
Implement sidecar injection (Hard code container)
wuchunghsuan bb42b18
Inject metrics collector as a sidecar
wuchunghsuan 4f06a88
Update metrics-collector to satisfy sidecar
wuchunghsuan 5f9e8b5
Clean up test logs
wuchunghsuan e2831e7
Get experiment name and job kind
wuchunghsuan 9dbb823
Update common labels
wuchunghsuan 12608c5
Separate the sidecar metrics collector
wuchunghsuan c7c5aab
Merge remote-tracking branch 'upstream/master' into pod-webhook
wuchunghsuan File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Build stage: compile the sidecar metrics collector binary.
FROM golang:alpine AS build-env

# Copy in the go src. COPY is used rather than ADD because only a plain
# copy of the local build context is needed (no URL fetch, no archive
# auto-extraction).
COPY . /go/src/github.com/kubeflow/katib

WORKDIR /go/src/github.com/kubeflow/katib/cmd/sidecar-metricscollector

# Build a static Linux binary. The target architecture is ppc64le when the
# build host reports ppc64le, and amd64 otherwise.
RUN if [ "$(uname -m)" = "ppc64le" ]; then \
        GOARCH=ppc64le; \
    else \
        GOARCH=amd64; \
    fi; \
    CGO_ENABLED=0 GOOS=linux GOARCH="${GOARCH}" go build -a -o sidecar-metricscollector ./v1alpha2

# Copy the sidecar metrics collector binary into a thin image
FROM alpine:3.7
WORKDIR /app
COPY --from=build-env /go/src/github.com/kubeflow/katib/cmd/sidecar-metricscollector/sidecar-metricscollector .
ENTRYPOINT ["./sidecar-metricscollector"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
/* | ||
Copyright 2018 The Kubeflow Authors | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
/* | ||
MetricsCollector is a default metricscollector for worker. | ||
It will collect metrics from pod log. | ||
You should print metrics in {{MetricsName}}={{MetricsValue}} format. | ||
For example, the objective value name is F1 and the metrics are loss, your training code should print like below. | ||
--- | ||
epoch 1: | ||
batch1 loss=0.8 | ||
batch2 loss=0.6 | ||
|
||
F1=0.4 | ||
|
||
epoch 2: | ||
batch1 loss=0.4 | ||
batch2 loss=0.2 | ||
|
||
F1=0.7 | ||
--- | ||
The metrics collector will collect all logs of metrics. | ||
*/ | ||
|
||
package main | ||
|
||
import ( | ||
"context" | ||
"flag" | ||
"strings" | ||
|
||
"google.golang.org/grpc" | ||
"k8s.io/klog" | ||
|
||
api "github.com/kubeflow/katib/pkg/api/v1alpha2" | ||
"github.com/kubeflow/katib/pkg/util/v1alpha2/sidecarmetricscollector" | ||
) | ||
|
||
// Command-line flags accepted by the sidecar metrics collector. Every flag
// is a string that defaults to the empty string.
var (
	// experimentName is only logged at start-up by main.
	experimentName = flag.String("e", "", "Experiment Name")
	// trialName identifies the trial whose logs are collected and reported.
	trialName = flag.String("t", "", "Trial Name")
	// jobKind is forwarded to the collector together with the trial name.
	jobKind = flag.String("k", "", "Job Kind")
	// namespace is the namespace passed to the collector for the pod lookup.
	namespace = flag.String("n", "", "NameSpace")
	// managerService is the address main dials to reach the Katib manager.
	managerService = flag.String("m", "", "Katib Manager service")
	// metricNames is split on ";" by main to obtain the metric names.
	metricNames = flag.String("mn", "", "Metric names")
)
|
||
func main() { | ||
flag.Parse() | ||
klog.Infof("Experiment Name: %s, Trial Name: %s, Job Kind: %s", *experimentName, *trialName, *jobKind) | ||
conn, err := grpc.Dial(*managerService, grpc.WithInsecure()) | ||
if err != nil { | ||
klog.Fatalf("could not connect: %v", err) | ||
} | ||
defer conn.Close() | ||
c := api.NewManagerClient(conn) | ||
mc, err := sidecarmetricscollector.NewSidecarMetricsCollector() | ||
if err != nil { | ||
klog.Fatalf("Failed to create MetricsCollector: %v", err) | ||
} | ||
ctx := context.Background() | ||
olog, err := mc.CollectObservationLog(*trialName, *jobKind, strings.Split(*metricNames, ";"), *namespace) | ||
if err != nil { | ||
klog.Fatalf("Failed to collect logs: %v", err) | ||
} | ||
reportreq := &api.ReportObservationLogRequest{ | ||
TrialName: *trialName, | ||
ObservationLog: olog, | ||
} | ||
_, err = c.ReportObservationLog(ctx, reportreq) | ||
if err != nil { | ||
klog.Fatalf("Failed to Report logs: %v", err) | ||
} | ||
klog.Infof("Metrics reported. :\n%v", olog) | ||
return | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
117 changes: 117 additions & 0 deletions
117
pkg/util/v1alpha2/sidecarmetricscollector/sidecarmetricscollector.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
package sidecarmetricscollector | ||
|
||
import ( | ||
"bytes" | ||
"errors" | ||
"fmt" | ||
"strings" | ||
"time" | ||
|
||
apiv1 "k8s.io/api/core/v1" | ||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
"k8s.io/apimachinery/pkg/labels" | ||
"k8s.io/client-go/kubernetes" | ||
"k8s.io/klog" | ||
"sigs.k8s.io/controller-runtime/pkg/client/config" | ||
|
||
v1alpha2 "github.com/kubeflow/katib/pkg/api/v1alpha2" | ||
commonv1alpha2 "github.com/kubeflow/katib/pkg/common/v1alpha2" | ||
) | ||
|
||
type SidecarMetricsCollector struct { | ||
clientset *kubernetes.Clientset | ||
} | ||
|
||
func NewSidecarMetricsCollector() (*SidecarMetricsCollector, error) { | ||
config, err := config.GetConfig() | ||
if err != nil { | ||
return nil, err | ||
} | ||
clientset, err := kubernetes.NewForConfig(config) | ||
if err != nil { | ||
return nil, err | ||
} | ||
return &SidecarMetricsCollector{ | ||
clientset: clientset, | ||
}, nil | ||
|
||
} | ||
|
||
func (d *SidecarMetricsCollector) CollectObservationLog(tId string, jobKind string, metrics []string, namespace string) (*v1alpha2.ObservationLog, error) { | ||
labelMap := commonv1alpha2.GetJobLabelMap(jobKind, tId) | ||
pl, err := d.clientset.CoreV1().Pods(namespace).List(metav1.ListOptions{LabelSelector: labels.Set(labelMap).String(), IncludeUninitialized: true}) | ||
if err != nil { | ||
return nil, err | ||
} | ||
if len(pl.Items) == 0 { | ||
return nil, fmt.Errorf("No Pods are found in Trial %v", tId) | ||
} | ||
logopt := apiv1.PodLogOptions{Container: "tensorflow", Timestamps: true, Follow: true} | ||
reader, err := d.clientset.CoreV1().Pods(namespace).GetLogs(pl.Items[0].ObjectMeta.Name, &logopt).Stream() | ||
for err != nil { | ||
klog.Errorf("Retry to get logs, Error: %v", err) | ||
time.Sleep(time.Duration(1) * time.Second) | ||
reader, err = d.clientset.CoreV1().Pods(namespace).GetLogs(pl.Items[0].ObjectMeta.Name, &logopt).Stream() | ||
} | ||
buf := new(bytes.Buffer) | ||
buf.ReadFrom(reader) | ||
logs := buf.String() | ||
|
||
olog, err := d.parseLogs(tId, strings.Split(logs, "\n"), metrics) | ||
return olog, err | ||
} | ||
|
||
func (d *SidecarMetricsCollector) parseLogs(tId string, logs []string, metrics []string) (*v1alpha2.ObservationLog, error) { | ||
var lasterr error | ||
olog := &v1alpha2.ObservationLog{} | ||
mlogs := []*v1alpha2.MetricLog{} | ||
for _, logline := range logs { | ||
if logline == "" { | ||
continue | ||
} | ||
ls := strings.SplitN(logline, " ", 2) | ||
if len(ls) != 2 { | ||
klog.Errorf("Error parsing log: %s", logline) | ||
lasterr = errors.New("Error parsing log") | ||
continue | ||
} | ||
_, err := time.Parse(time.RFC3339Nano, ls[0]) | ||
if err != nil { | ||
klog.Errorf("Error parsing time %s: %v", ls[0], err) | ||
lasterr = err | ||
continue | ||
} | ||
kvpairs := strings.Fields(ls[1]) | ||
for _, kv := range kvpairs { | ||
v := strings.Split(kv, "=") | ||
if len(v) > 2 { | ||
klog.Infof("Ignoring trailing garbage: %s", kv) | ||
} | ||
if len(v) == 1 { | ||
continue | ||
} | ||
metricName := "" | ||
for _, m := range metrics { | ||
if v[0] == m { | ||
metricName = v[0] | ||
} | ||
} | ||
if metricName == "" { | ||
continue | ||
} | ||
timestamp := ls[0] | ||
mlogs = append(mlogs, &v1alpha2.MetricLog{ | ||
TimeStamp: timestamp, | ||
Metric: &v1alpha2.Metric{ | ||
Name: metricName, | ||
Value: v[1], | ||
}, | ||
}) | ||
} | ||
} | ||
olog.MetricLogs = mlogs | ||
if lasterr != nil { | ||
return olog, lasterr | ||
} | ||
return olog, nil | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
kubeflow/pytorch-operator#204 made it possible for PyTorch jobs to use the "job-name" label as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, updated the pytorch label name.