Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ Add OpenCensus controller metrics #368

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions Gopkg.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

59 changes: 59 additions & 0 deletions pkg/controller/metrics/default_views.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
Copyright 2018 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
)

var (
// DefaultPrometheusDistribution is an OpenCensus Distribution with the same
// buckets as the default buckets in the Prometheus client.
DefaultPrometheusDistribution = view.Distribution(.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10)

// ViewReconcileTotal counts ReconcileTotal with Controller and Result tags.
ViewReconcileTotal = view.View{
Name: "controller_runtime_reconcile_total",
Measure: MeasureReconcileTotal,
Aggregation: view.Count(),
TagKeys: []tag.Key{TagController, TagResult},
}

// ViewReconcileError counts ReconcileError with a Controller tag.
ViewReconcileError = view.View{
Name: "controller_runtime_reconcile_errors_total",
Measure: MeasureReconcileErrors,
Aggregation: view.Count(),
TagKeys: []tag.Key{TagController},
}
// ViewReconcileTime is a histogram of ReconcileTime with a Controller tag.
ViewReconcileTime = view.View{
Name: "controller_runtime_reconcile_time_seconds",
Measure: MeasureReconcileTime,
Aggregation: DefaultPrometheusDistribution,
TagKeys: []tag.Key{TagController},
}

// DefaultViews is an array of OpenCensus views that can be registered
// using view.Register(metrics.DefaultViews...) to export default metrics.
DefaultViews = []*view.View{
&ViewReconcileTotal,
&ViewReconcileError,
&ViewReconcileTime,
}
)
50 changes: 50 additions & 0 deletions pkg/controller/metrics/default_views_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
Copyright 2018 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"testing"

"go.opencensus.io/stats/view"
)

func TestRegisterUnregisterDefaultViews(t *testing.T) {
if err := view.Register(DefaultViews...); err != nil {
t.Fatalf("Error registering views: %v", err)
}
if view.Find(ViewReconcileTotal.Name) == nil {
t.Errorf("Couldn't find view: %s", ViewReconcileTotal.Name)
}
if view.Find(ViewReconcileError.Name) == nil {
t.Errorf("Couldn't find view: %s", ViewReconcileError.Name)
}
if view.Find(ViewReconcileTime.Name) == nil {
t.Errorf("Couldn't find view %s", ViewReconcileTime.Name)
}

view.Unregister(DefaultViews...)
if view.Find(ViewReconcileTotal.Name) != nil {
t.Errorf("view %s was not unregistered", ViewReconcileTotal.Name)
}
if view.Find(ViewReconcileError.Name) != nil {
t.Errorf("view %s was not unregistered", ViewReconcileError.Name)
}
if view.Find(ViewReconcileTime.Name) != nil {
t.Errorf("view %s was not unregistered", ViewReconcileTime.Name)
}

}
73 changes: 73 additions & 0 deletions pkg/controller/metrics/oc_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
Copyright 2018 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"go.opencensus.io/stats"
"go.opencensus.io/tag"
)

var (
// MeasureReconcileTotal is a counter which records the total number of reconciliations
// per controller. It has two tags: controller refers
// to the controller name and result refers to the reconcile result e.g.
// success, error, requeue, requeue_after.
MeasureReconcileTotal = stats.Int64(
"sigs.kubernetes.io/controller-runtime/measures/reconcile_total",
"Total number of reconciliations per controller",
stats.UnitNone,
)

// MeasureReconcileErrors is a counter which records the total
// number of errors from the Reconciler. It has one tag: controller refers
// to the controller name. TODO is this necessary when we have the above with
// error tag?
MeasureReconcileErrors = stats.Int64(
"sigs.kubernetes.io/controller-runtime/measures/reconcile_errors_total",
"Total number of reconciliation errors per controller",
stats.UnitNone,
)

// MeasureReconcileTime is a measure which keeps track of the duration
// of reconciliations. It has one tag: controller refers
// to the controller name. // TODO should this be milliseconds?
MeasureReconcileTime = stats.Float64(
"sigs.kubernetes.io/controller-runtime/measures/reconcile_time_seconds",
"Length of time per reconciliation per controller",
"s",
)

// Tag keys must conform to the restrictions described in
// go.opencensus.io/tag/validate.go. Currently those restrictions are:
// - length between 1 and 255 inclusive
// - characters are printable US-ASCII

// TagController is a tag referring to the controller name that produced a
// measurement.
TagController = mustNewTagKey("controller")

// TagResult is a tag referring to the reconcile result of a reconciler.
TagResult = mustNewTagKey("result")
)

func mustNewTagKey(k string) tag.Key {
tagKey, err := tag.NewKey(k)
if err != nil {
panic(err)
}
return tagKey
}
34 changes: 31 additions & 3 deletions pkg/internal/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,13 @@ limitations under the License.
package controller

import (
"context"
"fmt"
"sync"
"time"

"go.opencensus.io/stats"
"go.opencensus.io/tag"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
Expand All @@ -29,6 +32,7 @@ import (
"k8s.io/client-go/util/workqueue"
"sigs.k8s.io/controller-runtime/pkg/cache"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/metrics"
"sigs.k8s.io/controller-runtime/pkg/handler"
ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/internal/controller/metrics"
logf "sigs.k8s.io/controller-runtime/pkg/internal/log"
Expand Down Expand Up @@ -173,10 +177,18 @@ func (c *Controller) Start(stop <-chan struct{}) error {
func (c *Controller) processNextWorkItem() bool {
// This code copy-pasted from the sample-Controller.

// Create a context used for tagging OpenCensus measurements.
metricsCtx, _ := tag.New(context.Background(),

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If there are tags present in the background context, then those values will continue to be propagated to the newly created context. As such, if we use tag.Insert in subsequent calls for existing tags, the values existing tag keys will not be updated.

Is that desired behavior?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@logicalhan Are you saying the context returned by context.Background() may have tags present? I'm not aware of a way to add values to the background context, so I don't expect there will ever be tags present in it. As such I believe that in this code there is no effective difference between tag.Insert and tag.Upsert.

If I'm misunderstanding your question, please let me know. :)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To maintain contextual information of the specific request, tag propagation through your distributed system is necessary; the higher levels generate tags that are then passed down to the lower-level services. Data is collected with the tags and are sent to your observability systems.

Above, a request comes in to the Web server. Web server tags all the outgoing requests with the following tags.
originator=photo-app
frontend=web
These values are propagated all the way down to database and the CDN.

With these tags, you can uniquely identify and break down which service called the downstream services, how much quota they’ve been used, what calls are failing and more.

In OpenCensus, tags can be propagated downstream and the values are stored in the context. Since this is the first time OpenCensus is being introduced, this is likely safe (since nothing else would be passing along OpenCensus tag information). However, that assumption may not hold in the future.

I'm actually not saying that tag.Insert is the incorrect thing to do here. I'm just pointing out that there is a difference in behavior between Insert and Update and depending on what we actually want to do here, one would be more appropriate than the other. Insert can no-op if we pass tag information to whatever triggers the controller. That may or may not be desirable.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, Upsert is probably safer, since it effectively forces the same behavior regardless of what happens upstream.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, that clarifies things!

In this hypothetical future when the metrics context is received from an external source that may have added tags of its own, I believe tag.Insert is correct here. If the tag is already specified by the user of the library, the library should respect the user's intent and avoid overwriting their metrics tags with tag.Upsert.

Does that seem reasonable?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would imagine you would need to Upsert the result tag. If a request has gotten here, then it would presumably have been successful upstream.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually all the result tags, you probably want to upsert.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since controllers can requeue things and pick them up again later, if something gets requeued, then wouldn't we expect the context have tags? It seems that we really do want to be judicious about using Insert or Upsert, since it will change existing outputted metric data.

It may be worth adding a testcase around this by forcing a requeue and then forcing an error when it gets picked up again and comparing the measurements with what we'd expect.

tag.Insert(metrics.TagController, c.Name),
)

// Update metrics after processing each item
// TODO if no reconcile occurred, should the time metric be updated?
reconcileStartTS := time.Now()
defer func() {
c.updateMetrics(time.Now().Sub(reconcileStartTS))
reconcileTime := time.Now().Sub(reconcileStartTS)
c.updatePromMetrics(reconcileTime)
c.updateOCMetrics(metricsCtx, reconcileTime)
}()

obj, shutdown := c.Queue.Get()
Expand Down Expand Up @@ -210,21 +222,31 @@ func (c *Controller) processNextWorkItem() bool {
return true
}

// Defer recording that a reconcile occurred. Result checks will set the
// correct tag in metricsCtx.
defer func() {
stats.Record(metricsCtx, metrics.MeasureReconcileTotal.M(1))
}()
// RunInformersAndControllers the syncHandler, passing it the namespace/Name string of the
// resource to be synced.
if result, err := c.Do.Reconcile(req); err != nil {
c.Queue.AddRateLimited(req)
log.Error(err, "Reconciler error", "controller", c.Name, "request", req)
ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Inc()
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, "error").Inc()
stats.Record(metricsCtx, metrics.MeasureReconcileErrors.M(1))
metricsCtx, _ = tag.New(metricsCtx, tag.Insert(metrics.TagResult, "error"))
return false
} else if result.RequeueAfter > 0 {
c.Queue.AddAfter(req, result.RequeueAfter)
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, "requeue_after").Inc()
metricsCtx, _ = tag.New(metricsCtx, tag.Insert(metrics.TagResult, "requeue_after"))
return true
} else if result.Requeue {
c.Queue.AddRateLimited(req)
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, "requeue").Inc()
metricsCtx, _ = tag.New(metricsCtx, tag.Insert(metrics.TagResult, "requeue"))

return true
}

Expand All @@ -236,6 +258,7 @@ func (c *Controller) processNextWorkItem() bool {
log.V(1).Info("Successfully Reconciled", "controller", c.Name, "request", req)

ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, "success").Inc()
metricsCtx, _ = tag.New(metricsCtx, tag.Insert(metrics.TagResult, "success"))
// Return true, don't take a break
return true
}
Expand All @@ -246,7 +269,12 @@ func (c *Controller) InjectFunc(f inject.Func) error {
return nil
}

// updateMetrics updates prometheus metrics within the controller
func (c *Controller) updateMetrics(reconcileTime time.Duration) {
// updatePromMetrics updates prometheus metrics within the controller
func (c *Controller) updatePromMetrics(reconcileTime time.Duration) {
ctrlmetrics.ReconcileTime.WithLabelValues(c.Name).Observe(reconcileTime.Seconds())
}

// updateOCMetrics updates OpenCensus metrics within the controller
func (c *Controller) updateOCMetrics(ctx context.Context, reconcileTime time.Duration) {
stats.Record(ctx, metrics.MeasureReconcileTime.M(reconcileTime.Seconds()))
}
Loading