diff --git a/README.md b/README.md
index c3f392c..b8a6636 100644
--- a/README.md
+++ b/README.md
@@ -298,13 +298,28 @@ All performed checks expose metrics which can be used to monitor/alert:
 
 At `/metrics` you will find the following metrics:
 
-- `kubenurse_errors_total`: Kubenurse error counter partitioned by error type
-- `kubenurse_request_duration`: a histogram for Kubenurse request duration partitioned by error type
-- `kubenurse_httpclient_request_duration_seconds`: a latency histogram of request latencies from the kubenurse http client.
-- `kubenurse_httpclient_trace_requests_total`: a latency histogram for the http
-  client _trace_ metric instrumentation, with detailed statistics for e.g.
-  `dns_start`, `got_conn` events, and more. the details can be seen in the
-  [`httptrace.go`](https://github.com/postfinance/kubenurse/blob/52767fbb280b65c06ac926dac49dd874e9ec4aee/internal/servicecheck/httptrace.go#L73)
+- ~~`kubenurse_request_duration`~~ \
+  (deprecated since v1.13.0) latency histogram for
+  request duration, replaced with the metric below.
+- `kubenurse_httpclient_request_duration_seconds` \
+  latency histogram for request duration, partitioned by request type
+- `kubenurse_httpclient_trace_request_duration_seconds` \
+  latency histogram for httpclient _trace_ metric instrumentation, partitioned
+  by request type and httptrace connection events, where events are e.g.
+  `dns_start`, `got_conn`, `tls_handshake_done`, and more. The details can be
+  seen in the
+  [`httptrace.go`](https://github.com/postfinance/kubenurse/blob/v1.13.0/internal/servicecheck/httptrace.go#L91)
   file
-- `kubenurse_neighbourhood_incoming_checks`: a gauge which reports how many
-  unique neighbours have queried the current pod in the last minute
+- `kubenurse_httpclient_requests_total` \
+  counter for the total number of http requests, partitioned by HTTP code,
+  method, and request type
+- `kubenurse_errors_total` \
+  error counter, partitioned by httptrace event and request type
+- `kubenurse_neighbourhood_incoming_checks` \
+  gauge which reports how many unique neighbours have queried the current pod
+  in the last minute
+
+For metrics partitioned with a `type` label, it is possible to know precisely
+which request type increased an error counter, or to compare the latencies of
+multiple request types, for example to see how your service and ingress
+latencies differ.
diff --git a/internal/kubenurse/server.go b/internal/kubenurse/server.go
index c43e66c..e5c8db8 100644
--- a/internal/kubenurse/server.go
+++ b/internal/kubenurse/server.go
@@ -4,7 +4,7 @@ package kubenurse
 import (
 	"context"
 	"fmt"
-	"log"
+	"log/slog"
 	"net/http"
 	"os"
 	"strconv"
@@ -122,7 +122,8 @@ func New(ctx context.Context, c client.Client) (*Server, error) { //nolint:funle
 		bucket, e := strconv.ParseFloat(bucketStr, 64)
 		if e != nil {
-			log.Fatalf("couldn't parse one of the custom histogram buckets. error:\n%v", e)
error:\n%v", e) + slog.Error("couldn't parse one of the custom histogram buckets", "bucket", bucket, "err", e) + return nil, e } histogramBuckets = append(histogramBuckets, bucket) @@ -243,6 +244,8 @@ func (s *Server) Run() error { }() } + slog.Info("kubenurse just started") + wg.Wait() close(errc) diff --git a/internal/servicecheck/httptrace.go b/internal/servicecheck/httptrace.go index baa14d2..2b1a7df 100644 --- a/internal/servicecheck/httptrace.go +++ b/internal/servicecheck/httptrace.go @@ -3,7 +3,7 @@ package servicecheck import ( "context" "crypto/tls" - "log" + "log/slog" "net/http" "net/http/httptrace" "time" @@ -24,7 +24,9 @@ func (rt RoundTripperFunc) RoundTrip(r *http.Request) (*http.Response, error) { // This collects traces and logs errors. As promhttp.InstrumentRoundTripperTrace doesn't process // errors, this is custom made and inspired by prometheus/client_golang's promhttp -func withHttptrace(registry *prometheus.Registry, next http.RoundTripper, durationHistogram []float64) http.RoundTripper { +// +//nolint:funlen // needed to pack all histograms and use them directly in the httptrace wrapper +func withHttptrace(registry *prometheus.Registry, next http.RoundTripper, durHist []float64) http.RoundTripper { httpclientReqTotal := prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: MetricsNamespace, @@ -39,7 +41,7 @@ func withHttptrace(registry *prometheus.Registry, next http.RoundTripper, durati Namespace: MetricsNamespace, Name: "httpclient_request_duration_seconds", Help: "A latency histogram of request latencies from the kubenurse http client.", - Buckets: durationHistogram, + Buckets: durHist, }, []string{"type"}, ) @@ -49,20 +51,31 @@ func withHttptrace(registry *prometheus.Registry, next http.RoundTripper, durati Namespace: MetricsNamespace, Name: "httpclient_trace_request_duration_seconds", Help: "Latency histogram for requests from the kubenurse http client. 
-			Buckets:   durationHistogram,
+			Buckets:   durHist,
 		},
 		[]string{"event", "type"},
 	)
 
-	registry.MustRegister(httpclientReqTotal, httpclientReqDuration, httpclientTraceReqDuration)
+	errorCounter := prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: MetricsNamespace,
+			Name:      "errors_total",
+			Help:      "Kubenurse error counter partitioned by httptrace event and request type",
+		},
+		[]string{"event", "type"},
+	)
+
+	registry.MustRegister(httpclientReqTotal, httpclientReqDuration, httpclientTraceReqDuration, errorCounter)
 
 	collectMetric := func(traceEventType string, start time.Time, r *http.Request, err error) {
 		td := time.Since(start).Seconds()
 		kubenurseTypeLabel := r.Context().Value(kubenurseTypeKey{}).(string)
 
-		// If we got an error inside a trace, log it and do not collect metrics
+		// If we get an error inside a trace, count it, log it and skip the latency observation
 		if err != nil {
-			log.Printf("httptrace: failed %s for %s with %v", traceEventType, kubenurseTypeLabel, err)
+			errorCounter.WithLabelValues(traceEventType, kubenurseTypeLabel).Inc()
+			slog.Error("request failure in httptrace", "event_type", traceEventType, "request_type", kubenurseTypeLabel, "err", err)
+
 			return
 		}
@@ -94,8 +107,8 @@ func withHttptrace(registry *prometheus.Registry, next http.RoundTripper, durati
 			TLSHandshakeStart: func() {
 				collectMetric("tls_handshake_start", start, r, nil)
 			},
-			TLSHandshakeDone: func(_ tls.ConnectionState, _ error) {
-				collectMetric("tls_handshake_done", start, r, nil)
+			TLSHandshakeDone: func(_ tls.ConnectionState, err error) {
+				collectMetric("tls_handshake_done", start, r, err)
 			},
 			WroteRequest: func(info httptrace.WroteRequestInfo) {
 				collectMetric("wrote_request", start, r, info.Err)
diff --git a/internal/servicecheck/neighbours.go b/internal/servicecheck/neighbours.go
index 3cf413a..b8fedee 100644
--- a/internal/servicecheck/neighbours.go
+++ b/internal/servicecheck/neighbours.go
@@ -103,7 +103,7 @@ func (c *Checker) checkNeighbours(nh []*Neighbour) {
 			return c.doRequest(ctx, "http://"+neighbour.PodIP+":8080/alwayshappy", true)
 		}
 
-		_, _ = c.measure(check, "path_"+neighbour.NodeName)
+		c.measure(check, "path_"+neighbour.NodeName)
 	}
 }
diff --git a/internal/servicecheck/servicecheck.go b/internal/servicecheck/servicecheck.go
index 0f60df6..85e55bf 100644
--- a/internal/servicecheck/servicecheck.go
+++ b/internal/servicecheck/servicecheck.go
@@ -5,7 +5,7 @@ import (
 	"context"
 	"crypto/tls"
 	"fmt"
-	"log"
+	"log/slog"
 	"net"
 	"net/http"
 	"os"
@@ -26,19 +26,11 @@ const (
 // results. Other parameters of the Checker struct need to be configured separately.
 func New(_ context.Context, cl client.Client, promRegistry *prometheus.Registry, allowUnschedulable bool, cacheTTL time.Duration, durationHistogramBuckets []float64) (*Checker, error) {
-	errorCounter := prometheus.NewCounterVec(
-		prometheus.CounterOpts{
-			Namespace: MetricsNamespace,
-			Name:      "errors_total",
-			Help:      "Kubenurse error counter partitioned by error type",
-		},
-		[]string{"type"},
-	)
-
 	// setup http transport
 	tlsConfig, err := generateTLSConfig(os.Getenv("KUBENURSE_EXTRA_CA"))
 	if err != nil {
-		log.Printf("cannot generate tlsConfig with KUBENURSE_EXTRA_CA: %s", err)
+		slog.Error("cannot generate tlsConfig with provided KUBENURSE_EXTRA_CA. Continuing with default tlsConfig",
Continuing with default tlsConfig", + "KUBENURSE_EXTRA_CA", os.Getenv("KUBENURSE_EXTRA_CA"), "err", err) tlsConfig = &tls.Config{MinVersion: tls.VersionTLS12} } @@ -70,40 +62,26 @@ func New(_ context.Context, cl client.Client, promRegistry *prometheus.Registry, client: cl, httpClient: httpClient, cacheTTL: cacheTTL, - errorCounter: errorCounter, - durationHistogram: durationHistogram, stop: make(chan struct{}), }, nil } // Run runs all servicechecks and returns the result togeter with a boolean which indicates success. The cache // is respected. -func (c *Checker) Run() (Result, bool) { - var ( - haserr bool - err error - ) - +func (c *Checker) Run() { // Run Checks res := Result{} - res.APIServerDirect, err = c.measure(c.APIServerDirect, "api_server_direct") - haserr = haserr || (err != nil) - - res.APIServerDNS, err = c.measure(c.APIServerDNS, "api_server_dns") - haserr = haserr || (err != nil) - - res.MeIngress, err = c.measure(c.MeIngress, "me_ingress") - haserr = haserr || (err != nil) - - res.MeService, err = c.measure(c.MeService, "me_service") - haserr = haserr || (err != nil) + res.APIServerDirect = c.measure(c.APIServerDirect, "api_server_direct") + res.APIServerDNS = c.measure(c.APIServerDNS, "api_server_dns") + res.MeIngress = c.measure(c.MeIngress, "me_ingress") + res.MeService = c.measure(c.MeService, "me_service") if c.SkipCheckNeighbourhood { res.NeighbourhoodState = skippedStr } else { + var err error res.Neighbourhood, err = c.GetNeighbours(context.Background(), c.KubenurseNamespace, c.NeighbourFilter) - haserr = haserr || (err != nil) // Neighbourhood special error treating if err != nil { @@ -118,8 +96,6 @@ func (c *Checker) Run() (Result, bool) { // Cache result (used for /alive handler) c.LastCheckResult = &res - - return res, haserr } // RunScheduled runs the checks in the specified interval which can be used to keep the metrics up-to-date. 
@@ -184,23 +160,13 @@ func (c *Checker) MeService(ctx context.Context) (string, error) {
 }
 
 // measure implements metric collections for the check
-func (c *Checker) measure(check Check, label string) (string, error) {
-	start := time.Now()
-
+func (c *Checker) measure(check Check, requestType string) string {
 	// Add our label (check type) to the context so our http tracer can annotate
 	// metrics and errors based with the label
-	ctx := context.WithValue(context.Background(), kubenurseTypeKey{}, label)
+	ctx := context.WithValue(context.Background(), kubenurseTypeKey{}, requestType)
 
 	// Execute check
-	res, err := check(ctx)
-
-	// Process metrics
-	c.durationHistogram.WithLabelValues(label).Observe(time.Since(start).Seconds())
-
-	if err != nil {
-		log.Printf("failed request for %s with %v", label, err)
-		c.errorCounter.WithLabelValues(label).Inc()
-	}
+	res, _ := check(ctx) // this error is ignored as it is already logged in httptrace
 
-	return res, err
+	return res
 }
diff --git a/internal/servicecheck/servicecheck_test.go b/internal/servicecheck/servicecheck_test.go
index 6fba11e..0e2ec1f 100644
--- a/internal/servicecheck/servicecheck_test.go
+++ b/internal/servicecheck/servicecheck_test.go
@@ -42,9 +42,9 @@ func TestCombined(t *testing.T) {
 	t.Run("run", func(t *testing.T) {
 		r := require.New(t)
 
-		result, hadError := checker.Run()
-		r.True(hadError)
-		r.Len(result.Neighbourhood, 1)
+		checker.Run()
+
+		r.Len(checker.LastCheckResult.Neighbourhood, 1)
 	})
 
 	t.Run("scheduled", func(t *testing.T) {
diff --git a/internal/servicecheck/types.go b/internal/servicecheck/types.go
index 98151e9..09bea73 100644
--- a/internal/servicecheck/types.go
+++ b/internal/servicecheck/types.go
@@ -5,7 +5,6 @@ import (
 	"net/http"
 	"time"
 
-	"github.com/prometheus/client_golang/prometheus"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
@@ -39,10 +38,6 @@ type Checker struct {
 	// Controller runtime cached client
 	client client.Client
 
-	// metrics
-	errorCounter      *prometheus.CounterVec
-	durationHistogram *prometheus.HistogramVec
-
 	// Http Client for https requests
 	httpClient *http.Client
diff --git a/main.go b/main.go
index a35c2cc..2fe7c87 100644
--- a/main.go
+++ b/main.go
@@ -4,8 +4,7 @@ package main
 import (
 	"context"
 	"errors"
-	"fmt"
-	"log"
+	"log/slog"
 	"os"
 	"os/signal"
 	"syscall"
@@ -18,17 +17,13 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
-const (
-	nurse = "I'm ready to help you!"
-)
-
 func main() {
 	ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
 	defer cancel()
 
 	restConf, err := controllerruntime.GetConfig()
 	if err != nil {
-		log.Println(err)
+		slog.Error("error during controllerruntime.GetConfig()", "err", err)
 		return
 	}
@@ -44,13 +39,13 @@ func main() {
 	})
 
 	if err != nil {
-		log.Printf("error during cache creation: %s", err)
+		slog.Error("error during cache creation", "err", err)
 		return
 	}
 
 	go func() {
 		if err = ca.Start(ctx); err != nil && !errors.Is(err, context.Canceled) {
-			log.Printf("client cache error: %s", err)
+			slog.Error("controller-runtime client cache error", "err", err)
 			cancel()
 		}
 	}()
@@ -63,34 +58,32 @@ func main() {
 	c, err := client.New(restConf, opts)
 	if err != nil {
-		log.Printf("error while starting controller-runtime client: %s", err)
+		slog.Error("error while starting controller-runtime client", "err", err)
 		return
 	}
 
 	server, err := kubenurse.New(ctx, c)
 	if err != nil {
-		log.Printf("%s", err)
+		slog.Error("error in kubenurse.New call", "err", err)
 		return
 	}
 
 	go func() {
 		<-ctx.Done() // blocks until ctx is canceled
-		log.Println("shutting down, received signal to stop")
+		slog.Info("shutting down, received signal to stop")
 
 		// background ctx since, the "root" context is already canceled
 		shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 10*time.Second)
 		defer shutdownCancel()
 
 		if err := server.Shutdown(shutdownCtx); err != nil {
-			log.Printf("gracefully halting kubenurse server: %s", err)
+			slog.Error("error during graceful shutdown", "err", err)
 		}
 	}()
 
-	fmt.Println(nurse) // most important line of this project
-
 	// blocks, until the server is stopped by calling Shutdown()
 	if err := server.Run(); err != nil {
-		log.Printf("running kubenurse: %s", err)
+		slog.Error("error while running kubenurse", "err", err)
 	}
 }
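
For illustration only, not part of this change: a minimal Go sketch of the comparison described in the README addition, reading the p95 latency per request type from the new `kubenurse_httpclient_request_duration_seconds` histogram via the Prometheus HTTP API client. The Prometheus address, the `5m` rate window and the `0.95` quantile are assumptions that must be adapted to your environment.

```go
package main

import (
	"context"
	"fmt"
	"log/slog"
	"os"
	"time"

	"github.com/prometheus/client_golang/api"
	promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// Assumed address of the Prometheus server that scrapes kubenurse.
	client, err := api.NewClient(api.Config{Address: "http://prometheus.monitoring.svc:9090"})
	if err != nil {
		slog.Error("creating prometheus client", "err", err)
		os.Exit(1)
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// p95 latency per request type (api_server_dns, me_ingress, me_service, ...),
	// computed from the type-partitioned duration histogram.
	query := `histogram_quantile(0.95, sum by (type, le) (rate(kubenurse_httpclient_request_duration_seconds_bucket[5m])))`

	result, warnings, err := promv1.NewAPI(client).Query(ctx, query, time.Now())
	if err != nil {
		slog.Error("querying prometheus", "err", err)
		os.Exit(1)
	}

	if len(warnings) > 0 {
		slog.Warn("prometheus query warnings", "warnings", warnings)
	}

	fmt.Println(result) // one sample per request type, e.g. {type="me_ingress"} => 0.042
}
```

The same `sum by (type, le)` grouping works for `kubenurse_httpclient_trace_request_duration_seconds` (add `event` to the grouping) and for alerting on `rate(kubenurse_errors_total[5m])` per event and request type.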
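
Also illustrative only: the `log/slog` calls introduced in this diff use the default handler, so their output format is whatever the process configures. The sketch below shows one way to route them through a JSON handler; the handler choice, level, and example attributes are assumptions, not something this change sets up.

```go
package main

import (
	"log/slog"
	"os"
)

func main() {
	// Emit structured JSON so the key/value pairs ("err", "request_type", ...)
	// stay machine readable; without this, slog falls back to its text handler.
	handler := slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelInfo})
	slog.SetDefault(slog.New(handler))

	// Attribute names below mirror the ones used in this diff.
	slog.Info("kubenurse just started")
	slog.Error("request failure in httptrace",
		"event_type", "got_conn",
		"request_type", "me_ingress",
		"err", "context deadline exceeded")
}
```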