From ce6dd456fb2eae3acff99c84c8da857e47c2e479 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 14 Jun 2023 09:34:43 +0200 Subject: [PATCH] Add cortex_frontend_query_result_cache_requests_total and cortex_frontend_query_result_cache_hits_total metrics (#5235) Signed-off-by: Marco Pracucci --- CHANGELOG.md | 4 +- .../cardinality_query_cache.go | 23 +++++---- .../cardinality_query_cache_test.go | 44 +++++++++++++++-- pkg/frontend/querymiddleware/results_cache.go | 21 +++++++++ pkg/frontend/querymiddleware/roundtrip.go | 2 +- .../querymiddleware/split_and_cache.go | 6 +++ .../querymiddleware/split_and_cache_test.go | 47 ++++++++++++++++++- 7 files changed, 131 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 385be34916f..b5537b60d15 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,9 @@ * [CHANGE] Store-gateway: skip verifying index header integrity upon loading. To enable verification set `blocks_storage.bucket_store.index_header.verify_on_load: true`. * [CHANGE] Querier: change the default value of the experimental `-querier.streaming-chunks-per-ingester-buffer-size` flag to 256. #5203 * [FEATURE] Cardinality API: Add a new `count_method` parameter which enables counting active series #5136 -* [FEATURE] Query-frontend: added experimental support to cache cardinality query responses. The cache will be used when `-query-frontend.cache-results` is enabled and `-query-frontend.results-cache-ttl-for-cardinality-query` set to a value greater than 0. #5212 +* [FEATURE] Query-frontend: added experimental support to cache cardinality query responses. The cache will be used when `-query-frontend.cache-results` is enabled and `-query-frontend.results-cache-ttl-for-cardinality-query` set to a value greater than 0. The following metrics have been added to track the query results cache hit ratio per `request_type`: #5212 #5235 + * `cortex_frontend_query_result_cache_requests_total{request_type="query_range|cardinality"}` + * `cortex_frontend_query_result_cache_hits_total{request_type="query_range|cardinality"}` * [ENHANCEMENT] Cardinality API: When zone aware replication is enabled, the label values cardinality API can now tolerate single zone failure #5178 * [ENHANCEMENT] Distributor: optimize sending requests to ingesters when incoming requests don't need to be modified. For now this feature can be disabled by setting `-timeseries-unmarshal-caching-optimization-enabled=false`. #5137 * [ENHANCEMENT] Add advanced CLI flags to control gRPC client behaviour: #5161 diff --git a/pkg/frontend/querymiddleware/cardinality_query_cache.go b/pkg/frontend/querymiddleware/cardinality_query_cache.go index 94cf0d72d4f..9f28b597c16 100644 --- a/pkg/frontend/querymiddleware/cardinality_query_cache.go +++ b/pkg/frontend/querymiddleware/cardinality_query_cache.go @@ -14,6 +14,7 @@ import ( "github.com/go-kit/log/level" "github.com/grafana/dskit/cache" "github.com/grafana/dskit/tenant" + "github.com/prometheus/client_golang/prometheus" apierror "github.com/grafana/mimir/pkg/api/error" "github.com/grafana/mimir/pkg/cardinality" @@ -39,18 +40,20 @@ type cardinalityQueryRequest interface { // cardinalityQueryCache is a http.RoundTripped wrapping the downstream with an HTTP response cache. // This RoundTripper is used to add caching support to cardinality analysis API endpoints. type cardinalityQueryCache struct { - cache cache.Cache - limits Limits - next http.RoundTripper - logger log.Logger + cache cache.Cache + limits Limits + metrics *resultsCacheMetrics + next http.RoundTripper + logger log.Logger } -func newCardinalityQueryCacheRoundTripper(cache cache.Cache, limits Limits, next http.RoundTripper, logger log.Logger) http.RoundTripper { +func newCardinalityQueryCacheRoundTripper(cache cache.Cache, limits Limits, next http.RoundTripper, logger log.Logger, reg prometheus.Registerer) http.RoundTripper { return &cardinalityQueryCache{ - cache: cache, - limits: limits, - next: next, - logger: logger, + cache: cache, + limits: limits, + metrics: newResultsCacheMetrics("cardinality", reg), + next: next, + logger: logger, } } @@ -92,9 +95,11 @@ func (c *cardinalityQueryCache) RoundTrip(req *http.Request) (*http.Response, er } // Lookup the cache. + c.metrics.cacheRequests.Inc() cacheKey, hashedCacheKey := generateCardinalityQueryRequestCacheKey(tenantIDs, queryReq) res := c.fetchCachedResponse(ctx, cacheKey, hashedCacheKey) if res != nil { + c.metrics.cacheHits.Inc() level.Debug(spanLog).Log("msg", "response fetched from the cache") return res, nil } diff --git a/pkg/frontend/querymiddleware/cardinality_query_cache_test.go b/pkg/frontend/querymiddleware/cardinality_query_cache_test.go index cfc43424ee2..921238c6c3e 100644 --- a/pkg/frontend/querymiddleware/cardinality_query_cache_test.go +++ b/pkg/frontend/querymiddleware/cardinality_query_cache_test.go @@ -5,6 +5,7 @@ package querymiddleware import ( "bytes" "context" + "fmt" "io" "net/http" "net/url" @@ -14,6 +15,8 @@ import ( "github.com/grafana/dskit/cache" "github.com/grafana/dskit/tenant" + "github.com/prometheus/client_golang/prometheus" + promtest "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/prometheus/util/testutil" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -47,6 +50,7 @@ func TestCardinalityQueryCache_RoundTrip(t *testing.T) { expectedHeader http.Header expectedBody []byte expectedDownstreamCalled bool + expectedLookupFromCache bool expectedStoredToCache bool }{ "should fetch the response from the downstream and store it the cache if the downstream request succeed": { @@ -56,6 +60,7 @@ func TestCardinalityQueryCache_RoundTrip(t *testing.T) { expectedHeader: http.Header{"Content-Type": []string{"application/json"}}, expectedBody: []byte(`{content:"fresh"}`), expectedDownstreamCalled: true, + expectedLookupFromCache: true, expectedStoredToCache: true, }, "should not store the response in the cache if disabled for the tenant": { @@ -65,6 +70,7 @@ func TestCardinalityQueryCache_RoundTrip(t *testing.T) { expectedHeader: http.Header{"Content-Type": []string{"application/json"}}, expectedBody: []byte(`{content:"fresh"}`), expectedDownstreamCalled: true, + expectedLookupFromCache: false, expectedStoredToCache: false, }, "should not store the response in the cache if disabled for the request": { @@ -75,24 +81,27 @@ func TestCardinalityQueryCache_RoundTrip(t *testing.T) { expectedHeader: http.Header{"Content-Type": []string{"application/json"}}, expectedBody: []byte(`{content:"fresh"}`), expectedDownstreamCalled: true, + expectedLookupFromCache: false, expectedStoredToCache: false, }, "should not store the response in the cache if the downstream returned a 4xx status code": { - cacheTTL: 0, + cacheTTL: time.Minute, downstreamRes: downstreamRes(400, []byte(`{error:"400"}`)), expectedStatusCode: 400, expectedHeader: http.Header{"Content-Type": []string{"application/json"}}, expectedBody: []byte(`{error:"400"}`), expectedDownstreamCalled: true, + expectedLookupFromCache: true, expectedStoredToCache: false, }, "should not store the response in the cache if the downstream returned a 5xx status code": { - cacheTTL: 0, + cacheTTL: time.Minute, downstreamRes: downstreamRes(500, []byte(`{error:"500"}`)), expectedStatusCode: 500, expectedHeader: http.Header{"Content-Type": []string{"application/json"}}, expectedBody: []byte(`{error:"500"}`), expectedDownstreamCalled: true, + expectedLookupFromCache: true, expectedStoredToCache: false, }, "should fetch the response from the cache if the cached response is not expired": { @@ -109,6 +118,7 @@ func TestCardinalityQueryCache_RoundTrip(t *testing.T) { expectedHeader: http.Header{"Content-Type": []string{"application/json"}}, expectedBody: []byte(`{content:"cached"}`), expectedDownstreamCalled: false, // Should not call the downstream. + expectedLookupFromCache: true, expectedStoredToCache: false, // Should not store anything to the cache. }, "should fetch the response from the downstream and overwrite the cached response if corrupted": { @@ -121,6 +131,7 @@ func TestCardinalityQueryCache_RoundTrip(t *testing.T) { expectedHeader: http.Header{"Content-Type": []string{"application/json"}}, expectedBody: []byte(`{content:"fresh"}`), expectedDownstreamCalled: true, + expectedLookupFromCache: true, expectedStoredToCache: true, }, "should fetch the response from the downstream and overwrite the cached response if a key collision was detected": { @@ -137,6 +148,7 @@ func TestCardinalityQueryCache_RoundTrip(t *testing.T) { expectedHeader: http.Header{"Content-Type": []string{"application/json"}}, expectedBody: []byte(`{content:"fresh"}`), expectedDownstreamCalled: true, + expectedLookupFromCache: true, expectedStoredToCache: true, }, } @@ -187,7 +199,8 @@ func TestCardinalityQueryCache_RoundTrip(t *testing.T) { } initialStoreCallsCount := cacheBackend.CountStoreCalls() - rt := newCardinalityQueryCacheRoundTripper(cacheBackend, limits, downstream, testutil.NewLogger(t)) + reg := prometheus.NewPedanticRegistry() + rt := newCardinalityQueryCacheRoundTripper(cacheBackend, limits, downstream, testutil.NewLogger(t), reg) res, err := rt.RoundTrip(req) require.NoError(t, err) @@ -220,6 +233,29 @@ func TestCardinalityQueryCache_RoundTrip(t *testing.T) { } else { assert.Equal(t, initialStoreCallsCount, cacheBackend.CountStoreCalls()) } + + // Assert on metrics. + expectedRequestsCount := 0 + expectedHitsCount := 0 + if testData.expectedLookupFromCache { + expectedRequestsCount = 1 + if !testData.expectedDownstreamCalled { + expectedHitsCount = 1 + } + } + + assert.NoError(t, promtest.GatherAndCompare(reg, strings.NewReader(fmt.Sprintf(` + # HELP cortex_frontend_query_result_cache_requests_total Total number of requests (or partial requests) looked up in the results cache. + # TYPE cortex_frontend_query_result_cache_requests_total counter + cortex_frontend_query_result_cache_requests_total{request_type="cardinality"} %d + + # HELP cortex_frontend_query_result_cache_hits_total Total number of requests (or partial requests) fetched from the results cache. + # TYPE cortex_frontend_query_result_cache_hits_total counter + cortex_frontend_query_result_cache_hits_total{request_type="cardinality"} %d + `, expectedRequestsCount, expectedHitsCount)), + "cortex_frontend_query_result_cache_requests_total", + "cortex_frontend_query_result_cache_hits_total", + )) }) } }) @@ -279,7 +315,7 @@ func TestCardinalityQueryCache_RoundTrip_WithTenantFederation(t *testing.T) { cacheBackend := cache.NewInstrumentedMockCache() limits := multiTenantMockLimits{byTenant: testData.limits} - rt := newCardinalityQueryCacheRoundTripper(cacheBackend, limits, downstream, testutil.NewLogger(t)) + rt := newCardinalityQueryCacheRoundTripper(cacheBackend, limits, downstream, testutil.NewLogger(t), nil) res, err := rt.RoundTrip(req) require.NoError(t, err) diff --git a/pkg/frontend/querymiddleware/results_cache.go b/pkg/frontend/querymiddleware/results_cache.go index f663e3d0103..46264ef7355 100644 --- a/pkg/frontend/querymiddleware/results_cache.go +++ b/pkg/frontend/querymiddleware/results_cache.go @@ -23,6 +23,7 @@ import ( "github.com/opentracing/opentracing-go" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" "github.com/prometheus/common/model" "github.com/prometheus/prometheus/model/timestamp" "github.com/prometheus/prometheus/promql" @@ -93,6 +94,26 @@ func errUnsupportedResultsCacheBackend(backend string) error { return fmt.Errorf("%w: %q, supported values: %v", errUnsupportedBackend, backend, supportedResultsCacheBackends) } +type resultsCacheMetrics struct { + cacheRequests prometheus.Counter + cacheHits prometheus.Counter +} + +func newResultsCacheMetrics(requestType string, reg prometheus.Registerer) *resultsCacheMetrics { + return &resultsCacheMetrics{ + cacheRequests: promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "cortex_frontend_query_result_cache_requests_total", + Help: "Total number of requests (or partial requests) looked up in the results cache.", + ConstLabels: map[string]string{"request_type": requestType}, + }), + cacheHits: promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "cortex_frontend_query_result_cache_hits_total", + Help: "Total number of requests (or partial requests) fetched from the results cache.", + ConstLabels: map[string]string{"request_type": requestType}, + }), + } +} + // newResultsCache creates a new results cache based on the input configuration. func newResultsCache(cfg ResultsCacheConfig, logger log.Logger, reg prometheus.Registerer) (cache.Cache, error) { // Add the "component" label similarly to other components, so that metrics don't clash and have the same labels set diff --git a/pkg/frontend/querymiddleware/roundtrip.go b/pkg/frontend/querymiddleware/roundtrip.go index 667940530b9..033a8f51732 100644 --- a/pkg/frontend/querymiddleware/roundtrip.go +++ b/pkg/frontend/querymiddleware/roundtrip.go @@ -291,7 +291,7 @@ func newQueryTripperware( // Inject the cardinality query cache roundtripper only if the query results cache is enabled. cardinality := next if cfg.CacheResults { - cardinality = newCardinalityQueryCacheRoundTripper(c, limits, next, log) + cardinality = newCardinalityQueryCacheRoundTripper(c, limits, next, log, registerer) } return RoundTripFunc(func(r *http.Request) (*http.Response, error) { diff --git a/pkg/frontend/querymiddleware/split_and_cache.go b/pkg/frontend/querymiddleware/split_and_cache.go index 4c109122df8..e187bcbffca 100644 --- a/pkg/frontend/querymiddleware/split_and_cache.go +++ b/pkg/frontend/querymiddleware/split_and_cache.go @@ -44,6 +44,8 @@ var ( ) type splitAndCacheMiddlewareMetrics struct { + *resultsCacheMetrics + splitQueriesCount prometheus.Counter queryResultCacheAttemptedCount prometheus.Counter queryResultCacheSkippedCount *prometheus.CounterVec @@ -51,6 +53,7 @@ type splitAndCacheMiddlewareMetrics struct { func newSplitAndCacheMiddlewareMetrics(reg prometheus.Registerer) *splitAndCacheMiddlewareMetrics { m := &splitAndCacheMiddlewareMetrics{ + resultsCacheMetrics: newResultsCacheMetrics("query_range", reg), splitQueriesCount: promauto.With(reg).NewCounter(prometheus.CounterOpts{ Name: "cortex_frontend_split_queries_total", Help: "Total number of underlying query requests after the split by interval is applied.", @@ -344,7 +347,10 @@ func (s *splitAndCacheMiddleware) fetchCacheExtents(ctx context.Context, now tim spanLog.LogKV("key", key, "hashedKey", hashed) } + // Lookup the cache. + s.metrics.cacheRequests.Add(float64(len(keys))) founds := s.cache.Fetch(ctx, hashedKeys) + s.metrics.cacheHits.Add(float64(len(founds))) // Decode all cached responses. extents := make([][]Extent, len(keys)) diff --git a/pkg/frontend/querymiddleware/split_and_cache_test.go b/pkg/frontend/querymiddleware/split_and_cache_test.go index 5b307dd1280..b9321c06593 100644 --- a/pkg/frontend/querymiddleware/split_and_cache_test.go +++ b/pkg/frontend/querymiddleware/split_and_cache_test.go @@ -221,6 +221,12 @@ func TestSplitAndCacheMiddleware_SplitByInterval(t *testing.T) { # HELP cortex_frontend_split_queries_total Total number of underlying query requests after the split by interval is applied. # TYPE cortex_frontend_split_queries_total counter cortex_frontend_split_queries_total 4 + # HELP cortex_frontend_query_result_cache_hits_total Total number of requests (or partial requests) fetched from the results cache. + # TYPE cortex_frontend_query_result_cache_hits_total counter + cortex_frontend_query_result_cache_hits_total{request_type="query_range"} 0 + # HELP cortex_frontend_query_result_cache_requests_total Total number of requests (or partial requests) looked up in the results cache. + # TYPE cortex_frontend_query_result_cache_requests_total counter + cortex_frontend_query_result_cache_requests_total{request_type="query_range"} 0 `))) // Assert query stats from context @@ -231,6 +237,7 @@ func TestSplitAndCacheMiddleware_SplitByInterval(t *testing.T) { func TestSplitAndCacheMiddleware_ResultsCache(t *testing.T) { cacheBackend := cache.NewInstrumentedMockCache() + reg := prometheus.NewPedanticRegistry() mw := newSplitAndCacheMiddleware( true, true, @@ -243,7 +250,7 @@ func TestSplitAndCacheMiddleware_ResultsCache(t *testing.T) { PrometheusResponseExtractor{}, resultsCacheAlwaysEnabled, log.NewNopLogger(), - prometheus.NewPedanticRegistry(), + reg, ) expectedResponse := &PrometheusResponse{ @@ -333,6 +340,31 @@ func TestSplitAndCacheMiddleware_ResultsCache(t *testing.T) { // Assert query stats from context queryStats = stats.FromContext(ctx) assert.Equal(t, uint32(2), queryStats.LoadSplitQueries()) + + // Assert metrics + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(` + # HELP cortex_frontend_query_result_cache_attempted_total Total number of queries that were attempted to be fetched from cache. + # TYPE cortex_frontend_query_result_cache_attempted_total counter + cortex_frontend_query_result_cache_attempted_total 3 + + # HELP cortex_frontend_query_result_cache_skipped_total Total number of times a query was not cacheable because of a reason. This metric is tracked for each partial query when time-splitting is enabled. + # TYPE cortex_frontend_query_result_cache_skipped_total counter + cortex_frontend_query_result_cache_skipped_total{reason="has-modifiers"} 0 + cortex_frontend_query_result_cache_skipped_total{reason="too-new"} 0 + cortex_frontend_query_result_cache_skipped_total{reason="unaligned-time-range"} 0 + + # HELP cortex_frontend_split_queries_total Total number of underlying query requests after the split by interval is applied. + # TYPE cortex_frontend_split_queries_total counter + cortex_frontend_split_queries_total 3 + + # HELP cortex_frontend_query_result_cache_requests_total Total number of requests (or partial requests) looked up in the results cache. + # TYPE cortex_frontend_query_result_cache_requests_total counter + cortex_frontend_query_result_cache_requests_total{request_type="query_range"} 3 + + # HELP cortex_frontend_query_result_cache_hits_total Total number of requests (or partial requests) fetched from the results cache. + # TYPE cortex_frontend_query_result_cache_hits_total counter + cortex_frontend_query_result_cache_hits_total{request_type="query_range"} 2 + `))) } func TestSplitAndCacheMiddleware_ResultsCache_ShouldNotLookupCacheIfStepIsNotAligned(t *testing.T) { @@ -423,6 +455,7 @@ func TestSplitAndCacheMiddleware_ResultsCache_ShouldNotLookupCacheIfStepIsNotAli // Assert query stats from context queryStats := stats.FromContext(ctx) assert.Equal(t, uint32(1), queryStats.LoadSplitQueries()) + // Assert metrics assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(` # HELP cortex_frontend_query_result_cache_attempted_total Total number of queries that were attempted to be fetched from cache. @@ -436,6 +469,12 @@ func TestSplitAndCacheMiddleware_ResultsCache_ShouldNotLookupCacheIfStepIsNotAli # HELP cortex_frontend_split_queries_total Total number of underlying query requests after the split by interval is applied. # TYPE cortex_frontend_split_queries_total counter cortex_frontend_split_queries_total 1 + # HELP cortex_frontend_query_result_cache_hits_total Total number of requests (or partial requests) fetched from the results cache. + # TYPE cortex_frontend_query_result_cache_hits_total counter + cortex_frontend_query_result_cache_hits_total{request_type="query_range"} 0 + # HELP cortex_frontend_query_result_cache_requests_total Total number of requests (or partial requests) looked up in the results cache. + # TYPE cortex_frontend_query_result_cache_requests_total counter + cortex_frontend_query_result_cache_requests_total{request_type="query_range"} 0 `))) } @@ -572,6 +611,12 @@ func TestSplitAndCacheMiddleware_ResultsCache_ShouldNotCacheRequestEarlierThanMa # HELP cortex_frontend_split_queries_total Total number of underlying query requests after the split by interval is applied. # TYPE cortex_frontend_split_queries_total counter cortex_frontend_split_queries_total 0 + # HELP cortex_frontend_query_result_cache_hits_total Total number of requests (or partial requests) fetched from the results cache. + # TYPE cortex_frontend_query_result_cache_hits_total counter + cortex_frontend_query_result_cache_hits_total{request_type="query_range"} 0 + # HELP cortex_frontend_query_result_cache_requests_total Total number of requests (or partial requests) looked up in the results cache. + # TYPE cortex_frontend_query_result_cache_requests_total counter + cortex_frontend_query_result_cache_requests_total{request_type="query_range"} 0 `, }, "should cache a response up until max cache freshness time ago": {