From 03f2f06e1247e997a0246d72f5c2c1fd9bd386df Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 4 Nov 2024 04:33:53 -0500 Subject: [PATCH] Improve lock contention affecting read and write latencies during TSDB head compaction (cherry-pick Prometheus PR 15242) (#9822) * Cherry-pick Prometheus PR 15242 Signed-off-by: Marco Pracucci * Added CHANGELOG entry Signed-off-by: Marco Pracucci * Updated CHANGELOG Signed-off-by: Marco Pracucci --------- Signed-off-by: Marco Pracucci --- CHANGELOG.md | 1 + go.mod | 2 +- go.sum | 4 ++-- .../prometheus/tsdb/index/postings.go | 23 +++++++++++++++++++ vendor/modules.txt | 4 ++-- 5 files changed, 29 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 428d1c0bb1f..8e5f139908a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -52,6 +52,7 @@ * [ENHANCEMENT] Ingester: Reduced lock contention in the `PostingsForMatchers` cache. #9773 * [ENHANCEMENT] Storage: Allow HTTP client settings to be tuned for GCS and Azure backends via an `http` block or corresponding CLI flags. This was already supported by the S3 backend. #9778 * [ENHANCEMENT] Ruler: Support `group_limit` and `group_next_token` parameters in the `/api/v1/rules` endpoint. #9563 +* [ENHANCEMENT] Ingester: improved lock contention affecting read and write latencies during TSDB head compaction. #9822 * [BUGFIX] Fix issue where functions such as `rate()` over native histograms could return incorrect values if a float stale marker was present in the selected range. #9508 * [BUGFIX] Fix issue where negation of native histograms (eg. `-some_native_histogram_series`) did nothing. #9508 * [BUGFIX] Fix issue where `metric might not be a counter, name does not end in _total/_sum/_count/_bucket` annotation would be emitted even if `rate` or `increase` did not have enough samples to compute a result. #9508 diff --git a/go.mod b/go.mod index fb520e997d4..71eacced97a 100644 --- a/go.mod +++ b/go.mod @@ -282,7 +282,7 @@ require ( ) // Using a fork of Prometheus with Mimir-specific changes. -replace github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20241030085501-6c2603082009 +replace github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20241104085513-57e1ca2a6f61 // Replace memberlist with our fork which includes some fixes that haven't been // merged upstream yet: diff --git a/go.sum b/go.sum index ec470d1b4db..d52f55a1bbb 100644 --- a/go.sum +++ b/go.sum @@ -1270,8 +1270,8 @@ github.com/grafana/gomemcache v0.0.0-20241016125027-0a5bcc5aef40 h1:1TeKhyS+pvzO github.com/grafana/gomemcache v0.0.0-20241016125027-0a5bcc5aef40/go.mod h1:IGRj8oOoxwJbHBYl1+OhS9UjQR0dv6SQOep7HqmtyFU= github.com/grafana/memberlist v0.3.1-0.20220714140823-09ffed8adbbe h1:yIXAAbLswn7VNWBIvM71O2QsgfgW9fRXZNR0DXe6pDU= github.com/grafana/memberlist v0.3.1-0.20220714140823-09ffed8adbbe/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOnAH9VT3Sh9MUE= -github.com/grafana/mimir-prometheus v0.0.0-20241030085501-6c2603082009 h1:ZIlymN/o3WNl8jzBRkQXLvtW5/5wF9bZ5Cy9dxHwku0= -github.com/grafana/mimir-prometheus v0.0.0-20241030085501-6c2603082009/go.mod h1:7SuFBLahBoRY7KcgzWzK0p1n5QL+5dyr/Ysat6v1378= +github.com/grafana/mimir-prometheus v0.0.0-20241104085513-57e1ca2a6f61 h1:5iLgPt4mT0zZoe3N+y4Dw8IAvxJwTl0eIlUS8q0iGg0= +github.com/grafana/mimir-prometheus v0.0.0-20241104085513-57e1ca2a6f61/go.mod h1:7SuFBLahBoRY7KcgzWzK0p1n5QL+5dyr/Ysat6v1378= github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956 h1:em1oddjXL8c1tL0iFdtVtPloq2hRPen2MJQKoAWpxu0= github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956/go.mod h1:qtI1ogk+2JhVPIXVc6q+NHziSmy2W5GbdQZFUHADCBU= github.com/grafana/prometheus-alertmanager v0.25.1-0.20240930132144-b5e64e81e8d3 h1:6D2gGAwyQBElSrp3E+9lSr7k8gLuP3Aiy20rweLWeBw= diff --git a/vendor/github.com/prometheus/prometheus/tsdb/index/postings.go b/vendor/github.com/prometheus/prometheus/tsdb/index/postings.go index 1b1d4e32883..5f9c27d8fe9 100644 --- a/vendor/github.com/prometheus/prometheus/tsdb/index/postings.go +++ b/vendor/github.com/prometheus/prometheus/tsdb/index/postings.go @@ -24,6 +24,7 @@ import ( "sort" "strings" "sync" + "time" "github.com/bboreham/go-loser" @@ -312,8 +313,30 @@ func (p *MemPostings) Delete(deleted map[storage.SeriesRef]struct{}, affected ma } } + i := 0 for l := range affected { + i++ process(l) + + // From time to time we want some readers to go through and read their postings. + // It takes around 50ms to process a 1K series batch, and 120ms to process a 10K series batch (local benchmarks on an M3). + // Note that a read query will most likely want to read multiple postings lists, say 5, 10 or 20 (depending on the number of matchers) + // And that read query will most likely evaluate only one of those matchers before we unpause here, so we want to pause often. + if i%512 == 0 { + p.mtx.Unlock() + // While it's tempting to just do a `time.Sleep(time.Millisecond)` here, + // it wouldn't ensure use that readers actually were able to get the read lock, + // because if there are writes waiting on same mutex, readers won't be able to get it. + // So we just grab one RLock ourselves. + p.mtx.RLock() + // We shouldn't wait here, because we would be blocking a potential write for no reason. + // Note that if there's a writer waiting for us to unlock, no reader will be able to get the read lock. + p.mtx.RUnlock() //nolint:staticcheck // SA2001: this is an intentionally empty critical section. + // Now we can wait a little bit just to increase the chance of a reader getting the lock. + // If we were deleting 100M series here, pausing every 512 with 1ms sleeps would be an extra of 200s, which is negligible. + time.Sleep(time.Millisecond) + p.mtx.Lock() + } } process(allPostingsKey) } diff --git a/vendor/modules.txt b/vendor/modules.txt index 3ab98ded15a..3f5f9aeb05f 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1007,7 +1007,7 @@ github.com/prometheus/exporter-toolkit/web github.com/prometheus/procfs github.com/prometheus/procfs/internal/fs github.com/prometheus/procfs/internal/util -# github.com/prometheus/prometheus v1.99.0 => github.com/grafana/mimir-prometheus v0.0.0-20241030085501-6c2603082009 +# github.com/prometheus/prometheus v1.99.0 => github.com/grafana/mimir-prometheus v0.0.0-20241104085513-57e1ca2a6f61 ## explicit; go 1.22.0 github.com/prometheus/prometheus/config github.com/prometheus/prometheus/discovery @@ -1671,7 +1671,7 @@ sigs.k8s.io/kustomize/kyaml/yaml/walk sigs.k8s.io/yaml sigs.k8s.io/yaml/goyaml.v2 sigs.k8s.io/yaml/goyaml.v3 -# github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20241030085501-6c2603082009 +# github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20241104085513-57e1ca2a6f61 # github.com/hashicorp/memberlist => github.com/grafana/memberlist v0.3.1-0.20220714140823-09ffed8adbbe # gopkg.in/yaml.v3 => github.com/colega/go-yaml-yaml v0.0.0-20220720105220-255a8d16d094 # github.com/grafana/regexp => github.com/grafana/regexp v0.0.0-20240531075221-3685f1377d7b