Tenant index improvements #1005

Merged · 7 commits · Oct 4, 2021
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -52,6 +52,7 @@
* [ENHANCEMENT] Add `search_query_timeout` to Querier config. [#984](https://github.com/grafana/tempo/pull/984) (@kvrhdn)
* [ENHANCEMENT] Jsonnet: add `$._config.memcached.memory_limit_mb` [#987](https://github.com/grafana/tempo/pull/987) (@kvrhdn)
* [ENHANCEMENT] Upgrade jsonnet-libs to 1.19 and update tk examples [#1001](https://github.com/grafana/tempo/pull/1001) (@mapno)
* [ENHANCEMENT] Shard tenant index creation by tenant and add functionality to handle stale indexes. [#1005](https://github.com/grafana/tempo/pull/1005) (@joe-elliott)
* [BUGFIX] Update port spec for GCS docker-compose example [#869](https://github.com/grafana/tempo/pull/869) (@zalegrala)
* [BUGFIX] Fix "magic number" errors and other block mishandling when an ingester forcefully shuts down [#937](https://github.com/grafana/tempo/issues/937) (@mdisibio)
* [BUGFIX] Fix compactor memory leak [#806](https://github.com/grafana/tempo/pull/806) (@mdisibio)
6 changes: 6 additions & 0 deletions docs/tempo/website/configuration/_index.md
@@ -386,6 +386,12 @@ storage:
# the index. Default 2.
[blocklist_poll_tenant_index_builders: <int>]

# The oldest allowable tenant index. If an index is pulled that is older than this duration the polling
# will consider this an error. Note that `blocklist_poll_fallback` applies here. i.e. if fallback is true
# and a tenant index exceeds this duration it will fall back to listing the bucket contents.
# Default 0 (disabled).
[blocklist_poll_stale_tenant_index: <duration>]

# Cache type to use. Should be one of "redis", "memcached"
# Example: "cache: memcached"
[cache: <string>]
1 change: 1 addition & 0 deletions docs/tempo/website/configuration/manifest.md
@@ -309,6 +309,7 @@ storage:
blocklist_poll_concurrency: 50
blocklist_poll_fallback: true
blocklist_poll_tenant_index_builders: 2
blocklist_poll_stale_tenant_index: 0
backend: local
local:
path: /tmp/tempo/traces
10 changes: 8 additions & 2 deletions docs/tempo/website/configuration/polling.md
@@ -20,11 +20,17 @@ storage:
# Maximum number of compactors that should build the tenant index. All other components will download
# the index. Default 2.
[blocklist_poll_tenant_index_builders: <int>]

# The oldest allowable tenant index. If an index is pulled that is older than this duration the polling
# will consider this an error. Note that `blocklist_poll_fallback` applies here. i.e. if fallback is true
# and a tenant index exceeds this duration it will fall back to listing the bucket contents.
# Default 0 (disabled).
[blocklist_poll_stale_tenant_index: <duration>]
```

Due to the mechanics of the [tenant index]({{< relref "../operations/polling" >}}) the blocklist will be stale by
-at most 2x the configured `blockist_poll` duration. There are two configuration options that need to be balanced
-against the blockist_poll to handle this:
+at most 2x the configured `blocklist_poll` duration. There are two configuration options that need to be balanced
+against the `blocklist_poll` to handle this:

The ingester `complete_block_timeout` is used to hold a block in the ingester for a given period of time after
it has been flushed. This allows the ingester to return traces to the queriers while they are still unaware
8 changes: 4 additions & 4 deletions operations/tempo-mixin/alerts.libsonnet
@@ -126,28 +126,28 @@
{
alert: 'TempoNoTenantIndexBuilders',
expr: |||
-sum by (%(group_by_cluster)s) (tempodb_blocklist_tenant_index_builder{}) == 0
+sum by (%(group_by_tenant)s) (tempodb_blocklist_tenant_index_builder{}) == 0
||| % $._config,
'for': '5m',
labels: {
severity: 'critical',
},
annotations: {
-message: 'No tenant index builders. Tenant index is out of date.',
+message: 'No tenant index builders for tenant {{ $labels.tenant }}. Tenant index will quickly become stale.',
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoNoTenantIndexBuilders',
},
},
{
alert: 'TempoTenantIndexTooOld',
expr: |||
max by (%s) (tempodb_blocklist_tenant_index_age_seconds{}) > %s
-||| % [$._config.group_by_cluster, $._config.alerts.max_tenant_index_age_seconds],
+||| % [$._config.group_by_tenant, $._config.alerts.max_tenant_index_age_seconds],
'for': '5m',
labels: {
severity: 'critical',
},
annotations: {
-message: 'Tenant index age is %s seconds old.' % $._config.alerts.max_tenant_index_age_seconds,
+message: 'Tenant index age is %s seconds old for tenant {{ $labels.tenant }}.' % $._config.alerts.max_tenant_index_age_seconds,
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoTenantIndexTooOld',
},
},
11 changes: 7 additions & 4 deletions operations/tempo-mixin/config.libsonnet
@@ -19,16 +19,19 @@
max_tenant_index_age_seconds: 600,
},

-// Groups labels to uniquely identify and group by {jobs, clusters}
+// Groups labels to uniquely identify and group by {jobs, clusters, tenants}
cluster_selectors: ['cluster', 'namespace'],
-job_selectors: ['namespace', 'job'],
+job_selectors: ['cluster', 'namespace', 'job'],
+tenant_selectors: ['cluster', 'namespace', 'tenant'],

// Each group prefix is composed of `_`-separated labels
-group_prefix_jobs: makePrefix($._config.job_selectors),
group_prefix_clusters: makePrefix($._config.cluster_selectors),
+group_prefix_jobs: makePrefix($._config.job_selectors),
+group_prefix_tenants: makePrefix($._config.tenant_selectors),

// Each group-by label list is `, `-separated and uniquely identifies
-group_by_job: makeGroupBy($._config.job_selectors),
group_by_cluster: makeGroupBy($._config.cluster_selectors),
+group_by_job: makeGroupBy($._config.job_selectors),
+group_by_tenant: makeGroupBy($._config.tenant_selectors),
},
}
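`makePrefix` and `makeGroupBy` are jsonnet helpers defined elsewhere in the mixin; from the comments above, their assumed semantics are simple joins of the selector lists. A Go sketch of those assumed semantics, applied to the new `tenant_selectors`:

```go
package main

import (
	"fmt"
	"strings"
)

// makePrefix joins selectors with "_" (per "composed of `_`-separated labels").
func makePrefix(selectors []string) string { return strings.Join(selectors, "_") }

// makeGroupBy joins selectors with ", " (per "`, `-separated").
func makeGroupBy(selectors []string) string { return strings.Join(selectors, ", ") }

func main() {
	tenantSelectors := []string{"cluster", "namespace", "tenant"}
	fmt.Println(makePrefix(tenantSelectors))  // cluster_namespace_tenant
	fmt.Println(makeGroupBy(tenantSelectors)) // cluster, namespace, tenant
}
```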
30 changes: 20 additions & 10 deletions operations/tempo-mixin/runbook.md
@@ -111,8 +111,8 @@ failed to pull bucket index for tenant. falling back to polling
```

If the following (or other errors) are being logged repeatedly then the tenant index is not being updated and more direct action is necessary.
-If the core issue can not be resolved one option is to delete all tenant indexes which will force the components to fallback to
-scanning the entire bucket.
+If the core issue cannot be resolved, delete any tenant index that is not being updated. This will force the components to fall back to
+bucket scanning for the offending tenants.
```
failed to write tenant index
```
@@ -121,7 +121,7 @@

See [Polling Issues](#polling-issues) below for general information.

-If a cluster has no tenant index builders then nothing is refreshing the per tenant indexes. This can be dangerous
+If a cluster has no tenant index builders for a given tenant then nothing is refreshing the per tenant index. This can be dangerous
because other components will not be aware there is an issue as they repeatedly download a stale tenant index. In Tempo the compactors
play the role of building the tenant index. Ways to address this issue in order of preference:

@@ -132,24 +132,34 @@
trace:
blocklist_poll_tenant_index_builders: 2 # <- increase this value
```
-- Delete tenant index files to force other components to fallback to scanning the entire bucket. They are located at
-  `/<tenant>/index.json.gz`
+- Delete tenant index files that are not being updated to force other components to fall back to scanning for these tenants. They
+  are located at `/<tenant>/index.json.gz`
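The per-tenant sharding this PR introduces keys each builder job by builder number *and* tenant, so ownership is decided tenant by tenant rather than cluster-wide. A sketch under assumptions — the real `Owns` comes from Tempo's ring-based sharder, replaced here by an illustrative FNV hash, and the `jobPrefix` value is hypothetical:

```go
package main

import (
	"fmt"
	"hash/fnv"
	"strconv"
)

const jobPrefix = "build-tenant-index-" // hypothetical prefix, for illustration only

// owns is a stand-in for the ring sharder: hash the job name across
// `instances` and check whether it lands on instance `self`.
func owns(job string, self, instances uint32) bool {
	h := fnv.New32a()
	h.Write([]byte(job))
	return h.Sum32()%instances == self
}

// buildTenantIndex mirrors the PR's change: each of the configured builders
// gets a job keyed by builder number AND tenant, so the set of builders
// varies per tenant.
func buildTenantIndex(tenant string, builders int, self, instances uint32) bool {
	for i := 0; i < builders; i++ {
		job := jobPrefix + strconv.Itoa(i) + "-" + tenant
		if owns(job, self, instances) {
			return true
		}
	}
	return false
}

func main() {
	// With 2 builders and 3 instances, each tenant gets its own owner set.
	for _, tenant := range []string{"tenant-a", "tenant-b"} {
		for self := uint32(0); self < 3; self++ {
			fmt.Printf("%s owned by instance %d: %v\n", tenant, self, buildTenantIndex(tenant, 2, self, 3))
		}
	}
}
```

Because the two builder jobs for a tenant can hash to the same instance, a tenant ends up with one or two owners out of the three instances.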

## TempoTenantIndexTooOld

See [Polling Issues](#polling-issues) below for general information.

If the tenant indexes are too old we need to review the compactor logs to determine why they are failing to update. Compactors
-with `tempodb_blocklist_tenant_index_builder` set to 1 are expected to be creating the tenant indexes are should be checked
-first. If no compactors are creating tenant indexes refer to [TempoNoTenantIndexBuilders](#temponotenantindexbuilders) above.
+with `tempodb_blocklist_tenant_index_builder` for the offending tenant set to 1 are expected to be creating the indexes for that
+tenant and should be checked first. If no compactors are creating tenant indexes refer to [TempoNoTenantIndexBuilders](#temponotenantindexbuilders)
+above.

Additionally the metric `tempodb_blocklist_tenant_index_age_seconds` can be grouped by the `tenant` label. If only one (or few)
-indexes are lagging these can be deleted to force components to manually rescan the bucket.
+indexes are lagging these can be deleted to force components to manually rescan just the offending tenants.

### Polling Issues

For any polling issue, intermittent failures are not concerning. Sustained polling issues need to be addressed.

Failure to poll just means that the component is not aware of the current state of the backend but will continue working
otherwise. Queriers, for instance, will start returning 404s as their internal representation of the backend grows stale.
Compactors will attempt to compact blocks that don't exist.

If persistent backend issues are preventing any fixes to polling then reads will start to fail, but writes will remain fine.
Alert your users accordingly!

Note that tenant indexes are built independently and an issue may only be impacting one or very few tenants. `tempodb_blocklist_tenant_index_builder`,
`tempodb_blocklist_tenant_index_age_seconds` and `tempodb_blocklist_tenant_index_errors_total` are all per-tenant metrics. If
you can isolate the impacted tenants, take targeted action instead of making sweeping changes. The easiest lever to pull is
to delete the stale tenant indexes, as the affected components will fall back to bucket listing for those tenants.
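Isolating the impacted tenants from the per-tenant age metric can be sketched as a simple filter (illustrative only; in practice you would query `tempodb_blocklist_tenant_index_age_seconds` grouped by the `tenant` label):

```go
package main

import (
	"fmt"
	"sort"
)

// impactedTenants filters per-tenant index ages (in seconds) down to the
// tenants whose index exceeds maxAge — the targeted-action list suggested above.
func impactedTenants(ageByTenant map[string]float64, maxAge float64) []string {
	var out []string
	for tenant, age := range ageByTenant {
		if age > maxAge {
			out = append(out, tenant)
		}
	}
	sort.Strings(out) // deterministic order for reporting
	return out
}

func main() {
	ages := map[string]float64{"tenant-a": 120, "tenant-b": 4000, "tenant-c": 90}
	fmt.Println(impactedTenants(ages, 600)) // [tenant-b]
}
```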

14 changes: 7 additions & 7 deletions operations/tempo-mixin/yamls/alerts.yaml
@@ -7,9 +7,9 @@
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoRequestErrors"
"expr": |
-100 * sum(rate(tempo_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
+100 * sum(rate(tempo_request_duration_seconds_count{status_code=~"5.."}[1m])) by (cluster, namespace, job, route)
/
-sum(rate(tempo_request_duration_seconds_count[1m])) by (namespace, job, route)
+sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
> 10
"for": "15m"
"labels":
@@ -20,7 +20,7 @@
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoRequestLatency"
"expr": |
-namespace_job_route:tempo_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process"} > 3
+cluster_namespace_job_route:tempo_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process"} > 3
"for": "15m"
"labels":
"severity": "critical"
@@ -80,19 +80,19 @@
"severity": "critical"
- "alert": "TempoNoTenantIndexBuilders"
"annotations":
"message": "No tenant index builders. Tenant index is out of date."
"message": "No tenant index builders for tenant {{ $labels.tenant }}. Tenant index will quickly become stale."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoNoTenantIndexBuilders"
"expr": |
-sum by (cluster, namespace) (tempodb_blocklist_tenant_index_builder{}) == 0
+sum by (cluster, namespace, tenant) (tempodb_blocklist_tenant_index_builder{}) == 0
"for": "5m"
"labels":
"severity": "critical"
- "alert": "TempoTenantIndexTooOld"
"annotations":
"message": "Tenant index age is 600 seconds old."
"message": "Tenant index age is 600 seconds old for tenant {{ $labels.tenant }}."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoTenantIndexTooOld"
"expr": |
-max by (cluster, namespace) (tempodb_blocklist_tenant_index_age_seconds{}) > 600
+max by (cluster, namespace, tenant) (tempodb_blocklist_tenant_index_age_seconds{}) > 600
"for": "5m"
"labels":
"severity": "critical"
24 changes: 12 additions & 12 deletions operations/tempo-mixin/yamls/rules.yaml
@@ -1,15 +1,15 @@
"groups":
- "name": "tempo_rules"
"rules":
-- "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, namespace, job, route))"
-"record": "namespace_job_route:tempo_request_duration_seconds:99quantile"
-- "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, namespace, job, route))"
-"record": "namespace_job_route:tempo_request_duration_seconds:50quantile"
-- "expr": "sum(rate(tempo_request_duration_seconds_sum[1m])) by (namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (namespace, job, route)"
-"record": "namespace_job_route:tempo_request_duration_seconds:avg"
-- "expr": "sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, namespace, job, route)"
-"record": "namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate"
-- "expr": "sum(rate(tempo_request_duration_seconds_sum[1m])) by (namespace, job, route)"
-"record": "namespace_job_route:tempo_request_duration_seconds_sum:sum_rate"
-- "expr": "sum(rate(tempo_request_duration_seconds_count[1m])) by (namespace, job, route)"
-"record": "namespace_job_route:tempo_request_duration_seconds_count:sum_rate"
+- "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))"
+"record": "cluster_namespace_job_route:tempo_request_duration_seconds:99quantile"
+- "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))"
+"record": "cluster_namespace_job_route:tempo_request_duration_seconds:50quantile"
+- "expr": "sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)"
+"record": "cluster_namespace_job_route:tempo_request_duration_seconds:avg"
+- "expr": "sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)"
+"record": "cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate"
+- "expr": "sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)"
+"record": "cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate"
+- "expr": "sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)"
+"record": "cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate"
29 changes: 22 additions & 7 deletions tempodb/blocklist/poller.go
@@ -2,6 +2,7 @@ package blocklist

import (
"context"
"fmt"
"sort"
"strconv"
"time"
@@ -38,11 +39,11 @@
Name: "blocklist_tenant_index_errors_total",
Help: "Total number of times an error occurred while retrieving or building the tenant index.",
}, []string{"tenant"})
-metricTenantIndexBuilder = promauto.NewGauge(prometheus.GaugeOpts{
+metricTenantIndexBuilder = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "tempodb",
Name: "blocklist_tenant_index_builder",
Help: "A value of 1 indicates this instance of tempodb is building the tenant index.",
-})
+}, []string{"tenant"})
metricTenantIndexAgeSeconds = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "tempodb",
Name: "blocklist_tenant_index_age_seconds",
@@ -55,6 +56,7 @@ type PollerConfig struct {
PollConcurrency uint
PollFallback bool
TenantIndexBuilders int
StaleTenantIndex time.Duration
}

// JobSharder is used to determine if a particular job is owned by this process
@@ -122,10 +124,11 @@ func (p *Poller) Do() (PerTenant, PerTenantCompacted, error) {

func (p *Poller) pollTenantAndCreateIndex(ctx context.Context, tenantID string) ([]*backend.BlockMeta, []*backend.CompactedBlockMeta, error) {
// are we a tenant index builder?
-if !p.buildTenantIndex() {
-metricTenantIndexBuilder.Set(0)
+if !p.buildTenantIndex(tenantID) {
+metricTenantIndexBuilder.WithLabelValues(tenantID).Set(0)

i, err := p.reader.TenantIndex(ctx, tenantID)
err = p.tenantIndexPollError(i, err)
if err == nil {
// success! return the retrieved index
metricTenantIndexAgeSeconds.WithLabelValues(tenantID).Set(float64(time.Since(i.CreatedAt) / time.Second))
@@ -146,7 +149,7 @@

// if we're here then we have been configured to be a tenant index builder OR there was a failure to pull
// the tenant index and we are configured to fall back to polling
-metricTenantIndexBuilder.Set(1)
+metricTenantIndexBuilder.WithLabelValues(tenantID).Set(1)
blocklist, compactedBlocklist, err := p.pollTenantBlocks(ctx, tenantID)
if err != nil {
return nil, nil, err
@@ -240,13 +243,25 @@ func (p *Poller) pollBlock(ctx context.Context, tenantID string, blockID uuid.UU
return blockMeta, compactedBlockMeta, nil
}

-func (p *Poller) buildTenantIndex() bool {
+func (p *Poller) buildTenantIndex(tenant string) bool {
for i := 0; i < p.cfg.TenantIndexBuilders; i++ {
-job := jobPrefix + strconv.Itoa(i)
+job := jobPrefix + strconv.Itoa(i) + "-" + tenant
if p.sharder.Owns(job) {
return true
}
}

return false
}

func (p *Poller) tenantIndexPollError(idx *backend.TenantIndex, err error) error {
if err != nil {
return err
}

if p.cfg.StaleTenantIndex != 0 && time.Since(idx.CreatedAt) > p.cfg.StaleTenantIndex {
return fmt.Errorf("tenant index created at %s is stale", idx.CreatedAt)
}

return nil
}