From 39272aea3ad541ed6d5ee23bbca1c085aaf38782 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 15 Aug 2023 14:13:40 +0200 Subject: [PATCH 1/8] Distributor: add config flag in order to control error code when rate limit reached --- CHANGELOG.md | 1 + cmd/mimir/config-descriptor.json | 11 +++ cmd/mimir/help-all.txt.tmpl | 2 + .../mimir/configure/about-versioning.md | 2 + .../configuration-parameters/index.md | 6 ++ pkg/distributor/distributor.go | 5 +- pkg/distributor/distributor_test.go | 42 ++++++--- pkg/util/validation/limits.go | 93 ++++++++++--------- 8 files changed, 105 insertions(+), 57 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5285c6b482f..6c4480273de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ * [CHANGE] Ingester: Do not log errors related to hitting per-instance limits to reduce resource usage when ingesters are under pressure. #5585 * [CHANGE] gRPC clients: use default connect timeout of 5s, and therefore enable default connect backoff max delay of 5s. #5562 * [CHANGE] The `-shutdown-delay` flag is no longer experimental. #5701 +* [FEATURE] Introduced `distributor.enable_service_unavailable_error_on_rate_limit` flag for configuring error code to 503 instead of 429 upon rate limit exhaustion. #5752 * [FEATURE] Cardinality API: Add a new `count_method` parameter which enables counting active series #5136 * [FEATURE] Query-frontend: added experimental support to cache cardinality, label names and label values query responses. The cache will be used when `-query-frontend.cache-results` is enabled, and `-query-frontend.results-cache-ttl-for-cardinality-query` or `-query-frontend.results-cache-ttl-for-labels-query` set to a value greater than 0. The following metrics have been added to track the query results cache hit ratio per `request_type`: #5212 #5235 #5426 #5524 * `cortex_frontend_query_result_cache_requests_total{request_type="query_range|cardinality|label_names_and_values"}` diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 442e3f5ff6d..38dd3e09d9d 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -3020,6 +3020,17 @@ "fieldType": "relabel_config...", "fieldCategory": "experimental" }, + { + "kind": "field", + "name": "enable_service_unavailable_error_on_rate_limit", + "required": false, + "desc": "Flag to determines the error code returned when distributor rate limits are reached. If set to true, the error code will be 503; if set to false, a 429 error is returned", + "fieldValue": null, + "fieldDefaultValue": false, + "fieldFlag": "distributor.enable_service_unavailable_error_on_rate_limit", + "fieldType": "boolean", + "fieldCategory": "experimental" + }, { "kind": "field", "name": "max_global_series_per_user", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 65d1334f954..c7e62956c15 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1031,6 +1031,8 @@ Usage of ./cmd/mimir/mimir: How frequently to clean up clients for ingesters that have gone away. (default 15s) -distributor.drop-label string This flag can be used to specify label names that to drop during sample ingestion within the distributor and can be repeated in order to drop multiple labels. + -distributor.enable_service_unavailable_error_on_rate_limit + [experimental] Flag to determines the error code returned when distributor rate limits are reached. 
If set to true, the error code will be 503; if set to false, a 429 error is returned -distributor.ha-tracker.cluster string Prometheus label to look for in samples to identify a Prometheus HA cluster. (default "cluster") -distributor.ha-tracker.consul.acl-token string diff --git a/docs/sources/mimir/configure/about-versioning.md b/docs/sources/mimir/configure/about-versioning.md index cd29879da1e..f2cf99336fc 100644 --- a/docs/sources/mimir/configure/about-versioning.md +++ b/docs/sources/mimir/configure/about-versioning.md @@ -60,6 +60,8 @@ The following features are currently experimental: - Distributor - Metrics relabeling - OTLP ingestion path + - Configuring error code to 503 instead of 429 upon rate limit exhaustion. + - `distributor.enable-service-unavailable-error-on-rate-limit` - Hash ring - Disabling ring heartbeat timeouts - `-distributor.ring.heartbeat-timeout=0` diff --git a/docs/sources/mimir/references/configuration-parameters/index.md b/docs/sources/mimir/references/configuration-parameters/index.md index e70abac2df8..13211db3d57 100644 --- a/docs/sources/mimir/references/configuration-parameters/index.md +++ b/docs/sources/mimir/references/configuration-parameters/index.md @@ -2753,6 +2753,12 @@ The `limits` block configures default and per-tenant limits imposed by component # during the relabeling phase and cleaned afterwards: __meta_tenant_id [metric_relabel_configs: | default = ] +# (experimental) Flag to determines the error code returned when distributor +# rate limits are reached. If set to true, the error code will be 503; if set to +# false, a 429 error is returned +# CLI flag: -distributor.enable_service_unavailable_error_on_rate_limit +[enable_service_unavailable_error_on_rate_limit: | default = false] + # The maximum number of in-memory series per tenant, across the cluster before # replication. 0 to disable. # CLI flag: -ingester.max-global-series-per-user diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 1d84cb56805..ac4615c9616 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -1038,9 +1038,12 @@ func (d *Distributor) limitsMiddleware(next push.Func) push.Func { if !d.requestRateLimiter.AllowN(now, userID, 1) { d.discardedRequestsRateLimited.WithLabelValues(userID).Add(1) - // Return a 429 here to tell the client it is going too fast. + // Return a 429 or a 503 here depending on configuration to tell the client it is going too fast. // Client may discard the data or slow down and re-send. // Prometheus v2.26 added a remote-write option 'retry_on_http_429'. 
+ if d.limits.EnableServiceUnavailableErrorOnRateLimit(userID) { + return nil, httpgrpc.Errorf(http.StatusServiceUnavailable, validation.NewRequestRateLimitedError(d.limits.RequestRate(userID), d.limits.RequestBurstSize(userID)).Error()) + } return nil, httpgrpc.Errorf(http.StatusTooManyRequests, validation.NewRequestRateLimitedError(d.limits.RequestRate(userID), d.limits.RequestBurstSize(userID)).Error()) } diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index 79d9011a065..ce3dc7fca1b 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -472,15 +472,17 @@ func TestDistributor_PushRequestRateLimiter(t *testing.T) { } ctx := user.InjectOrgID(context.Background(), "user") tests := map[string]struct { - distributors int - requestRate float64 - requestBurstSize int - pushes []testPush + distributors int + requestRate float64 + requestBurstSize int + pushes []testPush + enableServiceUnavailableError bool }{ "request limit should be evenly shared across distributors": { - distributors: 2, - requestRate: 4, - requestBurstSize: 2, + distributors: 2, + requestRate: 4, + requestBurstSize: 2, + enableServiceUnavailableError: false, pushes: []testPush{ {expectedError: nil}, {expectedError: nil}, @@ -488,9 +490,10 @@ func TestDistributor_PushRequestRateLimiter(t *testing.T) { }, }, "request limit is disabled when set to 0": { - distributors: 2, - requestRate: 0, - requestBurstSize: 0, + distributors: 2, + requestRate: 0, + requestBurstSize: 0, + enableServiceUnavailableError: false, pushes: []testPush{ {expectedError: nil}, {expectedError: nil}, @@ -498,9 +501,10 @@ func TestDistributor_PushRequestRateLimiter(t *testing.T) { }, }, "request burst should set to each distributor": { - distributors: 2, - requestRate: 2, - requestBurstSize: 3, + distributors: 2, + requestRate: 2, + requestBurstSize: 3, + enableServiceUnavailableError: false, pushes: []testPush{ {expectedError: nil}, {expectedError: nil}, @@ -508,6 +512,17 @@ func TestDistributor_PushRequestRateLimiter(t *testing.T) { {expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, validation.NewRequestRateLimitedError(2, 3).Error())}, }, }, + "request limit is reached return StatusServiceUnavailable when enable service unavailable error set to true": { + distributors: 2, + requestRate: 4, + requestBurstSize: 2, + enableServiceUnavailableError: true, + pushes: []testPush{ + {expectedError: nil}, + {expectedError: nil}, + {expectedError: httpgrpc.Errorf(http.StatusServiceUnavailable, validation.NewRequestRateLimitedError(4, 2).Error())}, + }, + }, } for testName, testData := range tests { @@ -518,6 +533,7 @@ func TestDistributor_PushRequestRateLimiter(t *testing.T) { flagext.DefaultValues(limits) limits.RequestRate = testData.requestRate limits.RequestBurstSize = testData.requestBurstSize + limits.EnableServiceUnavailableErrorOnRateLimit = testData.enableServiceUnavailableError // Start all expected distributors distributors, _, _ := prepare(t, prepConfig{ diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 3a74e698618..9b60f49dc5b 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -26,30 +26,31 @@ import ( ) const ( - MaxSeriesPerMetricFlag = "ingester.max-global-series-per-metric" - MaxMetadataPerMetricFlag = "ingester.max-global-metadata-per-metric" - MaxSeriesPerUserFlag = "ingester.max-global-series-per-user" - MaxMetadataPerUserFlag = "ingester.max-global-metadata-per-user" - MaxChunksPerQueryFlag = 
"querier.max-fetched-chunks-per-query" - MaxChunkBytesPerQueryFlag = "querier.max-fetched-chunk-bytes-per-query" - MaxSeriesPerQueryFlag = "querier.max-fetched-series-per-query" - maxLabelNamesPerSeriesFlag = "validation.max-label-names-per-series" - maxLabelNameLengthFlag = "validation.max-length-label-name" - maxLabelValueLengthFlag = "validation.max-length-label-value" - maxMetadataLengthFlag = "validation.max-metadata-length" - maxNativeHistogramBucketsFlag = "validation.max-native-histogram-buckets" - creationGracePeriodFlag = "validation.create-grace-period" - maxPartialQueryLengthFlag = "querier.max-partial-query-length" - maxTotalQueryLengthFlag = "query-frontend.max-total-query-length" - maxQueryExpressionSizeBytesFlag = "query-frontend.max-query-expression-size-bytes" - requestRateFlag = "distributor.request-rate-limit" - requestBurstSizeFlag = "distributor.request-burst-size" - ingestionRateFlag = "distributor.ingestion-rate-limit" - ingestionBurstSizeFlag = "distributor.ingestion-burst-size" - HATrackerMaxClustersFlag = "distributor.ha-tracker.max-clusters" - resultsCacheTTLFlag = "query-frontend.results-cache-ttl" - resultsCacheTTLForOutOfOrderWindowFlag = "query-frontend.results-cache-ttl-for-out-of-order-time-window" - QueryIngestersWithinFlag = "querier.query-ingesters-within" + MaxSeriesPerMetricFlag = "ingester.max-global-series-per-metric" + MaxMetadataPerMetricFlag = "ingester.max-global-metadata-per-metric" + MaxSeriesPerUserFlag = "ingester.max-global-series-per-user" + MaxMetadataPerUserFlag = "ingester.max-global-metadata-per-user" + MaxChunksPerQueryFlag = "querier.max-fetched-chunks-per-query" + MaxChunkBytesPerQueryFlag = "querier.max-fetched-chunk-bytes-per-query" + MaxSeriesPerQueryFlag = "querier.max-fetched-series-per-query" + maxLabelNamesPerSeriesFlag = "validation.max-label-names-per-series" + maxLabelNameLengthFlag = "validation.max-length-label-name" + maxLabelValueLengthFlag = "validation.max-length-label-value" + maxMetadataLengthFlag = "validation.max-metadata-length" + maxNativeHistogramBucketsFlag = "validation.max-native-histogram-buckets" + creationGracePeriodFlag = "validation.create-grace-period" + maxPartialQueryLengthFlag = "querier.max-partial-query-length" + maxTotalQueryLengthFlag = "query-frontend.max-total-query-length" + maxQueryExpressionSizeBytesFlag = "query-frontend.max-query-expression-size-bytes" + requestRateFlag = "distributor.request-rate-limit" + enableServiceUnavailableErrorOnRateLimitFlag = "distributor.enable_service_unavailable_error_on_rate_limit" + requestBurstSizeFlag = "distributor.request-burst-size" + ingestionRateFlag = "distributor.ingestion-rate-limit" + ingestionBurstSizeFlag = "distributor.ingestion-burst-size" + HATrackerMaxClustersFlag = "distributor.ha-tracker.max-clusters" + resultsCacheTTLFlag = "query-frontend.results-cache-ttl" + resultsCacheTTLForOutOfOrderWindowFlag = "query-frontend.results-cache-ttl-for-out-of-order-time-window" + QueryIngestersWithinFlag = "querier.query-ingesters-within" // MinCompactorPartialBlockDeletionDelay is the minimum partial blocks deletion delay that can be configured in Mimir. MinCompactorPartialBlockDeletionDelay = 4 * time.Hour @@ -66,25 +67,25 @@ func (e LimitError) Error() string { // limits via flags, or per-user limits via yaml config. type Limits struct { // Distributor enforced limits. 
- RequestRate float64 `yaml:"request_rate" json:"request_rate"` - RequestBurstSize int `yaml:"request_burst_size" json:"request_burst_size"` - IngestionRate float64 `yaml:"ingestion_rate" json:"ingestion_rate"` - IngestionBurstSize int `yaml:"ingestion_burst_size" json:"ingestion_burst_size"` - AcceptHASamples bool `yaml:"accept_ha_samples" json:"accept_ha_samples"` - HAClusterLabel string `yaml:"ha_cluster_label" json:"ha_cluster_label"` - HAReplicaLabel string `yaml:"ha_replica_label" json:"ha_replica_label"` - HAMaxClusters int `yaml:"ha_max_clusters" json:"ha_max_clusters"` - DropLabels flagext.StringSlice `yaml:"drop_labels" json:"drop_labels" category:"advanced"` - MaxLabelNameLength int `yaml:"max_label_name_length" json:"max_label_name_length"` - MaxLabelValueLength int `yaml:"max_label_value_length" json:"max_label_value_length"` - MaxLabelNamesPerSeries int `yaml:"max_label_names_per_series" json:"max_label_names_per_series"` - MaxMetadataLength int `yaml:"max_metadata_length" json:"max_metadata_length"` - MaxNativeHistogramBuckets int `yaml:"max_native_histogram_buckets" json:"max_native_histogram_buckets"` - CreationGracePeriod model.Duration `yaml:"creation_grace_period" json:"creation_grace_period" category:"advanced"` - EnforceMetadataMetricName bool `yaml:"enforce_metadata_metric_name" json:"enforce_metadata_metric_name" category:"advanced"` - IngestionTenantShardSize int `yaml:"ingestion_tenant_shard_size" json:"ingestion_tenant_shard_size"` - MetricRelabelConfigs []*relabel.Config `yaml:"metric_relabel_configs,omitempty" json:"metric_relabel_configs,omitempty" doc:"nocli|description=List of metric relabel configurations. Note that in most situations, it is more effective to use metrics relabeling directly in the Prometheus server, e.g. remote_write.write_relabel_configs. 
Labels available during the relabeling phase and cleaned afterwards: __meta_tenant_id" category:"experimental"` - + RequestRate float64 `yaml:"request_rate" json:"request_rate"` + RequestBurstSize int `yaml:"request_burst_size" json:"request_burst_size"` + IngestionRate float64 `yaml:"ingestion_rate" json:"ingestion_rate"` + IngestionBurstSize int `yaml:"ingestion_burst_size" json:"ingestion_burst_size"` + AcceptHASamples bool `yaml:"accept_ha_samples" json:"accept_ha_samples"` + HAClusterLabel string `yaml:"ha_cluster_label" json:"ha_cluster_label"` + HAReplicaLabel string `yaml:"ha_replica_label" json:"ha_replica_label"` + HAMaxClusters int `yaml:"ha_max_clusters" json:"ha_max_clusters"` + DropLabels flagext.StringSlice `yaml:"drop_labels" json:"drop_labels" category:"advanced"` + MaxLabelNameLength int `yaml:"max_label_name_length" json:"max_label_name_length"` + MaxLabelValueLength int `yaml:"max_label_value_length" json:"max_label_value_length"` + MaxLabelNamesPerSeries int `yaml:"max_label_names_per_series" json:"max_label_names_per_series"` + MaxMetadataLength int `yaml:"max_metadata_length" json:"max_metadata_length"` + MaxNativeHistogramBuckets int `yaml:"max_native_histogram_buckets" json:"max_native_histogram_buckets"` + CreationGracePeriod model.Duration `yaml:"creation_grace_period" json:"creation_grace_period" category:"advanced"` + EnforceMetadataMetricName bool `yaml:"enforce_metadata_metric_name" json:"enforce_metadata_metric_name" category:"advanced"` + IngestionTenantShardSize int `yaml:"ingestion_tenant_shard_size" json:"ingestion_tenant_shard_size"` + MetricRelabelConfigs []*relabel.Config `yaml:"metric_relabel_configs,omitempty" json:"metric_relabel_configs,omitempty" doc:"nocli|description=List of metric relabel configurations. Note that in most situations, it is more effective to use metrics relabeling directly in the Prometheus server, e.g. remote_write.write_relabel_configs. Labels available during the relabeling phase and cleaned afterwards: __meta_tenant_id" category:"experimental"` + EnableServiceUnavailableErrorOnRateLimit bool `yaml:"enable_service_unavailable_error_on_rate_limit" json:"enable_service_unavailable_error_on_rate_limit" category:"experimental"` // Ingester enforced limits. // Series MaxGlobalSeriesPerUser int `yaml:"max_global_series_per_user" json:"max_global_series_per_user"` @@ -185,6 +186,7 @@ type Limits struct { func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.IntVar(&l.IngestionTenantShardSize, "distributor.ingestion-tenant-shard-size", 0, "The tenant's shard size used by shuffle-sharding. This value is the total size of the shard (ie. it is not the number of ingesters in the shard per zone, but the number of ingesters in the shard across all zones, if zone-awareness is enabled). Must be set both on ingesters and distributors. 0 disables shuffle sharding.") f.Float64Var(&l.RequestRate, requestRateFlag, 0, "Per-tenant push request rate limit in requests per second. 0 to disable.") + f.BoolVar(&l.EnableServiceUnavailableErrorOnRateLimit, enableServiceUnavailableErrorOnRateLimitFlag, false, "Flag to determines the error code returned when distributor rate limits are reached. If set to true, the error code will be 503; if set to false, a 429 error is returned") f.IntVar(&l.RequestBurstSize, requestBurstSizeFlag, 0, "Per-tenant allowed push request burst size. 
0 to disable.") f.Float64Var(&l.IngestionRate, ingestionRateFlag, 10000, "Per-tenant ingestion rate limit in samples per second.") f.IntVar(&l.IngestionBurstSize, ingestionBurstSizeFlag, 200000, "Per-tenant allowed ingestion burst size (in number of samples).") @@ -435,6 +437,11 @@ func (o *Overrides) AcceptHASamples(userID string) bool { return o.getOverridesForUser(userID).AcceptHASamples } +// EnableServiceUnavailableErrorOnRateLimit return whether the distributor uses error code 503 instead of 429 when the rate limit is exceeded. +func (o *Overrides) EnableServiceUnavailableErrorOnRateLimit(userID string) bool { + return o.getOverridesForUser(userID).EnableServiceUnavailableErrorOnRateLimit +} + // HAClusterLabel returns the cluster label to look for when deciding whether to accept a sample from a Prometheus HA replica. func (o *Overrides) HAClusterLabel(userID string) string { return o.getOverridesForUser(userID).HAClusterLabel From b5e53e2bcd7e13c2be8ddb26158fa94912af6cc1 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Wed, 16 Aug 2023 11:47:28 +0200 Subject: [PATCH 2/8] change error code to 529 --- CHANGELOG.md | 2 +- cmd/mimir/config-descriptor.json | 6 +- cmd/mimir/help-all.txt.tmpl | 4 +- .../mimir/configure/about-versioning.md | 4 +- .../configuration-parameters/index.md | 6 +- pkg/distributor/distributor.go | 9 +- pkg/distributor/distributor_test.go | 48 +++++----- pkg/util/validation/limits.go | 96 +++++++++---------- 8 files changed, 89 insertions(+), 86 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c4480273de..d6513ec1f2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ * [CHANGE] Ingester: Do not log errors related to hitting per-instance limits to reduce resource usage when ingesters are under pressure. #5585 * [CHANGE] gRPC clients: use default connect timeout of 5s, and therefore enable default connect backoff max delay of 5s. #5562 * [CHANGE] The `-shutdown-delay` flag is no longer experimental. #5701 -* [FEATURE] Introduced `distributor.enable_service_unavailable_error_on_rate_limit` flag for configuring error code to 503 instead of 429 upon rate limit exhaustion. #5752 +* [FEATURE] Introduced `distributor.enable_service_overload_error_on_rate_limit` flag for configuring error code to 529 instead of 429 upon rate limit exhaustion. #5752 * [FEATURE] Cardinality API: Add a new `count_method` parameter which enables counting active series #5136 * [FEATURE] Query-frontend: added experimental support to cache cardinality, label names and label values query responses. The cache will be used when `-query-frontend.cache-results` is enabled, and `-query-frontend.results-cache-ttl-for-cardinality-query` or `-query-frontend.results-cache-ttl-for-labels-query` set to a value greater than 0. The following metrics have been added to track the query results cache hit ratio per `request_type`: #5212 #5235 #5426 #5524 * `cortex_frontend_query_result_cache_requests_total{request_type="query_range|cardinality|label_names_and_values"}` diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 38dd3e09d9d..d599d3ebe85 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -3022,12 +3022,12 @@ }, { "kind": "field", - "name": "enable_service_unavailable_error_on_rate_limit", + "name": "enable_service_overload_error_on_rate_limit", "required": false, - "desc": "Flag to determines the error code returned when distributor rate limits are reached. 
If set to true, the error code will be 503; if set to false, a 429 error is returned", + "desc": "Flag to determines the error code returned when distributor rate limits are reached. If set to true, the error code will be 529; if set to false, a 429 error is returned", "fieldValue": null, "fieldDefaultValue": false, - "fieldFlag": "distributor.enable_service_unavailable_error_on_rate_limit", + "fieldFlag": "distributor.enable-service-overload-error-on-rate-limit", "fieldType": "boolean", "fieldCategory": "experimental" }, diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index c7e62956c15..54c54f8c241 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1031,8 +1031,8 @@ Usage of ./cmd/mimir/mimir: How frequently to clean up clients for ingesters that have gone away. (default 15s) -distributor.drop-label string This flag can be used to specify label names that to drop during sample ingestion within the distributor and can be repeated in order to drop multiple labels. - -distributor.enable_service_unavailable_error_on_rate_limit - [experimental] Flag to determines the error code returned when distributor rate limits are reached. If set to true, the error code will be 503; if set to false, a 429 error is returned + -distributor.enable-service-overload-error-on-rate-limit + [experimental] Flag to determines the error code returned when distributor rate limits are reached. If set to true, the error code will be 529; if set to false, a 429 error is returned -distributor.ha-tracker.cluster string Prometheus label to look for in samples to identify a Prometheus HA cluster. (default "cluster") -distributor.ha-tracker.consul.acl-token string diff --git a/docs/sources/mimir/configure/about-versioning.md b/docs/sources/mimir/configure/about-versioning.md index f2cf99336fc..7c71eb70f1c 100644 --- a/docs/sources/mimir/configure/about-versioning.md +++ b/docs/sources/mimir/configure/about-versioning.md @@ -60,8 +60,8 @@ The following features are currently experimental: - Distributor - Metrics relabeling - OTLP ingestion path - - Configuring error code to 503 instead of 429 upon rate limit exhaustion. - - `distributor.enable-service-unavailable-error-on-rate-limit` + - Configuring error code to 529 instead of 429 upon rate limit exhaustion. + - `distributor.enable-service-overload-error-on-rate-limit` - Hash ring - Disabling ring heartbeat timeouts - `-distributor.ring.heartbeat-timeout=0` diff --git a/docs/sources/mimir/references/configuration-parameters/index.md b/docs/sources/mimir/references/configuration-parameters/index.md index 13211db3d57..550e294536e 100644 --- a/docs/sources/mimir/references/configuration-parameters/index.md +++ b/docs/sources/mimir/references/configuration-parameters/index.md @@ -2754,10 +2754,10 @@ The `limits` block configures default and per-tenant limits imposed by component [metric_relabel_configs: | default = ] # (experimental) Flag to determines the error code returned when distributor -# rate limits are reached. If set to true, the error code will be 503; if set to +# rate limits are reached. 
If set to true, the error code will be 529; if set to # false, a 429 error is returned -# CLI flag: -distributor.enable_service_unavailable_error_on_rate_limit -[enable_service_unavailable_error_on_rate_limit: | default = false] +# CLI flag: -distributor.enable-service-overload-error-on-rate-limit +[enable_service_overload_error_on_rate_limit: | default = false] # The maximum number of in-memory series per tenant, across the cluster before # replication. 0 to disable. diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index ac4615c9616..d7f287f9bd6 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -77,6 +77,8 @@ const ( // Size of "slab" when using pooled buffers for marshaling write requests. When handling single Push request // buffers for multiple write requests sent to ingesters will be allocated from single "slab", if there is enough space. writeRequestSlabPoolSize = 512 * 1024 + + statusServiceOverload = 529 ) // Distributor forwards appends and queries to individual ingesters. @@ -1038,11 +1040,12 @@ func (d *Distributor) limitsMiddleware(next push.Func) push.Func { if !d.requestRateLimiter.AllowN(now, userID, 1) { d.discardedRequestsRateLimited.WithLabelValues(userID).Add(1) - // Return a 429 or a 503 here depending on configuration to tell the client it is going too fast. + // Return a 429 or a 529 here depending on configuration to tell the client it is going too fast. // Client may discard the data or slow down and re-send. // Prometheus v2.26 added a remote-write option 'retry_on_http_429'. - if d.limits.EnableServiceUnavailableErrorOnRateLimit(userID) { - return nil, httpgrpc.Errorf(http.StatusServiceUnavailable, validation.NewRequestRateLimitedError(d.limits.RequestRate(userID), d.limits.RequestBurstSize(userID)).Error()) + if d.limits.EnableServiceOverloadErrorOnRateLimit(userID) { + // hard coded 529 error since http parckage does not contain a 529 error code + return nil, httpgrpc.Errorf(statusServiceOverload, validation.NewRequestRateLimitedError(d.limits.RequestRate(userID), d.limits.RequestBurstSize(userID)).Error()) } return nil, httpgrpc.Errorf(http.StatusTooManyRequests, validation.NewRequestRateLimitedError(d.limits.RequestRate(userID), d.limits.RequestBurstSize(userID)).Error()) } diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index ce3dc7fca1b..cf5bae78a7b 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -472,17 +472,17 @@ func TestDistributor_PushRequestRateLimiter(t *testing.T) { } ctx := user.InjectOrgID(context.Background(), "user") tests := map[string]struct { - distributors int - requestRate float64 - requestBurstSize int - pushes []testPush - enableServiceUnavailableError bool + distributors int + requestRate float64 + requestBurstSize int + pushes []testPush + enableServiceOverloadError bool }{ "request limit should be evenly shared across distributors": { - distributors: 2, - requestRate: 4, - requestBurstSize: 2, - enableServiceUnavailableError: false, + distributors: 2, + requestRate: 4, + requestBurstSize: 2, + enableServiceOverloadError: false, pushes: []testPush{ {expectedError: nil}, {expectedError: nil}, @@ -490,10 +490,10 @@ func TestDistributor_PushRequestRateLimiter(t *testing.T) { }, }, "request limit is disabled when set to 0": { - distributors: 2, - requestRate: 0, - requestBurstSize: 0, - enableServiceUnavailableError: false, + distributors: 2, + requestRate: 0, + requestBurstSize: 0, + 
enableServiceOverloadError: false, pushes: []testPush{ {expectedError: nil}, {expectedError: nil}, @@ -501,10 +501,10 @@ func TestDistributor_PushRequestRateLimiter(t *testing.T) { }, }, "request burst should set to each distributor": { - distributors: 2, - requestRate: 2, - requestBurstSize: 3, - enableServiceUnavailableError: false, + distributors: 2, + requestRate: 2, + requestBurstSize: 3, + enableServiceOverloadError: false, pushes: []testPush{ {expectedError: nil}, {expectedError: nil}, @@ -512,15 +512,15 @@ func TestDistributor_PushRequestRateLimiter(t *testing.T) { {expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, validation.NewRequestRateLimitedError(2, 3).Error())}, }, }, - "request limit is reached return StatusServiceUnavailable when enable service unavailable error set to true": { - distributors: 2, - requestRate: 4, - requestBurstSize: 2, - enableServiceUnavailableError: true, + "request limit is reached return 529 when enable service overload error set to true": { + distributors: 2, + requestRate: 4, + requestBurstSize: 2, + enableServiceOverloadError: true, pushes: []testPush{ {expectedError: nil}, {expectedError: nil}, - {expectedError: httpgrpc.Errorf(http.StatusServiceUnavailable, validation.NewRequestRateLimitedError(4, 2).Error())}, + {expectedError: httpgrpc.Errorf(statusServiceOverload, validation.NewRequestRateLimitedError(4, 2).Error())}, }, }, } @@ -533,7 +533,7 @@ func TestDistributor_PushRequestRateLimiter(t *testing.T) { flagext.DefaultValues(limits) limits.RequestRate = testData.requestRate limits.RequestBurstSize = testData.requestBurstSize - limits.EnableServiceUnavailableErrorOnRateLimit = testData.enableServiceUnavailableError + limits.EnableServiceOverloadErrorOnRateLimit = testData.enableServiceOverloadError // Start all expected distributors distributors, _, _ := prepare(t, prepConfig{ diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 9b60f49dc5b..be6bd895580 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -26,31 +26,31 @@ import ( ) const ( - MaxSeriesPerMetricFlag = "ingester.max-global-series-per-metric" - MaxMetadataPerMetricFlag = "ingester.max-global-metadata-per-metric" - MaxSeriesPerUserFlag = "ingester.max-global-series-per-user" - MaxMetadataPerUserFlag = "ingester.max-global-metadata-per-user" - MaxChunksPerQueryFlag = "querier.max-fetched-chunks-per-query" - MaxChunkBytesPerQueryFlag = "querier.max-fetched-chunk-bytes-per-query" - MaxSeriesPerQueryFlag = "querier.max-fetched-series-per-query" - maxLabelNamesPerSeriesFlag = "validation.max-label-names-per-series" - maxLabelNameLengthFlag = "validation.max-length-label-name" - maxLabelValueLengthFlag = "validation.max-length-label-value" - maxMetadataLengthFlag = "validation.max-metadata-length" - maxNativeHistogramBucketsFlag = "validation.max-native-histogram-buckets" - creationGracePeriodFlag = "validation.create-grace-period" - maxPartialQueryLengthFlag = "querier.max-partial-query-length" - maxTotalQueryLengthFlag = "query-frontend.max-total-query-length" - maxQueryExpressionSizeBytesFlag = "query-frontend.max-query-expression-size-bytes" - requestRateFlag = "distributor.request-rate-limit" - enableServiceUnavailableErrorOnRateLimitFlag = "distributor.enable_service_unavailable_error_on_rate_limit" - requestBurstSizeFlag = "distributor.request-burst-size" - ingestionRateFlag = "distributor.ingestion-rate-limit" - ingestionBurstSizeFlag = "distributor.ingestion-burst-size" - HATrackerMaxClustersFlag = 
"distributor.ha-tracker.max-clusters" - resultsCacheTTLFlag = "query-frontend.results-cache-ttl" - resultsCacheTTLForOutOfOrderWindowFlag = "query-frontend.results-cache-ttl-for-out-of-order-time-window" - QueryIngestersWithinFlag = "querier.query-ingesters-within" + MaxSeriesPerMetricFlag = "ingester.max-global-series-per-metric" + MaxMetadataPerMetricFlag = "ingester.max-global-metadata-per-metric" + MaxSeriesPerUserFlag = "ingester.max-global-series-per-user" + MaxMetadataPerUserFlag = "ingester.max-global-metadata-per-user" + MaxChunksPerQueryFlag = "querier.max-fetched-chunks-per-query" + MaxChunkBytesPerQueryFlag = "querier.max-fetched-chunk-bytes-per-query" + MaxSeriesPerQueryFlag = "querier.max-fetched-series-per-query" + maxLabelNamesPerSeriesFlag = "validation.max-label-names-per-series" + maxLabelNameLengthFlag = "validation.max-length-label-name" + maxLabelValueLengthFlag = "validation.max-length-label-value" + maxMetadataLengthFlag = "validation.max-metadata-length" + maxNativeHistogramBucketsFlag = "validation.max-native-histogram-buckets" + creationGracePeriodFlag = "validation.create-grace-period" + maxPartialQueryLengthFlag = "querier.max-partial-query-length" + maxTotalQueryLengthFlag = "query-frontend.max-total-query-length" + maxQueryExpressionSizeBytesFlag = "query-frontend.max-query-expression-size-bytes" + requestRateFlag = "distributor.request-rate-limit" + enableServiceOverloadErrorOnRateLimitFlag = "distributor.enable-service-overload-error-on-rate-limit" + requestBurstSizeFlag = "distributor.request-burst-size" + ingestionRateFlag = "distributor.ingestion-rate-limit" + ingestionBurstSizeFlag = "distributor.ingestion-burst-size" + HATrackerMaxClustersFlag = "distributor.ha-tracker.max-clusters" + resultsCacheTTLFlag = "query-frontend.results-cache-ttl" + resultsCacheTTLForOutOfOrderWindowFlag = "query-frontend.results-cache-ttl-for-out-of-order-time-window" + QueryIngestersWithinFlag = "querier.query-ingesters-within" // MinCompactorPartialBlockDeletionDelay is the minimum partial blocks deletion delay that can be configured in Mimir. MinCompactorPartialBlockDeletionDelay = 4 * time.Hour @@ -67,25 +67,25 @@ func (e LimitError) Error() string { // limits via flags, or per-user limits via yaml config. type Limits struct { // Distributor enforced limits. 
- RequestRate float64 `yaml:"request_rate" json:"request_rate"` - RequestBurstSize int `yaml:"request_burst_size" json:"request_burst_size"` - IngestionRate float64 `yaml:"ingestion_rate" json:"ingestion_rate"` - IngestionBurstSize int `yaml:"ingestion_burst_size" json:"ingestion_burst_size"` - AcceptHASamples bool `yaml:"accept_ha_samples" json:"accept_ha_samples"` - HAClusterLabel string `yaml:"ha_cluster_label" json:"ha_cluster_label"` - HAReplicaLabel string `yaml:"ha_replica_label" json:"ha_replica_label"` - HAMaxClusters int `yaml:"ha_max_clusters" json:"ha_max_clusters"` - DropLabels flagext.StringSlice `yaml:"drop_labels" json:"drop_labels" category:"advanced"` - MaxLabelNameLength int `yaml:"max_label_name_length" json:"max_label_name_length"` - MaxLabelValueLength int `yaml:"max_label_value_length" json:"max_label_value_length"` - MaxLabelNamesPerSeries int `yaml:"max_label_names_per_series" json:"max_label_names_per_series"` - MaxMetadataLength int `yaml:"max_metadata_length" json:"max_metadata_length"` - MaxNativeHistogramBuckets int `yaml:"max_native_histogram_buckets" json:"max_native_histogram_buckets"` - CreationGracePeriod model.Duration `yaml:"creation_grace_period" json:"creation_grace_period" category:"advanced"` - EnforceMetadataMetricName bool `yaml:"enforce_metadata_metric_name" json:"enforce_metadata_metric_name" category:"advanced"` - IngestionTenantShardSize int `yaml:"ingestion_tenant_shard_size" json:"ingestion_tenant_shard_size"` - MetricRelabelConfigs []*relabel.Config `yaml:"metric_relabel_configs,omitempty" json:"metric_relabel_configs,omitempty" doc:"nocli|description=List of metric relabel configurations. Note that in most situations, it is more effective to use metrics relabeling directly in the Prometheus server, e.g. remote_write.write_relabel_configs. 
Labels available during the relabeling phase and cleaned afterwards: __meta_tenant_id" category:"experimental"` - EnableServiceUnavailableErrorOnRateLimit bool `yaml:"enable_service_unavailable_error_on_rate_limit" json:"enable_service_unavailable_error_on_rate_limit" category:"experimental"` + RequestRate float64 `yaml:"request_rate" json:"request_rate"` + RequestBurstSize int `yaml:"request_burst_size" json:"request_burst_size"` + IngestionRate float64 `yaml:"ingestion_rate" json:"ingestion_rate"` + IngestionBurstSize int `yaml:"ingestion_burst_size" json:"ingestion_burst_size"` + AcceptHASamples bool `yaml:"accept_ha_samples" json:"accept_ha_samples"` + HAClusterLabel string `yaml:"ha_cluster_label" json:"ha_cluster_label"` + HAReplicaLabel string `yaml:"ha_replica_label" json:"ha_replica_label"` + HAMaxClusters int `yaml:"ha_max_clusters" json:"ha_max_clusters"` + DropLabels flagext.StringSlice `yaml:"drop_labels" json:"drop_labels" category:"advanced"` + MaxLabelNameLength int `yaml:"max_label_name_length" json:"max_label_name_length"` + MaxLabelValueLength int `yaml:"max_label_value_length" json:"max_label_value_length"` + MaxLabelNamesPerSeries int `yaml:"max_label_names_per_series" json:"max_label_names_per_series"` + MaxMetadataLength int `yaml:"max_metadata_length" json:"max_metadata_length"` + MaxNativeHistogramBuckets int `yaml:"max_native_histogram_buckets" json:"max_native_histogram_buckets"` + CreationGracePeriod model.Duration `yaml:"creation_grace_period" json:"creation_grace_period" category:"advanced"` + EnforceMetadataMetricName bool `yaml:"enforce_metadata_metric_name" json:"enforce_metadata_metric_name" category:"advanced"` + IngestionTenantShardSize int `yaml:"ingestion_tenant_shard_size" json:"ingestion_tenant_shard_size"` + MetricRelabelConfigs []*relabel.Config `yaml:"metric_relabel_configs,omitempty" json:"metric_relabel_configs,omitempty" doc:"nocli|description=List of metric relabel configurations. Note that in most situations, it is more effective to use metrics relabeling directly in the Prometheus server, e.g. remote_write.write_relabel_configs. Labels available during the relabeling phase and cleaned afterwards: __meta_tenant_id" category:"experimental"` + EnableServiceOverloadErrorOnRateLimit bool `yaml:"enable_service_overload_error_on_rate_limit" json:"enable_service_overload_error_on_rate_limit" category:"experimental"` // Ingester enforced limits. // Series MaxGlobalSeriesPerUser int `yaml:"max_global_series_per_user" json:"max_global_series_per_user"` @@ -186,7 +186,7 @@ type Limits struct { func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.IntVar(&l.IngestionTenantShardSize, "distributor.ingestion-tenant-shard-size", 0, "The tenant's shard size used by shuffle-sharding. This value is the total size of the shard (ie. it is not the number of ingesters in the shard per zone, but the number of ingesters in the shard across all zones, if zone-awareness is enabled). Must be set both on ingesters and distributors. 0 disables shuffle sharding.") f.Float64Var(&l.RequestRate, requestRateFlag, 0, "Per-tenant push request rate limit in requests per second. 0 to disable.") - f.BoolVar(&l.EnableServiceUnavailableErrorOnRateLimit, enableServiceUnavailableErrorOnRateLimitFlag, false, "Flag to determines the error code returned when distributor rate limits are reached. 
If set to true, the error code will be 503; if set to false, a 429 error is returned") + f.BoolVar(&l.EnableServiceOverloadErrorOnRateLimit, enableServiceOverloadErrorOnRateLimitFlag, false, "Flag to determines the error code returned when distributor rate limits are reached. If set to true, the error code will be 529; if set to false, a 429 error is returned") f.IntVar(&l.RequestBurstSize, requestBurstSizeFlag, 0, "Per-tenant allowed push request burst size. 0 to disable.") f.Float64Var(&l.IngestionRate, ingestionRateFlag, 10000, "Per-tenant ingestion rate limit in samples per second.") f.IntVar(&l.IngestionBurstSize, ingestionBurstSizeFlag, 200000, "Per-tenant allowed ingestion burst size (in number of samples).") @@ -437,9 +437,9 @@ func (o *Overrides) AcceptHASamples(userID string) bool { return o.getOverridesForUser(userID).AcceptHASamples } -// EnableServiceUnavailableErrorOnRateLimit return whether the distributor uses error code 503 instead of 429 when the rate limit is exceeded. -func (o *Overrides) EnableServiceUnavailableErrorOnRateLimit(userID string) bool { - return o.getOverridesForUser(userID).EnableServiceUnavailableErrorOnRateLimit +// EnableServiceOverloadErrorOnRateLimit return whether the distributor uses error code 529 instead of 429 when the rate limit is exceeded. +func (o *Overrides) EnableServiceOverloadErrorOnRateLimit(userID string) bool { + return o.getOverridesForUser(userID).EnableServiceOverloadErrorOnRateLimit } // HAClusterLabel returns the cluster label to look for when deciding whether to accept a sample from a Prometheus HA replica. From 21a2eb850a770de9fffb2a2a06d80ccffd51d6f9 Mon Sep 17 00:00:00 2001 From: ying-jeanne <74549700+ying-jeanne@users.noreply.github.com> Date: Thu, 17 Aug 2023 11:06:40 +0200 Subject: [PATCH 3/8] Update pkg/distributor/distributor.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Peter Štibraný --- pkg/distributor/distributor.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index d7f287f9bd6..668320959cf 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -78,6 +78,7 @@ const ( // buffers for multiple write requests sent to ingesters will be allocated from single "slab", if there is enough space. writeRequestSlabPoolSize = 512 * 1024 + // until http.StatusServiceOverload exists statusServiceOverload = 529 ) From a2d4630fd7e7b57bef21a98a96f0847012ecde5c Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 17 Aug 2023 12:07:13 +0200 Subject: [PATCH 4/8] address comments --- CHANGELOG.md | 2 +- cmd/mimir/config-descriptor.json | 6 +- cmd/mimir/help-all.txt.tmpl | 4 +- .../mimir/configure/about-versioning.md | 2 +- .../configuration-parameters/index.md | 10 +- pkg/distributor/distributor.go | 3 +- pkg/distributor/distributor_test.go | 23 ++--- pkg/util/validation/limits.go | 95 +++++++++---------- 8 files changed, 70 insertions(+), 75 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d6513ec1f2c..40419846cbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ * [CHANGE] Ingester: Do not log errors related to hitting per-instance limits to reduce resource usage when ingesters are under pressure. #5585 * [CHANGE] gRPC clients: use default connect timeout of 5s, and therefore enable default connect backoff max delay of 5s. #5562 * [CHANGE] The `-shutdown-delay` flag is no longer experimental. 
#5701 -* [FEATURE] Introduced `distributor.enable_service_overload_error_on_rate_limit` flag for configuring error code to 529 instead of 429 upon rate limit exhaustion. #5752 +* [FEATURE] Introduced `distributor.service_overload_status_code_on_rate_limit_enabled` flag for configuring error code to 529 instead of 429 upon rate limit exhaustion. #5752 * [FEATURE] Cardinality API: Add a new `count_method` parameter which enables counting active series #5136 * [FEATURE] Query-frontend: added experimental support to cache cardinality, label names and label values query responses. The cache will be used when `-query-frontend.cache-results` is enabled, and `-query-frontend.results-cache-ttl-for-cardinality-query` or `-query-frontend.results-cache-ttl-for-labels-query` set to a value greater than 0. The following metrics have been added to track the query results cache hit ratio per `request_type`: #5212 #5235 #5426 #5524 * `cortex_frontend_query_result_cache_requests_total{request_type="query_range|cardinality|label_names_and_values"}` diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index d599d3ebe85..b04478fc2fe 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -3022,12 +3022,12 @@ }, { "kind": "field", - "name": "enable_service_overload_error_on_rate_limit", + "name": "service_overload_status_code_on_rate_limit_enabled", "required": false, - "desc": "Flag to determines the error code returned when distributor rate limits are reached. If set to true, the error code will be 529; if set to false, a 429 error is returned", + "desc": "If enabled, rate limit errors will be reported to the client with HTTP status code 529 (Service is overloaded). If disabled, status code 429 (Too Many Requests) is used.", "fieldValue": null, "fieldDefaultValue": false, - "fieldFlag": "distributor.enable-service-overload-error-on-rate-limit", + "fieldFlag": "distributor.service-overload-status-code-on-rate-limit-enabled", "fieldType": "boolean", "fieldCategory": "experimental" }, diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 54c54f8c241..855ef1ce6ee 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1031,8 +1031,6 @@ Usage of ./cmd/mimir/mimir: How frequently to clean up clients for ingesters that have gone away. (default 15s) -distributor.drop-label string This flag can be used to specify label names that to drop during sample ingestion within the distributor and can be repeated in order to drop multiple labels. - -distributor.enable-service-overload-error-on-rate-limit - [experimental] Flag to determines the error code returned when distributor rate limits are reached. If set to true, the error code will be 529; if set to false, a 429 error is returned -distributor.ha-tracker.cluster string Prometheus label to look for in samples to identify a Prometheus HA cluster. (default "cluster") -distributor.ha-tracker.consul.acl-token string @@ -1189,6 +1187,8 @@ Usage of ./cmd/mimir/mimir: The prefix for the keys in the store. Should end with a /. (default "collectors/") -distributor.ring.store string Backend storage to use for the ring. Supported values are: consul, etcd, inmemory, memberlist, multi. (default "memberlist") + -distributor.service-overload-status-code-on-rate-limit-enabled + [experimental] If enabled, rate limit errors will be reported to the client with HTTP status code 529 (Service is overloaded). If disabled, status code 429 (Too Many Requests) is used. 
-enable-go-runtime-metrics Set to true to enable all Go runtime metrics, such as go_sched_* and go_memstats_*. -flusher.exit-after-flush diff --git a/docs/sources/mimir/configure/about-versioning.md b/docs/sources/mimir/configure/about-versioning.md index 7c71eb70f1c..a65490a7d23 100644 --- a/docs/sources/mimir/configure/about-versioning.md +++ b/docs/sources/mimir/configure/about-versioning.md @@ -61,7 +61,7 @@ The following features are currently experimental: - Metrics relabeling - OTLP ingestion path - Configuring error code to 529 instead of 429 upon rate limit exhaustion. - - `distributor.enable-service-overload-error-on-rate-limit` + - `distributor.service-overload-status-code-on-rate-limit-enabled` - Hash ring - Disabling ring heartbeat timeouts - `-distributor.ring.heartbeat-timeout=0` diff --git a/docs/sources/mimir/references/configuration-parameters/index.md b/docs/sources/mimir/references/configuration-parameters/index.md index 550e294536e..274f53a3e1e 100644 --- a/docs/sources/mimir/references/configuration-parameters/index.md +++ b/docs/sources/mimir/references/configuration-parameters/index.md @@ -2753,11 +2753,11 @@ The `limits` block configures default and per-tenant limits imposed by component # during the relabeling phase and cleaned afterwards: __meta_tenant_id [metric_relabel_configs: | default = ] -# (experimental) Flag to determines the error code returned when distributor -# rate limits are reached. If set to true, the error code will be 529; if set to -# false, a 429 error is returned -# CLI flag: -distributor.enable-service-overload-error-on-rate-limit -[enable_service_overload_error_on_rate_limit: | default = false] +# (experimental) If enabled, rate limit errors will be reported to the client +# with HTTP status code 529 (Service is overloaded). If disabled, status code +# 429 (Too Many Requests) is used. +# CLI flag: -distributor.service-overload-status-code-on-rate-limit-enabled +[service_overload_status_code_on_rate_limit_enabled: | default = false] # The maximum number of in-memory series per tenant, across the cluster before # replication. 0 to disable. diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 668320959cf..72b301101d6 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -1044,8 +1044,7 @@ func (d *Distributor) limitsMiddleware(next push.Func) push.Func { // Return a 429 or a 529 here depending on configuration to tell the client it is going too fast. // Client may discard the data or slow down and re-send. // Prometheus v2.26 added a remote-write option 'retry_on_http_429'. 
- if d.limits.EnableServiceOverloadErrorOnRateLimit(userID) { - // hard coded 529 error since http parckage does not contain a 529 error code + if d.limits.ServiceOverloadStatusCodeOnRateLimitEnabled(userID) { return nil, httpgrpc.Errorf(statusServiceOverload, validation.NewRequestRateLimitedError(d.limits.RequestRate(userID), d.limits.RequestBurstSize(userID)).Error()) } return nil, httpgrpc.Errorf(http.StatusTooManyRequests, validation.NewRequestRateLimitedError(d.limits.RequestRate(userID), d.limits.RequestBurstSize(userID)).Error()) diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index cf5bae78a7b..08a458bff27 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -479,10 +479,9 @@ func TestDistributor_PushRequestRateLimiter(t *testing.T) { enableServiceOverloadError bool }{ "request limit should be evenly shared across distributors": { - distributors: 2, - requestRate: 4, - requestBurstSize: 2, - enableServiceOverloadError: false, + distributors: 2, + requestRate: 4, + requestBurstSize: 2, pushes: []testPush{ {expectedError: nil}, {expectedError: nil}, @@ -490,10 +489,9 @@ func TestDistributor_PushRequestRateLimiter(t *testing.T) { }, }, "request limit is disabled when set to 0": { - distributors: 2, - requestRate: 0, - requestBurstSize: 0, - enableServiceOverloadError: false, + distributors: 2, + requestRate: 0, + requestBurstSize: 0, pushes: []testPush{ {expectedError: nil}, {expectedError: nil}, @@ -501,10 +499,9 @@ func TestDistributor_PushRequestRateLimiter(t *testing.T) { }, }, "request burst should set to each distributor": { - distributors: 2, - requestRate: 2, - requestBurstSize: 3, - enableServiceOverloadError: false, + distributors: 2, + requestRate: 2, + requestBurstSize: 3, pushes: []testPush{ {expectedError: nil}, {expectedError: nil}, @@ -533,7 +530,7 @@ func TestDistributor_PushRequestRateLimiter(t *testing.T) { flagext.DefaultValues(limits) limits.RequestRate = testData.requestRate limits.RequestBurstSize = testData.requestBurstSize - limits.EnableServiceOverloadErrorOnRateLimit = testData.enableServiceOverloadError + limits.ServiceOverloadStatusCodeOnRateLimitEnabled = testData.enableServiceOverloadError // Start all expected distributors distributors, _, _ := prepare(t, prepConfig{ diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index be6bd895580..03abf481788 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -26,31 +26,30 @@ import ( ) const ( - MaxSeriesPerMetricFlag = "ingester.max-global-series-per-metric" - MaxMetadataPerMetricFlag = "ingester.max-global-metadata-per-metric" - MaxSeriesPerUserFlag = "ingester.max-global-series-per-user" - MaxMetadataPerUserFlag = "ingester.max-global-metadata-per-user" - MaxChunksPerQueryFlag = "querier.max-fetched-chunks-per-query" - MaxChunkBytesPerQueryFlag = "querier.max-fetched-chunk-bytes-per-query" - MaxSeriesPerQueryFlag = "querier.max-fetched-series-per-query" - maxLabelNamesPerSeriesFlag = "validation.max-label-names-per-series" - maxLabelNameLengthFlag = "validation.max-length-label-name" - maxLabelValueLengthFlag = "validation.max-length-label-value" - maxMetadataLengthFlag = "validation.max-metadata-length" - maxNativeHistogramBucketsFlag = "validation.max-native-histogram-buckets" - creationGracePeriodFlag = "validation.create-grace-period" - maxPartialQueryLengthFlag = "querier.max-partial-query-length" - maxTotalQueryLengthFlag = "query-frontend.max-total-query-length" - 
maxQueryExpressionSizeBytesFlag = "query-frontend.max-query-expression-size-bytes" - requestRateFlag = "distributor.request-rate-limit" - enableServiceOverloadErrorOnRateLimitFlag = "distributor.enable-service-overload-error-on-rate-limit" - requestBurstSizeFlag = "distributor.request-burst-size" - ingestionRateFlag = "distributor.ingestion-rate-limit" - ingestionBurstSizeFlag = "distributor.ingestion-burst-size" - HATrackerMaxClustersFlag = "distributor.ha-tracker.max-clusters" - resultsCacheTTLFlag = "query-frontend.results-cache-ttl" - resultsCacheTTLForOutOfOrderWindowFlag = "query-frontend.results-cache-ttl-for-out-of-order-time-window" - QueryIngestersWithinFlag = "querier.query-ingesters-within" + MaxSeriesPerMetricFlag = "ingester.max-global-series-per-metric" + MaxMetadataPerMetricFlag = "ingester.max-global-metadata-per-metric" + MaxSeriesPerUserFlag = "ingester.max-global-series-per-user" + MaxMetadataPerUserFlag = "ingester.max-global-metadata-per-user" + MaxChunksPerQueryFlag = "querier.max-fetched-chunks-per-query" + MaxChunkBytesPerQueryFlag = "querier.max-fetched-chunk-bytes-per-query" + MaxSeriesPerQueryFlag = "querier.max-fetched-series-per-query" + maxLabelNamesPerSeriesFlag = "validation.max-label-names-per-series" + maxLabelNameLengthFlag = "validation.max-length-label-name" + maxLabelValueLengthFlag = "validation.max-length-label-value" + maxMetadataLengthFlag = "validation.max-metadata-length" + maxNativeHistogramBucketsFlag = "validation.max-native-histogram-buckets" + creationGracePeriodFlag = "validation.create-grace-period" + maxPartialQueryLengthFlag = "querier.max-partial-query-length" + maxTotalQueryLengthFlag = "query-frontend.max-total-query-length" + maxQueryExpressionSizeBytesFlag = "query-frontend.max-query-expression-size-bytes" + requestRateFlag = "distributor.request-rate-limit" + requestBurstSizeFlag = "distributor.request-burst-size" + ingestionRateFlag = "distributor.ingestion-rate-limit" + ingestionBurstSizeFlag = "distributor.ingestion-burst-size" + HATrackerMaxClustersFlag = "distributor.ha-tracker.max-clusters" + resultsCacheTTLFlag = "query-frontend.results-cache-ttl" + resultsCacheTTLForOutOfOrderWindowFlag = "query-frontend.results-cache-ttl-for-out-of-order-time-window" + QueryIngestersWithinFlag = "querier.query-ingesters-within" // MinCompactorPartialBlockDeletionDelay is the minimum partial blocks deletion delay that can be configured in Mimir. MinCompactorPartialBlockDeletionDelay = 4 * time.Hour @@ -67,25 +66,25 @@ func (e LimitError) Error() string { // limits via flags, or per-user limits via yaml config. type Limits struct { // Distributor enforced limits. 
- RequestRate float64 `yaml:"request_rate" json:"request_rate"` - RequestBurstSize int `yaml:"request_burst_size" json:"request_burst_size"` - IngestionRate float64 `yaml:"ingestion_rate" json:"ingestion_rate"` - IngestionBurstSize int `yaml:"ingestion_burst_size" json:"ingestion_burst_size"` - AcceptHASamples bool `yaml:"accept_ha_samples" json:"accept_ha_samples"` - HAClusterLabel string `yaml:"ha_cluster_label" json:"ha_cluster_label"` - HAReplicaLabel string `yaml:"ha_replica_label" json:"ha_replica_label"` - HAMaxClusters int `yaml:"ha_max_clusters" json:"ha_max_clusters"` - DropLabels flagext.StringSlice `yaml:"drop_labels" json:"drop_labels" category:"advanced"` - MaxLabelNameLength int `yaml:"max_label_name_length" json:"max_label_name_length"` - MaxLabelValueLength int `yaml:"max_label_value_length" json:"max_label_value_length"` - MaxLabelNamesPerSeries int `yaml:"max_label_names_per_series" json:"max_label_names_per_series"` - MaxMetadataLength int `yaml:"max_metadata_length" json:"max_metadata_length"` - MaxNativeHistogramBuckets int `yaml:"max_native_histogram_buckets" json:"max_native_histogram_buckets"` - CreationGracePeriod model.Duration `yaml:"creation_grace_period" json:"creation_grace_period" category:"advanced"` - EnforceMetadataMetricName bool `yaml:"enforce_metadata_metric_name" json:"enforce_metadata_metric_name" category:"advanced"` - IngestionTenantShardSize int `yaml:"ingestion_tenant_shard_size" json:"ingestion_tenant_shard_size"` - MetricRelabelConfigs []*relabel.Config `yaml:"metric_relabel_configs,omitempty" json:"metric_relabel_configs,omitempty" doc:"nocli|description=List of metric relabel configurations. Note that in most situations, it is more effective to use metrics relabeling directly in the Prometheus server, e.g. remote_write.write_relabel_configs. 
Labels available during the relabeling phase and cleaned afterwards: __meta_tenant_id" category:"experimental"` - EnableServiceOverloadErrorOnRateLimit bool `yaml:"enable_service_overload_error_on_rate_limit" json:"enable_service_overload_error_on_rate_limit" category:"experimental"` + RequestRate float64 `yaml:"request_rate" json:"request_rate"` + RequestBurstSize int `yaml:"request_burst_size" json:"request_burst_size"` + IngestionRate float64 `yaml:"ingestion_rate" json:"ingestion_rate"` + IngestionBurstSize int `yaml:"ingestion_burst_size" json:"ingestion_burst_size"` + AcceptHASamples bool `yaml:"accept_ha_samples" json:"accept_ha_samples"` + HAClusterLabel string `yaml:"ha_cluster_label" json:"ha_cluster_label"` + HAReplicaLabel string `yaml:"ha_replica_label" json:"ha_replica_label"` + HAMaxClusters int `yaml:"ha_max_clusters" json:"ha_max_clusters"` + DropLabels flagext.StringSlice `yaml:"drop_labels" json:"drop_labels" category:"advanced"` + MaxLabelNameLength int `yaml:"max_label_name_length" json:"max_label_name_length"` + MaxLabelValueLength int `yaml:"max_label_value_length" json:"max_label_value_length"` + MaxLabelNamesPerSeries int `yaml:"max_label_names_per_series" json:"max_label_names_per_series"` + MaxMetadataLength int `yaml:"max_metadata_length" json:"max_metadata_length"` + MaxNativeHistogramBuckets int `yaml:"max_native_histogram_buckets" json:"max_native_histogram_buckets"` + CreationGracePeriod model.Duration `yaml:"creation_grace_period" json:"creation_grace_period" category:"advanced"` + EnforceMetadataMetricName bool `yaml:"enforce_metadata_metric_name" json:"enforce_metadata_metric_name" category:"advanced"` + IngestionTenantShardSize int `yaml:"ingestion_tenant_shard_size" json:"ingestion_tenant_shard_size"` + MetricRelabelConfigs []*relabel.Config `yaml:"metric_relabel_configs,omitempty" json:"metric_relabel_configs,omitempty" doc:"nocli|description=List of metric relabel configurations. Note that in most situations, it is more effective to use metrics relabeling directly in the Prometheus server, e.g. remote_write.write_relabel_configs. Labels available during the relabeling phase and cleaned afterwards: __meta_tenant_id" category:"experimental"` + ServiceOverloadStatusCodeOnRateLimitEnabled bool `yaml:"service_overload_status_code_on_rate_limit_enabled" json:"service_overload_status_code_on_rate_limit_enabled" category:"experimental"` // Ingester enforced limits. // Series MaxGlobalSeriesPerUser int `yaml:"max_global_series_per_user" json:"max_global_series_per_user"` @@ -186,7 +185,7 @@ type Limits struct { func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.IntVar(&l.IngestionTenantShardSize, "distributor.ingestion-tenant-shard-size", 0, "The tenant's shard size used by shuffle-sharding. This value is the total size of the shard (ie. it is not the number of ingesters in the shard per zone, but the number of ingesters in the shard across all zones, if zone-awareness is enabled). Must be set both on ingesters and distributors. 0 disables shuffle sharding.") f.Float64Var(&l.RequestRate, requestRateFlag, 0, "Per-tenant push request rate limit in requests per second. 0 to disable.") - f.BoolVar(&l.EnableServiceOverloadErrorOnRateLimit, enableServiceOverloadErrorOnRateLimitFlag, false, "Flag to determines the error code returned when distributor rate limits are reached. 
If set to true, the error code will be 529; if set to false, a 429 error is returned") + f.BoolVar(&l.ServiceOverloadStatusCodeOnRateLimitEnabled, "distributor.service-overload-status-code-on-rate-limit-enabled", false, "If enabled, rate limit errors will be reported to the client with HTTP status code 529 (Service is overloaded). If disabled, status code 429 (Too Many Requests) is used.") f.IntVar(&l.RequestBurstSize, requestBurstSizeFlag, 0, "Per-tenant allowed push request burst size. 0 to disable.") f.Float64Var(&l.IngestionRate, ingestionRateFlag, 10000, "Per-tenant ingestion rate limit in samples per second.") f.IntVar(&l.IngestionBurstSize, ingestionBurstSizeFlag, 200000, "Per-tenant allowed ingestion burst size (in number of samples).") @@ -437,9 +436,9 @@ func (o *Overrides) AcceptHASamples(userID string) bool { return o.getOverridesForUser(userID).AcceptHASamples } -// EnableServiceOverloadErrorOnRateLimit return whether the distributor uses error code 529 instead of 429 when the rate limit is exceeded. -func (o *Overrides) EnableServiceOverloadErrorOnRateLimit(userID string) bool { - return o.getOverridesForUser(userID).EnableServiceOverloadErrorOnRateLimit +// ServiceOverloadStatusCodeOnRateLimitEnabled return whether the distributor uses error code 529 instead of 429 when the rate limit is exceeded. +func (o *Overrides) ServiceOverloadStatusCodeOnRateLimitEnabled(userID string) bool { + return o.getOverridesForUser(userID).ServiceOverloadStatusCodeOnRateLimitEnabled } // HAClusterLabel returns the cluster label to look for when deciding whether to accept a sample from a Prometheus HA replica. From 746b5f94bee955585abcd4fcd3f31e3efec4783f Mon Sep 17 00:00:00 2001 From: ying-jeanne <74549700+ying-jeanne@users.noreply.github.com> Date: Thu, 17 Aug 2023 14:00:32 +0200 Subject: [PATCH 5/8] Update pkg/util/validation/limits.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Peter Štibraný --- pkg/util/validation/limits.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 6e2ddb62ada..36d9f59fa3b 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -436,7 +436,7 @@ func (o *Overrides) AcceptHASamples(userID string) bool { return o.getOverridesForUser(userID).AcceptHASamples } -// ServiceOverloadStatusCodeOnRateLimitEnabled return whether the distributor uses error code 529 instead of 429 when the rate limit is exceeded. +// ServiceOverloadStatusCodeOnRateLimitEnabled returns whether the distributor uses status code 529 instead of 429 when the rate limit is exceeded. func (o *Overrides) ServiceOverloadStatusCodeOnRateLimitEnabled(userID string) bool { return o.getOverridesForUser(userID).ServiceOverloadStatusCodeOnRateLimitEnabled } From e7e39271657e0c55b83c8ac9e7265f3056442232 Mon Sep 17 00:00:00 2001 From: ying-jeanne <74549700+ying-jeanne@users.noreply.github.com> Date: Thu, 17 Aug 2023 14:00:48 +0200 Subject: [PATCH 6/8] Update CHANGELOG.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Peter Štibraný --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 98a19c7804d..069f6e853bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,7 +16,7 @@ * [CHANGE] The `-shutdown-delay` flag is no longer experimental.
#5701 * [CHANGE] The `-validation.create-grace-period` is now enforced in the ingester too, other than distributor and query-frontend. If you've configured `-validation.create-grace-period` then make sure the configuration is applied to ingesters too. #5712 * [CHANGE] The `-validation.create-grace-period` is now enforced for exemplars too in the distributor. If an exemplar has a timestamp greater than "now + grace_period", then the exemplar will be dropped and the metric `cortex_discarded_exemplars_total{reason="exemplar_too_far_in_future",user="..."}` increased. #5761 -* [FEATURE] Introduced `distributor.service_overload_status_code_on_rate_limit_enabled` flag for configuring error code to 529 instead of 429 upon rate limit exhaustion. #5752 +* [FEATURE] Introduced `distributor.service_overload_status_code_on_rate_limit_enabled` flag for configuring status code to 529 instead of 429 upon rate limit exhaustion. #5752 * [FEATURE] Cardinality API: Add a new `count_method` parameter which enables counting active series #5136 * [FEATURE] Query-frontend: added experimental support to cache cardinality, label names and label values query responses. The cache will be used when `-query-frontend.cache-results` is enabled, and `-query-frontend.results-cache-ttl-for-cardinality-query` or `-query-frontend.results-cache-ttl-for-labels-query` set to a value greater than 0. The following metrics have been added to track the query results cache hit ratio per `request_type`: #5212 #5235 #5426 #5524 * `cortex_frontend_query_result_cache_requests_total{request_type="query_range|cardinality|label_names_and_values"}` From 5001120e09fd070d25fa747e131ba70225c44ceb Mon Sep 17 00:00:00 2001 From: ying-jeanne <74549700+ying-jeanne@users.noreply.github.com> Date: Thu, 17 Aug 2023 14:00:54 +0200 Subject: [PATCH 7/8] Update docs/sources/mimir/configure/about-versioning.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Peter Štibraný --- docs/sources/mimir/configure/about-versioning.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sources/mimir/configure/about-versioning.md b/docs/sources/mimir/configure/about-versioning.md index a65490a7d23..d1a247b47e3 100644 --- a/docs/sources/mimir/configure/about-versioning.md +++ b/docs/sources/mimir/configure/about-versioning.md @@ -60,7 +60,7 @@ The following features are currently experimental: - Distributor - Metrics relabeling - OTLP ingestion path - - Configuring error code to 529 instead of 429 upon rate limit exhaustion. + - Using status code 529 instead of 429 upon rate limit exhaustion. - `distributor.service-overload-status-code-on-rate-limit-enabled` - Hash ring - Disabling ring heartbeat timeouts From 31c89abaa8b1b5dca7b5427a6b7eb37fde1a05b4 Mon Sep 17 00:00:00 2001 From: ying-jeanne <74549700+ying-jeanne@users.noreply.github.com> Date: Thu, 17 Aug 2023 14:00:59 +0200 Subject: [PATCH 8/8] Update pkg/distributor/distributor.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Peter Štibraný --- pkg/distributor/distributor.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 2747dafd14a..a88a2622b23 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -78,7 +78,7 @@ const ( // buffers for multiple write requests sent to ingesters will be allocated from single "slab", if there is enough space.
writeRequestSlabPoolSize = 512 * 1024 - // until http.StatusServiceOverload exists + // 529 is a non-standard status code used by some services to signal that "The service is overloaded". statusServiceOverload = 529 )
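
For reference, the following is a minimal, self-contained Go sketch of the behaviour this series introduces; it is not part of the patch and not the actual Mimir implementation. Once the per-tenant request rate limiter has rejected a push, the distributor consults a per-tenant boolean override and answers with the non-standard 529 status code instead of the default 429. The overrides type and rateLimitStatusCode helper below are illustrative stand-ins for validation.Overrides and the logic in limitsMiddleware.

package main

import (
	"fmt"
	"net/http"
)

// Mirrors the constant added in PATCH 8/8: 529 is a non-standard status code
// some services use to signal that the service is overloaded.
const statusServiceOverload = 529

// overrides is an illustrative stand-in for validation.Overrides, keyed by tenant ID.
type overrides struct {
	serviceOverloadEnabled map[string]bool
}

// ServiceOverloadStatusCodeOnRateLimitEnabled reports whether the given tenant
// opted in to receiving 529 instead of 429 on rate limiting.
func (o *overrides) ServiceOverloadStatusCodeOnRateLimitEnabled(userID string) bool {
	return o.serviceOverloadEnabled[userID]
}

// rateLimitStatusCode picks the HTTP status code to return after the request
// rate limiter has already rejected a push request for this tenant.
func rateLimitStatusCode(o *overrides, userID string) int {
	if o.ServiceOverloadStatusCodeOnRateLimitEnabled(userID) {
		return statusServiceOverload // 529: service overloaded, client should back off
	}
	return http.StatusTooManyRequests // 429: default, honoured by Prometheus retry_on_http_429
}

func main() {
	o := &overrides{serviceOverloadEnabled: map[string]bool{"tenant-b": true}}
	fmt.Println(rateLimitStatusCode(o, "tenant-a")) // 429
	fmt.Println(rateLimitStatusCode(o, "tenant-b")) // 529
}

Because the new field lives in the limits block (YAML key service_overload_status_code_on_rate_limit_enabled), it can presumably also be set per tenant through the usual runtime overrides mechanism like the other distributor limits; that is an assumption about the standard limits machinery rather than something the patch states explicitly.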