From cc21b4d9509a53862da1054288f28849d126a3a1 Mon Sep 17 00:00:00 2001
From: Ying WANG <ying.wang@grafana.com>
Date: Thu, 2 Nov 2023 21:46:03 +0100
Subject: [PATCH 1/5] Distributor: also return 529 for ingestion rate limit
 when serviceOverloadErrorEnabled

---
 CHANGELOG.md                 | 1 +
 pkg/distributor/push.go      | 7 +------
 pkg/distributor/push_test.go | 6 ++++++
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 21f5b300214..a3ed460629c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -61,6 +61,7 @@
 * [ENHANCEMENT] Query-frontend: added "queue_time_seconds" field to "query stats" log. This is total time that query and subqueries spent in the queue, before queriers picked it up. #6537
 * [ENHANCEMENT] Server: Add `-server.report-grpc-codes-in-instrumentation-label-enabled` CLI flag to specify whether gRPC status codes should be used in `status_code` label of `cortex_request_duration_seconds` metric. It defaults to false, meaning that successful and erroneous gRPC status codes are represented with `success` and `error` respectively. #6562
 * [ENHANCEMENT] Server: Add `-ingester.client.report-grpc-codes-in-instrumentation-label-enabled` CLI flag to specify whether gRPC status codes should be used in `status_code` label of `cortex_ingester_client_request_duration_seconds` metric. It defaults to false, meaning that successful and erroneous gRPC status codes are represented with `2xx` and `error` respectively. #6562
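+* [BUGFIX] Distributor: return HTTP status code 529 (Service is overloaded) instead of 429 when the ingestion rate limit is exceeded and `-distributor.service-overload-status-code-on-rate-limit-enabled` is enabled. #6549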
 * [BUGFIX] Ring: Ensure network addresses used for component hash rings are formatted correctly when using IPv6. #6068
 * [BUGFIX] Query-scheduler: don't retain connections from queriers that have shut down, leading to gradually increasing enqueue latency over time. #6100 #6145
 * [BUGFIX] Ingester: prevent query logic from continuing to execute after queries are canceled. #6085
diff --git a/pkg/distributor/push.go b/pkg/distributor/push.go
index 171cf049ae7..9a4e7bafff9 100644
--- a/pkg/distributor/push.go
+++ b/pkg/distributor/push.go
@@ -160,12 +160,7 @@ func toHTTPStatus(ctx context.Context, pushErr error, limits *validation.Overrid
 		switch distributorErr.errorCause() {
 		case mimirpb.BAD_DATA:
 			return http.StatusBadRequest
-		case mimirpb.INGESTION_RATE_LIMITED:
-			// Return a 429 here to tell the client it is going too fast.
-			// Client may discard the data or slow down and re-send.
-			// Prometheus v2.26 added a remote-write option 'retry_on_http_429'.
-			return http.StatusTooManyRequests
-		case mimirpb.REQUEST_RATE_LIMITED:
+		case mimirpb.INGESTION_RATE_LIMITED, mimirpb.REQUEST_RATE_LIMITED:
 			serviceOverloadErrorEnabled := false
 			userID, err := tenant.TenantID(ctx)
 			if err == nil {
diff --git a/pkg/distributor/push_test.go b/pkg/distributor/push_test.go
index 1c78c74b6f9..469eafea2c5 100644
--- a/pkg/distributor/push_test.go
+++ b/pkg/distributor/push_test.go
@@ -913,6 +913,12 @@ func TestHandler_ToHTTPStatus(t *testing.T) {
 			expectedHTTPStatus: http.StatusTooManyRequests,
 			expectedErrorMsg:   ingestionRateLimitedErr.Error(),
 		},
+		"an ingestionRateLimitedError with serviceOverloadErrorEnabled gets translated into an HTTP 529": {
+			err:                         ingestionRateLimitedErr,
+			serviceOverloadErrorEnabled: true,
+			expectedHTTPStatus:          StatusServiceOverloaded,
+			expectedErrorMsg:            ingestionRateLimitedErr.Error(),
+		},
 		"a DoNotLogError of an ingestionRateLimitedError gets translated into an HTTP 429": {
 			err:                middleware.DoNotLogError{Err: ingestionRateLimitedErr},
 			expectedHTTPStatus: http.StatusTooManyRequests,

From ed3c3a9404ca08d0513dbc6642618d76a3606c98 Mon Sep 17 00:00:00 2001
From: Ying WANG <ying.wang@grafana.com>
Date: Fri, 10 Nov 2023 00:24:39 +0100
Subject: [PATCH 2/5] update the document

---
 cmd/mimir/config-descriptor.json                              | 2 +-
 cmd/mimir/help-all.txt.tmpl                                   | 2 +-
 .../mimir/references/configuration-parameters/index.md        | 4 +++-
 pkg/util/validation/limits.go                                 | 2 +-
 4 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json
index fada4570e2e..26dd6f92c0a 100644
--- a/cmd/mimir/config-descriptor.json
+++ b/cmd/mimir/config-descriptor.json
@@ -3208,7 +3208,7 @@
           "kind": "field",
           "name": "service_overload_status_code_on_rate_limit_enabled",
           "required": false,
-          "desc": "If enabled, rate limit errors will be reported to the client with HTTP status code 529 (Service is overloaded). If disabled, status code 429 (Too Many Requests) is used.",
+          "desc": "If enabled, rate limit errors will be reported to the client with HTTP status code 529 (Service is overloaded). If disabled, status code 429 (Too Many Requests) is used. Enabling distributor.retry.enabled before utilizing this option is strongly recommended as it helps prevent premature request retries by the client.",
           "fieldValue": null,
           "fieldDefaultValue": false,
           "fieldFlag": "distributor.service-overload-status-code-on-rate-limit-enabled",
diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl
index c000fecd9c3..9a3f8c0f51f 100644
--- a/cmd/mimir/help-all.txt.tmpl
+++ b/cmd/mimir/help-all.txt.tmpl
@@ -1212,7 +1212,7 @@ Usage of ./cmd/mimir/mimir:
   -distributor.ring.store string
     	Backend storage to use for the ring. Supported values are: consul, etcd, inmemory, memberlist, multi. (default "memberlist")
   -distributor.service-overload-status-code-on-rate-limit-enabled
-    	[experimental] If enabled, rate limit errors will be reported to the client with HTTP status code 529 (Service is overloaded). If disabled, status code 429 (Too Many Requests) is used.
+    	[experimental] If enabled, rate limit errors will be reported to the client with HTTP status code 529 (Service is overloaded). If disabled, status code 429 (Too Many Requests) is used. Enabling distributor.retry.enabled before utilizing this option is strongly recommended as it helps prevent premature request retries by the client.
   -distributor.write-requests-buffer-pooling-enabled
     	[experimental] Enable pooling of buffers used for marshaling write requests.
   -enable-go-runtime-metrics
diff --git a/docs/sources/mimir/references/configuration-parameters/index.md b/docs/sources/mimir/references/configuration-parameters/index.md
index ee062c1174f..2cf16f5b144 100644
--- a/docs/sources/mimir/references/configuration-parameters/index.md
+++ b/docs/sources/mimir/references/configuration-parameters/index.md
@@ -2915,7 +2915,9 @@ The `limits` block configures default and per-tenant limits imposed by component
 
 # (experimental) If enabled, rate limit errors will be reported to the client
 # with HTTP status code 529 (Service is overloaded). If disabled, status code
-# 429 (Too Many Requests) is used.
+# 429 (Too Many Requests) is used. Enabling distributor.retry.enabled before
+# utilizing this option is strongly recommended as it helps prevent premature
+# request retries by the client.
 # CLI flag: -distributor.service-overload-status-code-on-rate-limit-enabled
 [service_overload_status_code_on_rate_limit_enabled: <boolean> | default = false]
 
diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go
index 96a664d49f9..f5181cc5e29 100644
--- a/pkg/util/validation/limits.go
+++ b/pkg/util/validation/limits.go
@@ -205,7 +205,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) {
 	_ = l.CreationGracePeriod.Set("10m")
 	f.Var(&l.CreationGracePeriod, CreationGracePeriodFlag, "Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + grace_period)'. This configuration is enforced in the distributor, ingester and query-frontend (to avoid querying too far into the future).")
 	f.BoolVar(&l.EnforceMetadataMetricName, "validation.enforce-metadata-metric-name", true, "Enforce every metadata has a metric name.")
-	f.BoolVar(&l.ServiceOverloadStatusCodeOnRateLimitEnabled, "distributor.service-overload-status-code-on-rate-limit-enabled", false, "If enabled, rate limit errors will be reported to the client with HTTP status code 529 (Service is overloaded). If disabled, status code 429 (Too Many Requests) is used.")
+	f.BoolVar(&l.ServiceOverloadStatusCodeOnRateLimitEnabled, "distributor.service-overload-status-code-on-rate-limit-enabled", false, "If enabled, rate limit errors will be reported to the client with HTTP status code 529 (Service is overloaded). If disabled, status code 429 (Too Many Requests) is used. Enabling distributor.retry.enabled before utilizing this option is strongly recommended as it helps prevent premature request retries by the client.")
 
 	f.IntVar(&l.MaxGlobalSeriesPerUser, MaxSeriesPerUserFlag, 150000, "The maximum number of in-memory series per tenant, across the cluster before replication. 0 to disable.")
 	f.IntVar(&l.MaxGlobalSeriesPerMetric, MaxSeriesPerMetricFlag, 0, "The maximum number of in-memory series per metric name, across the cluster before replication. 0 to disable.")

From b6c6a4a9b8babfb937d692c4e7963e0d435e2f07 Mon Sep 17 00:00:00 2001
From: Ying WANG <ying.wang@grafana.com>
Date: Tue, 14 Nov 2023 05:14:10 +0100
Subject: [PATCH 3/5] update doc

---
 .../mimir/references/configuration-parameters/index.md     | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/sources/mimir/references/configuration-parameters/index.md b/docs/sources/mimir/references/configuration-parameters/index.md
index 027a9da028c..1d0fef62480 100644
--- a/docs/sources/mimir/references/configuration-parameters/index.md
+++ b/docs/sources/mimir/references/configuration-parameters/index.md
@@ -2920,9 +2920,10 @@ The `limits` block configures default and per-tenant limits imposed by component
 
 # (experimental) If enabled, rate limit errors will be reported to the client
 # with HTTP status code 529 (Service is overloaded). If disabled, status code
-# 429 (Too Many Requests) is used. Enabling distributor.retry.enabled before
-# utilizing this option is strongly recommended as it helps prevent premature
-# request retries by the client.
+# 429 (Too Many Requests) is used. Enabling
+# distributor.retry-after-header.enabled before utilizing this option is
+# strongly recommended as it helps prevent premature request retries by the
+# client.
 # CLI flag: -distributor.service-overload-status-code-on-rate-limit-enabled
 [service_overload_status_code_on_rate_limit_enabled: <boolean> | default = false]
 

From 8aab037b8312c44e049f5e28ebdd440e8ea89c26 Mon Sep 17 00:00:00 2001
From: Ying WANG <74549700+ying-jeanne@users.noreply.github.com>
Date: Tue, 14 Nov 2023 14:36:08 +0100
Subject: [PATCH 4/5] Update pkg/util/validation/limits.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Peter Štibraný <pstibrany@gmail.com>
---
 pkg/util/validation/limits.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go
index e61fa3c4e78..bfd13db047f 100644
--- a/pkg/util/validation/limits.go
+++ b/pkg/util/validation/limits.go
@@ -205,7 +205,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) {
 	_ = l.CreationGracePeriod.Set("10m")
 	f.Var(&l.CreationGracePeriod, CreationGracePeriodFlag, "Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + grace_period)'. This configuration is enforced in the distributor, ingester and query-frontend (to avoid querying too far into the future).")
 	f.BoolVar(&l.EnforceMetadataMetricName, "validation.enforce-metadata-metric-name", true, "Enforce every metadata has a metric name.")
-	f.BoolVar(&l.ServiceOverloadStatusCodeOnRateLimitEnabled, "distributor.service-overload-status-code-on-rate-limit-enabled", false, "If enabled, rate limit errors will be reported to the client with HTTP status code 529 (Service is overloaded). If disabled, status code 429 (Too Many Requests) is used. Enabling distributor.retry-after-header.enabled before utilizing this option is strongly recommended as it helps prevent premature request retries by the client.")
+	f.BoolVar(&l.ServiceOverloadStatusCodeOnRateLimitEnabled, "distributor.service-overload-status-code-on-rate-limit-enabled", false, "If enabled, rate limit errors will be reported to the client with HTTP status code 529 (Service is overloaded). If disabled, status code 429 (Too Many Requests) is used. Enabling -distributor.retry-after-header.enabled before utilizing this option is strongly recommended as it helps prevent premature request retries by the client.")
 
 	f.IntVar(&l.MaxGlobalSeriesPerUser, MaxSeriesPerUserFlag, 150000, "The maximum number of in-memory series per tenant, across the cluster before replication. 0 to disable.")
 	f.IntVar(&l.MaxGlobalSeriesPerMetric, MaxSeriesPerMetricFlag, 0, "The maximum number of in-memory series per metric name, across the cluster before replication. 0 to disable.")

From 045ffd8d4d95a1d7886ad2a09867e7bb1fe8b0bd Mon Sep 17 00:00:00 2001
From: Ying WANG <ying.wang@grafana.com>
Date: Tue, 14 Nov 2023 15:12:24 +0100
Subject: [PATCH 5/5] update docs

---
 cmd/mimir/config-descriptor.json                                | 2 +-
 cmd/mimir/help-all.txt.tmpl                                     | 2 +-
 docs/sources/mimir/references/configuration-parameters/index.md | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json
index f6d6529dc1d..4e058bff79c 100644
--- a/cmd/mimir/config-descriptor.json
+++ b/cmd/mimir/config-descriptor.json
@@ -3218,7 +3218,7 @@
           "kind": "field",
           "name": "service_overload_status_code_on_rate_limit_enabled",
           "required": false,
-          "desc": "If enabled, rate limit errors will be reported to the client with HTTP status code 529 (Service is overloaded). If disabled, status code 429 (Too Many Requests) is used. Enabling distributor.retry-after-header.enabled before utilizing this option is strongly recommended as it helps prevent premature request retries by the client.",
+          "desc": "If enabled, rate limit errors will be reported to the client with HTTP status code 529 (Service is overloaded). If disabled, status code 429 (Too Many Requests) is used. Enabling -distributor.retry-after-header.enabled before utilizing this option is strongly recommended as it helps prevent premature request retries by the client.",
           "fieldValue": null,
           "fieldDefaultValue": false,
           "fieldFlag": "distributor.service-overload-status-code-on-rate-limit-enabled",
diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl
index c81cd30bd3f..4028fefe93d 100644
--- a/cmd/mimir/help-all.txt.tmpl
+++ b/cmd/mimir/help-all.txt.tmpl
@@ -1212,7 +1212,7 @@ Usage of ./cmd/mimir/mimir:
   -distributor.ring.store string
     	Backend storage to use for the ring. Supported values are: consul, etcd, inmemory, memberlist, multi. (default "memberlist")
   -distributor.service-overload-status-code-on-rate-limit-enabled
-    	[experimental] If enabled, rate limit errors will be reported to the client with HTTP status code 529 (Service is overloaded). If disabled, status code 429 (Too Many Requests) is used. Enabling distributor.retry-after-header.enabled before utilizing this option is strongly recommended as it helps prevent premature request retries by the client.
+    	[experimental] If enabled, rate limit errors will be reported to the client with HTTP status code 529 (Service is overloaded). If disabled, status code 429 (Too Many Requests) is used. Enabling -distributor.retry-after-header.enabled before utilizing this option is strongly recommended as it helps prevent premature request retries by the client.
   -distributor.write-requests-buffer-pooling-enabled
     	[experimental] Enable pooling of buffers used for marshaling write requests.
   -enable-go-runtime-metrics
diff --git a/docs/sources/mimir/references/configuration-parameters/index.md b/docs/sources/mimir/references/configuration-parameters/index.md
index 6bd2ef5d1de..4ce6c6b4e8a 100644
--- a/docs/sources/mimir/references/configuration-parameters/index.md
+++ b/docs/sources/mimir/references/configuration-parameters/index.md
@@ -2920,7 +2920,7 @@ The `limits` block configures default and per-tenant limits imposed by component
 # (experimental) If enabled, rate limit errors will be reported to the client
 # with HTTP status code 529 (Service is overloaded). If disabled, status code
 # 429 (Too Many Requests) is used. Enabling
-# distributor.retry-after-header.enabled before utilizing this option is
+# -distributor.retry-after-header.enabled before utilizing this option is
 # strongly recommended as it helps prevent premature request retries by the
 # client.
 # CLI flag: -distributor.service-overload-status-code-on-rate-limit-enabled