From 97c4104148dba60a1b2755d56e349999ffe71dfa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Fri, 22 Mar 2024 18:07:29 +0100
Subject: [PATCH 01/19] Add IngesterFailedToReadRecordsFromKafka and
 IngesterKafkaFetchErrorsRateTooHigh alerts.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
---
 .../alerts.yaml                               | 24 ++++++++++++
 operations/mimir-mixin-compiled/alerts.yaml   | 24 ++++++++++++
 .../alerts/ingest-storage.libsonnet           | 37 +++++++++++++++++++
 pkg/storage/ingest/reader.go                  | 12 ++++--
 pkg/storage/ingest/reader_test.go             | 35 +++++++++++++++++-
 5 files changed, 126 insertions(+), 6 deletions(-)

diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
index da9d6919d7b..ca400150edd 100644
--- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml
+++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
@@ -953,6 +953,30 @@ groups:
     for: 15m
     labels:
       severity: critical
+  - alert: MimirIngesterFailedToReadRecordsFromKafka
+    annotations:
+      message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
+        }} is failing to read records from Kafka.
+      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailedtoreadrecordsfromkafka
+    expr: |
+      sum by(cluster, namespace, instance, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m]))
+      > 0
+    for: 5m
+    labels:
+      severity: critical
+  - alert: MimirIngesterKafkaFetchErrorsRateTooHigh
+    annotations:
+      message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
+        }} is receiving fetch errors when reading records from Kafka.
+      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh
+    expr: |
+      sum by (cluster, namespace, instance) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m]))
+      /
+      sum by (cluster, namespace, instance) (rate (cortex_ingest_storage_reader_fetches_total[5m]))
+      > 0.1
+    for: 15m
+    labels:
+      severity: critical
 - name: mimir_continuous_test
   rules:
   - alert: MimirContinuousTestNotRunningOnWrites
diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml
index 7599b0d3700..a2cbe41dc92 100644
--- a/operations/mimir-mixin-compiled/alerts.yaml
+++ b/operations/mimir-mixin-compiled/alerts.yaml
@@ -966,6 +966,30 @@ groups:
     for: 15m
     labels:
       severity: critical
+  - alert: MimirIngesterFailedToReadRecordsFromKafka
+    annotations:
+      message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+        }} is failing to read records from Kafka.
+      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailedtoreadrecordsfromkafka
+    expr: |
+      sum by(cluster, namespace, pod, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m]))
+      > 0
+    for: 5m
+    labels:
+      severity: critical
+  - alert: MimirIngesterKafkaFetchErrorsRateTooHigh
+    annotations:
+      message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+        }} is receiving fetch errors when reading records from Kafka.
+      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh
+    expr: |
+      sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m]))
+      /
+      sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetches_total[5m]))
+      > 0.1
+    for: 15m
+    labels:
+      severity: critical
 - name: mimir_continuous_test
   rules:
   - alert: MimirContinuousTestNotRunningOnWrites
diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
index db80497b50f..eb4e826cd91 100644
--- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet
+++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
@@ -19,6 +19,43 @@
             message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s is failing to commit the last consumed offset.' % $._config,
           },
         },
+
+        {
+          alert: $.alertName('IngesterFailedToReadRecordsFromKafka'),
+          'for': '5m',
+
+          // Metric used by this alert is reported by Kafka client on read errors from connection to Kafka.
+          // We use node_id to only alert if problems to the same Kafka node are repeating.
+          // If problems are for different nodes (eg. during rollout), that is not a problem, and we don't need to trigger alert.
+          expr: |||
+            sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m]))
+            > 0
+          ||| % $._config,
+          labels: {
+            severity: 'critical',
+          },
+          annotations: {
+            message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s is failing to read records from Kafka.' % $._config,
+          },
+        },
+
+        {
+          alert: $.alertName('IngesterKafkaFetchErrorsRateTooHigh'),
+          'for': '15m',
+          // See https://github.com/grafana/mimir/blob/24591ae56cd7d6ef24a7cc1541a41405676773f4/vendor/github.com/twmb/franz-go/pkg/kgo/record_and_fetch.go#L332-L366 for errors that can be reported here.
+          expr: |||
+            sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m]))
+            /
+            sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate (cortex_ingest_storage_reader_fetches_total[5m]))
+            > 0.1
+          ||| % $._config,
+          labels: {
+            severity: 'critical',
+          },
+          annotations: {
+            message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s is receiving fetch errors when reading records from Kafka.' % $._config,
+          },
+        },
       ],
     },
   ],
diff --git a/pkg/storage/ingest/reader.go b/pkg/storage/ingest/reader.go
index 1718adec583..42b5864ea01 100644
--- a/pkg/storage/ingest/reader.go
+++ b/pkg/storage/ingest/reader.go
@@ -191,7 +191,7 @@ func (r *PartitionReader) run(ctx context.Context) error {
 func (r *PartitionReader) processNextFetches(ctx context.Context, delayObserver prometheus.Observer) {
 	fetches := r.client.PollFetches(ctx)
 	r.recordFetchesMetrics(fetches, delayObserver)
-	r.logFetchErrs(fetches)
+	r.logFetchErrors(fetches)
 	fetches = filterOutErrFetches(fetches)
 
 	// TODO consumeFetches() may get interrupted in the middle because of ctx canceled due to PartitionReader stopped.
@@ -274,12 +274,16 @@ func isErrFetch(fetch kgo.Fetch) bool {
 	return false
 }
 
-func (r *PartitionReader) logFetchErrs(fetches kgo.Fetches) {
+func (r *PartitionReader) logFetchErrors(fetches kgo.Fetches) {
 	mErr := multierror.New()
-	fetches.EachError(func(s string, i int32, err error) {
+	fetches.EachError(func(topic string, partition int32, err error) {
+		if errors.Is(err, context.Canceled) {
+			return
+		}
+
 		// kgo advises to "restart" the kafka client if the returned error is a kerr.Error.
 		// Recreating the client would cause duplicate metrics registration, so we don't do it for now.
-		mErr.Add(fmt.Errorf("topic %q, partition %d: %w", s, i, err))
+		mErr.Add(fmt.Errorf("topic %q, partition %d: %w", topic, partition, err))
 	})
 	if len(mErr) == 0 {
 		return
diff --git a/pkg/storage/ingest/reader_test.go b/pkg/storage/ingest/reader_test.go
index ee49c76c715..3481dbe5c54 100644
--- a/pkg/storage/ingest/reader_test.go
+++ b/pkg/storage/ingest/reader_test.go
@@ -69,6 +69,37 @@ func TestPartitionReader(t *testing.T) {
 	assert.Equal(t, [][]byte{content, content}, records)
 }
 
+func TestPartitionReader_logFetchErrors(t *testing.T) {
+	const (
+		topicName   = "test"
+		partitionID = 1
+	)
+
+	cfg := defaultReaderTestConfig(t, "", topicName, partitionID, nil)
+	reader, err := newPartitionReader(cfg.kafka, cfg.partitionID, "test-group", cfg.consumer, cfg.logger, cfg.registry)
+	require.NoError(t, err)
+
+	reader.logFetchErrors(kgo.Fetches{
+		kgo.Fetch{Topics: []kgo.FetchTopic{
+			{
+				Topic: topicName,
+				Partitions: []kgo.FetchPartition{
+					{Partition: partitionID, Err: nil},
+					{Partition: partitionID, Err: context.Canceled},                            // not counted in metrics
+					{Partition: partitionID, Err: fmt.Errorf("wrapped: %w", context.Canceled)}, // not counted in metrics
+					{Partition: partitionID, Err: fmt.Errorf("real error")},                    // counted
+				},
+			},
+		}},
+	})
+
+	assert.NoError(t, promtest.GatherAndCompare(cfg.registry, strings.NewReader(`
+			# HELP cortex_ingest_storage_reader_fetch_errors_total The number of fetch errors encountered by the consumer.
+        	# TYPE cortex_ingest_storage_reader_fetch_errors_total counter
+        	cortex_ingest_storage_reader_fetch_errors_total 1
+	`), "cortex_ingest_storage_reader_fetch_errors_total"))
+}
+
 func TestPartitionReader_ConsumerError(t *testing.T) {
 	const (
 		topicName   = "test"
@@ -1114,7 +1145,7 @@ type readerTestCfg struct {
 	kafka          KafkaConfig
 	partitionID    int32
 	consumer       recordConsumer
-	registry       prometheus.Registerer
+	registry       *prometheus.Registry
 	logger         log.Logger
 	commitInterval time.Duration
 }
@@ -1145,7 +1176,7 @@ func withConsumeFromPositionAtStartup(position string) func(cfg *readerTestCfg)
 	}
 }
 
-func withRegistry(reg prometheus.Registerer) func(cfg *readerTestCfg) {
+func withRegistry(reg *prometheus.Registry) func(cfg *readerTestCfg) {
 	return func(cfg *readerTestCfg) {
 		cfg.registry = reg
 	}

From fa844446f5eda7d9bd6450b2f083b572e2900d7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Mon, 25 Mar 2024 12:04:17 +0100
Subject: [PATCH 02/19] Alerts for ingester kafka lag.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
---
 .../alerts.yaml                               | 22 ++++++++++++++
 operations/mimir-mixin-compiled/alerts.yaml   | 22 ++++++++++++++
 .../alerts/ingest-storage.libsonnet           | 29 +++++++++++++++++++
 3 files changed, 73 insertions(+)

diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
index ca400150edd..2a5abaee3c4 100644
--- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml
+++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
@@ -977,6 +977,28 @@ groups:
     for: 15m
     labels:
       severity: critical
+  - alert: MimirStartingIngesterKafkaLagNotDecreasing
+    annotations:
+      message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
+        }} in "starting" phase is not reducing consumption lag of write requests read
+        from Kafka.
+      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkalagnotdecreasing
+    expr: |
+      deriv(histogram_quantile(0.99, sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0
+    for: 5m
+    labels:
+      severity: warning
+  - alert: MimirRunningIngesterKafkaLagTooHigh
+    annotations:
+      message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
+        }} in "running" phase is too far behind in its consumption of write requests
+        from Kafka.
+      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterkafkalagtoohigh
+    expr: |
+      histogram_quantile(0.99, sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[$__rate_interval]))) > (10*60)
+    for: 5m
+    labels:
+      severity: critical
 - name: mimir_continuous_test
   rules:
   - alert: MimirContinuousTestNotRunningOnWrites
diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml
index a2cbe41dc92..e9e67a82718 100644
--- a/operations/mimir-mixin-compiled/alerts.yaml
+++ b/operations/mimir-mixin-compiled/alerts.yaml
@@ -990,6 +990,28 @@ groups:
     for: 15m
     labels:
       severity: critical
+  - alert: MimirStartingIngesterKafkaLagNotDecreasing
+    annotations:
+      message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+        }} in "starting" phase is not reducing consumption lag of write requests read
+        from Kafka.
+      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkalagnotdecreasing
+    expr: |
+      deriv(histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0
+    for: 5m
+    labels:
+      severity: warning
+  - alert: MimirRunningIngesterKafkaLagTooHigh
+    annotations:
+      message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+        }} in "running" phase is too far behind in its consumption of write requests
+        from Kafka.
+      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterkafkalagtoohigh
+    expr: |
+      histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[$__rate_interval]))) > (10*60)
+    for: 5m
+    labels:
+      severity: critical
 - name: mimir_continuous_test
   rules:
   - alert: MimirContinuousTestNotRunningOnWrites
diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
index eb4e826cd91..aba0b5b8644 100644
--- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet
+++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
@@ -56,6 +56,35 @@
             message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s is receiving fetch errors when reading records from Kafka.' % $._config,
           },
         },
+
+        // This is an experiment. We compute derivatition (ie. rate of consumption lag change) over 5 minutes. If derivation is above 0, it means consumption lag is increasing, instead of decreasing.
+        {
+          alert: $.alertName('StartingIngesterKafkaLagNotDecreasing'),
+          'for': '5m',
+          expr: |||
+            deriv(histogram_quantile(0.99, sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0
+          ||| % $._config,
+          labels: {
+            severity: 'warning',
+          },
+          annotations: {
+            message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s in "starting" phase is not reducing consumption lag of write requests read from Kafka.' % $._config,
+          },
+        },
+
+        {
+          alert: $.alertName('RunningIngesterKafkaLagTooHigh'),
+          'for': '5m',
+          expr: |||
+            histogram_quantile(0.99, sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[$__rate_interval]))) > (10*60)
+          ||| % $._config,
+          labels: {
+            severity: 'critical',
+          },
+          annotations: {
+            message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s in "running" phase is too far behind in its consumption of write requests from Kafka.' % $._config,
+          },
+        },
       ],
     },
   ],

From b4134400c1461a63a0948edc55621797375a900e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Mon, 25 Mar 2024 12:08:28 +0100
Subject: [PATCH 03/19] Add alert for failures to consume write requests.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
---
 .../mimir-mixin-compiled-baremetal/alerts.yaml     | 10 ++++++++++
 operations/mimir-mixin-compiled/alerts.yaml        | 10 ++++++++++
 .../mimir-mixin/alerts/ingest-storage.libsonnet    | 14 ++++++++++++++
 3 files changed, 34 insertions(+)

diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
index 2a5abaee3c4..bd045ac16f3 100644
--- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml
+++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
@@ -999,6 +999,16 @@ groups:
     for: 5m
     labels:
       severity: critical
+  - alert: MimirIngesterFailsToProcessRecordsFromKafka
+    annotations:
+      message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
+        }} fails to consume write requests read from Kafka due to internal errors.
+      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka
+    expr: |
+      sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[5m])) > 0
+    for: 5m
+    labels:
+      severity: critical
 - name: mimir_continuous_test
   rules:
   - alert: MimirContinuousTestNotRunningOnWrites
diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml
index e9e67a82718..b9422ed687d 100644
--- a/operations/mimir-mixin-compiled/alerts.yaml
+++ b/operations/mimir-mixin-compiled/alerts.yaml
@@ -1012,6 +1012,16 @@ groups:
     for: 5m
     labels:
       severity: critical
+  - alert: MimirIngesterFailsToProcessRecordsFromKafka
+    annotations:
+      message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+        }} fails to consume write requests read from Kafka due to internal errors.
+      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka
+    expr: |
+      sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[5m])) > 0
+    for: 5m
+    labels:
+      severity: critical
 - name: mimir_continuous_test
   rules:
   - alert: MimirContinuousTestNotRunningOnWrites
diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
index aba0b5b8644..d2df0f9f5a4 100644
--- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet
+++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
@@ -85,6 +85,20 @@
             message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s in "running" phase is too far behind in its consumption of write requests from Kafka.' % $._config,
           },
         },
+
+        {
+          alert: $.alertName('IngesterFailsToProcessRecordsFromKafka'),
+          'for': '5m',
+          expr: |||
+            sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[5m])) > 0
+          ||| % $._config,
+          labels: {
+            severity: 'critical',
+          },
+          annotations: {
+            message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s fails to consume write requests read from Kafka due to internal errors.' % $._config,
+          },
+        },
       ],
     },
   ],

From 124d3aff87947a2ef9e25bccd709eaaaedff82c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Mon, 25 Mar 2024 12:26:20 +0100
Subject: [PATCH 04/19] Add alert for failures to enforce strong consistency.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
---
 .../mimir-mixin-compiled-baremetal/alerts.yaml     | 10 ++++++++++
 operations/mimir-mixin-compiled/alerts.yaml        | 10 ++++++++++
 .../mimir-mixin/alerts/ingest-storage.libsonnet    | 14 ++++++++++++++
 3 files changed, 34 insertions(+)

diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
index bd045ac16f3..c15f7529b44 100644
--- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml
+++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
@@ -1009,6 +1009,16 @@ groups:
     for: 5m
     labels:
       severity: critical
+  - alert: MimirIngesterFailsEnforceStrongConsistencyOnReadPath
+    annotations:
+      message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
+        }} fails to enforce strong-consistency on read-path.
+      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath
+    expr: |
+      sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_strong_consistency_failures_total[5m])) > 0
+    for: 5m
+    labels:
+      severity: critical
 - name: mimir_continuous_test
   rules:
   - alert: MimirContinuousTestNotRunningOnWrites
diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml
index b9422ed687d..2acea51bfa1 100644
--- a/operations/mimir-mixin-compiled/alerts.yaml
+++ b/operations/mimir-mixin-compiled/alerts.yaml
@@ -1022,6 +1022,16 @@ groups:
     for: 5m
     labels:
       severity: critical
+  - alert: MimirIngesterFailsEnforceStrongConsistencyOnReadPath
+    annotations:
+      message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+        }} fails to enforce strong-consistency on read-path.
+      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath
+    expr: |
+      sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_strong_consistency_failures_total[5m])) > 0
+    for: 5m
+    labels:
+      severity: critical
 - name: mimir_continuous_test
   rules:
   - alert: MimirContinuousTestNotRunningOnWrites
diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
index d2df0f9f5a4..698b4df1241 100644
--- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet
+++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
@@ -99,6 +99,20 @@
             message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s fails to consume write requests read from Kafka due to internal errors.' % $._config,
           },
         },
+
+        {
+          alert: $.alertName('IngesterFailsEnforceStrongConsistencyOnReadPath'),
+          'for': '5m',
+          expr: |||
+            sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_strong_consistency_failures_total[5m])) > 0
+          ||| % $._config,
+          labels: {
+            severity: 'critical',
+          },
+          annotations: {
+            message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s fails to enforce strong-consistency on read-path.' % $._config,
+          },
+        },
       ],
     },
   ],

From 64ca93fec39d778d2036afd16833a2d9231e58f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Mon, 25 Mar 2024 12:45:48 +0100
Subject: [PATCH 05/19] Runbooks.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
---
 .../mimir/manage/mimir-runbooks/_index.md     | 46 +++++++++++++++++++
 .../alerts.yaml                               |  8 ++--
 operations/mimir-mixin-compiled/alerts.yaml   |  8 ++--
 .../alerts/ingest-storage.libsonnet           |  4 +-
 4 files changed, 56 insertions(+), 10 deletions(-)

diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md
index 294d3e98ee1..60d83fdf68b 100644
--- a/docs/sources/mimir/manage/mimir-runbooks/_index.md
+++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md
@@ -1327,6 +1327,52 @@ How to **investigate**:
 - Check ingester logs to find details about the error.
 - Check Kafka logs and health.
 
+### MimirIngesterFailedToReadRecordsFromKafka
+
+This alert fires when an ingester is failing to read records from Kafka backend.
+
+How it **works**:
+
+- Ingester connects to Kafka brokers and reads records from them.
+- When ingester fails to read more records from Kafka due to an error, ingester logs the error.
+- This can be normal if Kafka brokers are restarting; however, if read errors continue for some time, the alert is raised.
+
+How to **investigate**:
+
+- Check ingester logs to find details about the error.
+- Check Kafka logs and health.
+
+### MimirIngesterKafkaFetchErrorsRateTooHigh
+
+This alert fires when an ingester is receiving errors instead of "fetches" from Kafka.
+
+How it **works**:
+
+- Ingester uses Kafka client to read records from Kafka.
+- Kafka client can return errors instead of more records.
+- If rate of returned errors compared to returned records is too high, alert is raised.
+- Kafka client can return errors [documented in the source code](https://github.com/grafana/mimir/blob/24591ae56cd7d6ef24a7cc1541a41405676773f4/vendor/github.com/twmb/franz-go/pkg/kgo/record_and_fetch.go#L332-L366).
+
+How to **investigate**:
+
+- Check ingester logs to find details about the error.
+- Check Kafka logs and health.
+
+### MimirStartingIngesterKafkaReceiveDelayIncreasing
+
+This alert fires when consumption lag reported by ingester during "starting" phase is not decreasing.
+
+How it **works**:
+
+- When ingester is starting, it needs to fetch and process records from Kafka until preconfigured consumption lag is honored.
+- Each record has a timestamp when it was stored to Kafka. When ingester reads the record, it computes "receive delay" as a difference between current time (when record was read) and time when record was stored to Kafka. This receive delay is reported in metrics.
+- Under normal conditions when ingester is processing records faster than records are appearing, receive delay should be decreasing.
+- When ingester is starting, and observed "receive delay" is increasing, alert is raised.
+
+How to **investigate**:
+
+- Check if ingester is fast enough to process all data in Kafka. If not, configure ingesters to start with later offset instead.
+
 ## Errors catalog
 
 Mimir has some codified error IDs that you might see in HTTP responses or logs.
diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
index c15f7529b44..f365206028d 100644
--- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml
+++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
@@ -977,23 +977,23 @@ groups:
     for: 15m
     labels:
       severity: critical
-  - alert: MimirStartingIngesterKafkaLagNotDecreasing
+  - alert: MimirStartingIngesterKafkaReceiveDelayIncreasing
     annotations:
       message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
         }} in "starting" phase is not reducing consumption lag of write requests read
         from Kafka.
-      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkalagnotdecreasing
+      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing
     expr: |
       deriv(histogram_quantile(0.99, sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0
     for: 5m
     labels:
       severity: warning
-  - alert: MimirRunningIngesterKafkaLagTooHigh
+  - alert: MimirRunningIngesterReceiveDelayTooHigh
     annotations:
       message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
         }} in "running" phase is too far behind in its consumption of write requests
         from Kafka.
-      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterkafkalagtoohigh
+      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
     expr: |
       histogram_quantile(0.99, sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[$__rate_interval]))) > (10*60)
     for: 5m
diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml
index 2acea51bfa1..7ed12bb6a68 100644
--- a/operations/mimir-mixin-compiled/alerts.yaml
+++ b/operations/mimir-mixin-compiled/alerts.yaml
@@ -990,23 +990,23 @@ groups:
     for: 15m
     labels:
       severity: critical
-  - alert: MimirStartingIngesterKafkaLagNotDecreasing
+  - alert: MimirStartingIngesterKafkaReceiveDelayIncreasing
     annotations:
       message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
         }} in "starting" phase is not reducing consumption lag of write requests read
         from Kafka.
-      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkalagnotdecreasing
+      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing
     expr: |
       deriv(histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0
     for: 5m
     labels:
       severity: warning
-  - alert: MimirRunningIngesterKafkaLagTooHigh
+  - alert: MimirRunningIngesterReceiveDelayTooHigh
     annotations:
       message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
         }} in "running" phase is too far behind in its consumption of write requests
         from Kafka.
-      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterkafkalagtoohigh
+      runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
     expr: |
       histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[$__rate_interval]))) > (10*60)
     for: 5m
diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
index 698b4df1241..15a7de8bc8a 100644
--- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet
+++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
@@ -59,7 +59,7 @@
 
         // This is an experiment. We compute derivatition (ie. rate of consumption lag change) over 5 minutes. If derivation is above 0, it means consumption lag is increasing, instead of decreasing.
         {
-          alert: $.alertName('StartingIngesterKafkaLagNotDecreasing'),
+          alert: $.alertName('StartingIngesterKafkaReceiveDelayIncreasing'),
           'for': '5m',
           expr: |||
             deriv(histogram_quantile(0.99, sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0
@@ -73,7 +73,7 @@
         },
 
         {
-          alert: $.alertName('RunningIngesterKafkaLagTooHigh'),
+          alert: $.alertName('RunningIngesterReceiveDelayTooHigh'),
           'for': '5m',
           expr: |||
             histogram_quantile(0.99, sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[$__rate_interval]))) > (10*60)

From 0d3eab3736ed960d028ff748e3e1e9ae6b07f5ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Mon, 25 Mar 2024 13:11:14 +0100
Subject: [PATCH 06/19] Runbooks.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
---
 .../mimir/manage/mimir-runbooks/_index.md     | 48 ++++++++++++++++++-
 .../alerts.yaml                               |  4 +-
 operations/mimir-mixin-compiled/alerts.yaml   |  4 +-
 .../alerts/ingest-storage.libsonnet           |  4 +-
 4 files changed, 53 insertions(+), 7 deletions(-)

diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md
index 60d83fdf68b..92184579837 100644
--- a/docs/sources/mimir/manage/mimir-runbooks/_index.md
+++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md
@@ -1360,7 +1360,7 @@ How to **investigate**:
 
 ### MimirStartingIngesterKafkaReceiveDelayIncreasing
 
-This alert fires when consumption lag reported by ingester during "starting" phase is not decreasing.
+This alert fires when "receive delay" reported by ingester during "starting" phase is not decreasing.
 
 How it **works**:
 
@@ -1373,6 +1373,52 @@ How to **investigate**:
 
 - Check if ingester is fast enough to process all data in Kafka. If not, configure ingesters to start with later offset instead.
 
+### MimirRunningIngesterReceiveDelayTooHigh
+
+This alert fires when "receive delay" reported by ingester while it's running reaches alert threshold.
+
+How it **works**:
+
+- After ingester start and catches up with records in Kafka, ingester switches to "running" mode. 
+- In running mode, ingester continues to process incoming samples from Kafka and continues to report "receive delay". See [`MimirStartingIngesterKafkaReceiveDelayIncreasing`](#MimirStartingIngesterKafkaReceiveDelayIncreasing) runbook for details about this metric.
+- Under normal conditions when ingester is running and it is processing records faster than records are appearing, receive delay should be stable.
+- If observed "receive delay" increases and reaches certain threshold, alert is raised.
+
+How to **investigate**:
+
+- Check if ingester is fast enough to process all data in Kafka.
+- If ingesters are too slow, consider scaling ingesters, either vertically (to make ingesters faster), or horizontally to spread incoming series between more ingesters.
+
+### MimirIngesterFailsToProcessRecordsFromKafka
+
+This alert fires when ingester is unable to process incoming records from Kafka due to internal errors. If ingest-storage wasn't used, such push requests would end up with 5xx errors.
+
+How it **works**:
+
+- Ingester reads records from Kafka, and processes them locally. Processing means unmarshalling the data and handling write requests stored in records.
+- Write requests can fail due to "user" or "server" errors. Typical user error is too low limit for number of series. Server error can be for example ingester hitting an instance limit.
+- If requests keep failing due to server errors, this alert is raised.
+
+How to **investigate**:
+
+- Check ingester logs to see why requests are failing, and troubleshoot based on that.
+
+### MimirIngesterFailsEnforceStrongConsistencyOnReadPath
+
+This alert fires when too many read-requests with strong consistency are failing.
+
+How it **works**:
+
+- When read request asks for strong-consistency guarantee, ingester will read the last produced offset from Kafka, and wait until record with this offset is consumed.
+- If read request times out during this wait, that is considered to be a failure of request with strong-consistency.
+- If requests keep failing due to failure to enforce strong-consistency, this alert is raised.
+
+How to **investigate**:
+
+- Check wait latency of requests with strong-consistency.
+- Check if ingester needs to process too many records, and whether ingesters need to be scaled up (vertically or horizontally).
+- Consider increasing read-timeout of requests.
+
 ## Errors catalog
 
 Mimir has some codified error IDs that you might see in HTTP responses or logs.
diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
index f365206028d..bd32e2f844e 100644
--- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml
+++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
@@ -1005,7 +1005,7 @@ groups:
         }} fails to consume write requests read from Kafka due to internal errors.
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka
     expr: |
-      sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[5m]) > 0
+      sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m]) > 0
     for: 5m
     labels:
       severity: critical
@@ -1015,7 +1015,7 @@ groups:
         }} fails to enforce strong-consistency on read-path.
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath
     expr: |
-      sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_strong_consistency_failures_total[5m])) > 0
+      sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0
     for: 5m
     labels:
       severity: critical
diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml
index 7ed12bb6a68..97402bd970a 100644
--- a/operations/mimir-mixin-compiled/alerts.yaml
+++ b/operations/mimir-mixin-compiled/alerts.yaml
@@ -1018,7 +1018,7 @@ groups:
         }} fails to consume write requests read from Kafka due to internal errors.
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka
     expr: |
-      sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[5m]) > 0
+      sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m]) > 0
     for: 5m
     labels:
       severity: critical
@@ -1028,7 +1028,7 @@ groups:
         }} fails to enforce strong-consistency on read-path.
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath
     expr: |
-      sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_strong_consistency_failures_total[5m])) > 0
+      sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0
     for: 5m
     labels:
       severity: critical
diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
index 15a7de8bc8a..fd915ee07bf 100644
--- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet
+++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
@@ -90,7 +90,7 @@
           alert: $.alertName('IngesterFailsToProcessRecordsFromKafka'),
           'for': '5m',
           expr: |||
-            sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[5m]) > 0
+            sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m]) > 0
           ||| % $._config,
           labels: {
             severity: 'critical',
@@ -104,7 +104,7 @@
           alert: $.alertName('IngesterFailsEnforceStrongConsistencyOnReadPath'),
           'for': '5m',
           expr: |||
-            sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_strong_consistency_failures_total[5m])) > 0
+            sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0
           ||| % $._config,
           labels: {
             severity: 'critical',

From 9a6b3e60b7926ce63e5f71d49108cf9443c0ab35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Fri, 22 Mar 2024 16:03:48 +0100
Subject: [PATCH 07/19] Mention ingest-storage cases.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
---
 docs/sources/mimir/manage/mimir-runbooks/_index.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md
index 92184579837..fd98180d95b 100644
--- a/docs/sources/mimir/manage/mimir-runbooks/_index.md
+++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md
@@ -190,7 +190,7 @@ How to **investigate**:
 
 - Check the `Mimir / Writes` dashboard
   - Looking at the dashboard you should see in which Mimir service the high latency originates
-  - The panels in the dashboard are vertically sorted by the network path (eg. gateway -> distributor -> ingester)
+  - The panels in the dashboard are vertically sorted by the network path (eg. gateway -> distributor -> ingester). When using ingest-storage, network path changes to gateway -> distributor -> Kafka instead. 
 - Deduce where in the stack the latency is being introduced
   - **`gateway`**
     - Latency may be caused by the time taken for the gateway to receive the entire request from the client. There are a multitude of reasons this can occur, so communication with the user may be necessary. For example:
@@ -201,6 +201,7 @@ How to **investigate**:
     - There could be a problem with authentication (eg. slow to run auth layer)
   - **`distributor`**
     - Typically, distributor p99 latency is in the range 50-100ms. If the distributor latency is higher than this, you may need to scale up the distributors.
+    - When using Mimir ingest-storage, distributors are writing requests to Kafka-compatible backend. Increased latency in distributor may also come from this backend.
   - **`ingester`**
     - Typically, ingester p99 latency is in the range 5-50ms. If the ingester latency is higher than this, you should investigate the root cause before scaling up ingesters.
     - Check out the following alerts and fix them if firing:
@@ -243,6 +244,9 @@ How to **investigate**:
       - If queries are not waiting in queue
         - Consider [enabling query sharding]({{< relref "../../references/architecture/query-sharding#how-to-enable-query-sharding" >}}) if not already enabled, to increase query parallelism
         - If query sharding already enabled, consider increasing total number of query shards (`query_sharding_total_shards`) for tenants submitting slow queries, so their queries can be further parallelized
+  - **`ingester`**
+    - Check if ingesters are not overloaded. If they are and you can scale up ingesters vertically, that may be the best action. If that's not possible, scaling horizontally can help as well, but it can take several hours for ingesters to fully redistribute their series.
+    - When using ingest-storage, check ratio of queries using strong-consistency, and latency of queries using strong-consistency.
 
 #### Alertmanager
 
@@ -278,6 +282,7 @@ How to **investigate**:
 - If the failing service is crashing / panicking: look for the stack trace in the logs and investigate from there
   - If crashing service is query-frontend, querier or store-gateway, and you have "activity tracker" feature enabled, look for `found unfinished activities from previous run` message and subsequent `activity` messages in the log file to see which queries caused the crash.
 - When using Memberlist as KV store for hash rings, ensure that Memberlist is working correctly. See instructions for the [`MimirGossipMembersTooHigh`](#MimirGossipMembersTooHigh) and [`MimirGossipMembersTooLow`](#MimirGossipMembersTooLow) alerts.
+- When using ingest-storage and distributors are failing to write requests to Kafka, make sure that Kafka is up and running correctly.
 
 #### Alertmanager
 

From 976a1627ec51b4a1f77e5fa0564126db57b598ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Mon, 25 Mar 2024 13:16:56 +0100
Subject: [PATCH 08/19] Fix query.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
---
 operations/mimir-mixin-compiled-baremetal/alerts.yaml  | 2 +-
 operations/mimir-mixin-compiled/alerts.yaml            | 2 +-
 operations/mimir-mixin/alerts/ingest-storage.libsonnet | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
index bd32e2f844e..bdf3b2e12b8 100644
--- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml
+++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
@@ -970,7 +970,7 @@ groups:
         }} is receiving fetch errors when reading records from Kafka.
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh
     expr: |
-      sum by (cluster, namespace, instance) (rate (cortex_ingest_storage_reader_fetch_errors_total}[5m]))
+      sum by (cluster, namespace, instance) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m]))
       /
       sum by (cluster, namespace, instance) (rate (cortex_ingest_storage_reader_fetches_total[5m]))
       > 0.1
diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml
index 97402bd970a..e181079a9b4 100644
--- a/operations/mimir-mixin-compiled/alerts.yaml
+++ b/operations/mimir-mixin-compiled/alerts.yaml
@@ -983,7 +983,7 @@ groups:
         }} is receiving fetch errors when reading records from Kafka.
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh
     expr: |
-      sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetch_errors_total}[5m]))
+      sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m]))
       /
       sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetches_total[5m]))
       > 0.1
diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
index fd915ee07bf..26d0d7b6e5d 100644
--- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet
+++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
@@ -44,7 +44,7 @@
           'for': '15m',
           // See https://github.com/grafana/mimir/blob/24591ae56cd7d6ef24a7cc1541a41405676773f4/vendor/github.com/twmb/franz-go/pkg/kgo/record_and_fetch.go#L332-L366 for errors that can be reported here.
           expr: |||
-            sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate (cortex_ingest_storage_reader_fetch_errors_total}[5m]))
+            sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m]))
             /
             sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate (cortex_ingest_storage_reader_fetches_total[5m]))
             > 0.1

From 24cd11c4a7e4105f65fe08c176e1ad0a58a7a0ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Mon, 25 Mar 2024 13:19:13 +0100
Subject: [PATCH 09/19] Add links to ingest storage.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
---
 docs/sources/mimir/manage/mimir-runbooks/_index.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md
index fd98180d95b..6542bd8842c 100644
--- a/docs/sources/mimir/manage/mimir-runbooks/_index.md
+++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md
@@ -190,7 +190,7 @@ How to **investigate**:
 
 - Check the `Mimir / Writes` dashboard
   - Looking at the dashboard you should see in which Mimir service the high latency originates
-  - The panels in the dashboard are vertically sorted by the network path (eg. gateway -> distributor -> ingester). When using ingest-storage, network path changes to gateway -> distributor -> Kafka instead. 
+  - The panels in the dashboard are vertically sorted by the network path (eg. gateway -> distributor -> ingester). When using [ingest-storage](#mimir-ingest-storage-experimental), network path changes to gateway -> distributor -> Kafka instead. 
 - Deduce where in the stack the latency is being introduced
   - **`gateway`**
     - Latency may be caused by the time taken for the gateway to receive the entire request from the client. There are a multitude of reasons this can occur, so communication with the user may be necessary. For example:
@@ -201,7 +201,7 @@ How to **investigate**:
     - There could be a problem with authentication (eg. slow to run auth layer)
   - **`distributor`**
     - Typically, distributor p99 latency is in the range 50-100ms. If the distributor latency is higher than this, you may need to scale up the distributors.
-    - When using Mimir ingest-storage, distributors are writing requests to Kafka-compatible backend. Increased latency in distributor may also come from this backend.
+    - When using Mimir [ingest-storage](#mimir-ingest-storage-experimental), distributors are writing requests to Kafka-compatible backend. Increased latency in distributor may also come from this backend.
   - **`ingester`**
     - Typically, ingester p99 latency is in the range 5-50ms. If the ingester latency is higher than this, you should investigate the root cause before scaling up ingesters.
     - Check out the following alerts and fix them if firing:
@@ -246,7 +246,7 @@ How to **investigate**:
         - If query sharding already enabled, consider increasing total number of query shards (`query_sharding_total_shards`) for tenants submitting slow queries, so their queries can be further parallelized
   - **`ingester`**
     - Check if ingesters are not overloaded. If they are and you can scale up ingesters vertically, that may be the best action. If that's not possible, scaling horizontally can help as well, but it can take several hours for ingesters to fully redistribute their series.
-    - When using ingest-storage, check ratio of queries using strong-consistency, and latency of queries using strong-consistency.
+    - When using [ingest-storage](#mimir-ingest-storage-experimental), check ratio of queries using strong-consistency, and latency of queries using strong-consistency.
 
 #### Alertmanager
 
@@ -282,7 +282,7 @@ How to **investigate**:
 - If the failing service is crashing / panicking: look for the stack trace in the logs and investigate from there
   - If crashing service is query-frontend, querier or store-gateway, and you have "activity tracker" feature enabled, look for `found unfinished activities from previous run` message and subsequent `activity` messages in the log file to see which queries caused the crash.
 - When using Memberlist as KV store for hash rings, ensure that Memberlist is working correctly. See instructions for the [`MimirGossipMembersTooHigh`](#MimirGossipMembersTooHigh) and [`MimirGossipMembersTooLow`](#MimirGossipMembersTooLow) alerts.
-- When using ingest-storage and distributors are failing to write requests to Kafka, make sure that Kafka is up and running correctly.
+- When using [ingest-storage](#mimir-ingest-storage-experimental) and distributors are failing to write requests to Kafka, make sure that Kafka is up and running correctly.
 
 #### Alertmanager
 

From 4bf495f32d14353f48715e533862d0ec6fb2fa22 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Mon, 25 Mar 2024 13:22:21 +0100
Subject: [PATCH 10/19] Fix range.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
---
 operations/mimir-mixin-compiled-baremetal/alerts.yaml  | 2 +-
 operations/mimir-mixin-compiled/alerts.yaml            | 2 +-
 operations/mimir-mixin/alerts/ingest-storage.libsonnet | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
index bdf3b2e12b8..8eb31e31ee1 100644
--- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml
+++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
@@ -995,7 +995,7 @@ groups:
         from Kafka.
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
     expr: |
-      histogram_quantile(0.99, sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[$__rate_interval]))) > (10*60)
+      histogram_quantile(0.99, sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) > (10*60)
     for: 5m
     labels:
       severity: critical
diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml
index e181079a9b4..e2a2f156cbb 100644
--- a/operations/mimir-mixin-compiled/alerts.yaml
+++ b/operations/mimir-mixin-compiled/alerts.yaml
@@ -1008,7 +1008,7 @@ groups:
         from Kafka.
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
     expr: |
-      histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[$__rate_interval]))) > (10*60)
+      histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) > (10*60)
     for: 5m
     labels:
       severity: critical
diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
index 26d0d7b6e5d..0bd50d89f25 100644
--- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet
+++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
@@ -76,7 +76,7 @@
           alert: $.alertName('RunningIngesterReceiveDelayTooHigh'),
           'for': '5m',
           expr: |||
-            histogram_quantile(0.99, sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[$__rate_interval]))) > (10*60)
+            histogram_quantile(0.99, sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) > (10*60)
           ||| % $._config,
           labels: {
             severity: 'critical',

From aad59a4287a5c3ecc036ffcc73561c92e3398776 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Mon, 25 Mar 2024 13:22:27 +0100
Subject: [PATCH 11/19] Fix helm alerts.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
---
 .../metamonitoring/mixin-alerts.yaml          | 66 +++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml
index 72ad38a6c72..2edcf051d44 100644
--- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml
+++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml
@@ -978,6 +978,72 @@ spec:
       for: 15m
       labels:
         severity: critical
+    - alert: MimirIngesterFailedToReadRecordsFromKafka
+      annotations:
+        message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+          }} is failing to read records from Kafka.
+        runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailedtoreadrecordsfromkafka
+      expr: |
+        sum by(cluster, namespace, pod, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m]))
+        > 0
+      for: 5m
+      labels:
+        severity: critical
+    - alert: MimirIngesterKafkaFetchErrorsRateTooHigh
+      annotations:
+        message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+          }} is receiving fetch errors when reading records from Kafka.
+        runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh
+      expr: |
+        sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m]))
+        /
+        sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetches_total[5m]))
+        > 0.1
+      for: 15m
+      labels:
+        severity: critical
+    - alert: MimirStartingIngesterKafkaReceiveDelayIncreasing
+      annotations:
+        message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+          }} in "starting" phase is not reducing consumption lag of write requests read
+          from Kafka.
+        runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing
+      expr: |
+        deriv(histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0
+      for: 5m
+      labels:
+        severity: warning
+    - alert: MimirRunningIngesterReceiveDelayTooHigh
+      annotations:
+        message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+          }} in "running" phase is too far behind in its consumption of write requests
+          from Kafka.
+        runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
+      expr: |
+        histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) > (10*60)
+      for: 5m
+      labels:
+        severity: critical
+    - alert: MimirIngesterFailsToProcessRecordsFromKafka
+      annotations:
+        message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+          }} fails to consume write requests read from Kafka due to internal errors.
+        runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka
+      expr: |
+        sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m]) > 0
+      for: 5m
+      labels:
+        severity: critical
+    - alert: MimirIngesterFailsEnforceStrongConsistencyOnReadPath
+      annotations:
+        message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
+          }} fails to enforce strong-consistency on read-path.
+        runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath
+      expr: |
+        sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0
+      for: 5m
+      labels:
+        severity: critical
   - name: mimir_continuous_test
     rules:
     - alert: MimirContinuousTestNotRunningOnWrites

From 56c6c2175e9dc1c4cb1990d504c283c96bf13d6a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Mon, 25 Mar 2024 13:45:11 +0100
Subject: [PATCH 12/19] Fix typos.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
---
 docs/sources/mimir/manage/mimir-runbooks/_index.md          | 6 +++---
 .../templates/metamonitoring/mixin-alerts.yaml              | 2 +-
 operations/mimir-mixin-compiled-baremetal/alerts.yaml       | 2 +-
 operations/mimir-mixin-compiled/alerts.yaml                 | 2 +-
 operations/mimir-mixin/alerts/ingest-storage.libsonnet      | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md
index 6542bd8842c..646b0ef35eb 100644
--- a/docs/sources/mimir/manage/mimir-runbooks/_index.md
+++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md
@@ -190,7 +190,7 @@ How to **investigate**:
 
 - Check the `Mimir / Writes` dashboard
   - Looking at the dashboard you should see in which Mimir service the high latency originates
-  - The panels in the dashboard are vertically sorted by the network path (eg. gateway -> distributor -> ingester). When using [ingest-storage](#mimir-ingest-storage-experimental), network path changes to gateway -> distributor -> Kafka instead. 
+  - The panels in the dashboard are vertically sorted by the network path (eg. gateway -> distributor -> ingester). When using [ingest-storage](#mimir-ingest-storage-experimental), network path changes to gateway -> distributor -> Kafka instead.
 - Deduce where in the stack the latency is being introduced
   - **`gateway`**
     - Latency may be caused by the time taken for the gateway to receive the entire request from the client. There are a multitude of reasons this can occur, so communication with the user may be necessary. For example:
@@ -1340,7 +1340,7 @@ How it **works**:
 
 - Ingester connects to Kafka brokers and reads records from it.
 - When ingester fails to read more records from Kafka due to error, ingester logs such error.
-- This can be normal if Kafka brokers are restarting, however if read errors continue for some time, alert is raised. 
+- This can be normal if Kafka brokers are restarting, however if read errors continue for some time, alert is raised.
 
 How to **investigate**:
 
@@ -1384,7 +1384,7 @@ This alert fires when "receive delay" reported by ingester while it's running re
 
 How it **works**:
 
-- After ingester start and catches up with records in Kafka, ingester switches to "running" mode. 
+- After ingester start and catches up with records in Kafka, ingester switches to "running" mode.
 - In running mode, ingester continues to process incoming samples from Kafka and continues to report "receive delay". See [`MimirStartingIngesterKafkaReceiveDelayIncreasing`](#MimirStartingIngesterKafkaReceiveDelayIncreasing) runbook for details about this metric.
 - Under normal conditions when ingester is running and it is processing records faster than records are appearing, receive delay should be stable.
 - If observed "receive delay" increases and reaches certain threshold, alert is raised.
diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml
index 2edcf051d44..fb9769c7c7a 100644
--- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml
+++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml
@@ -1030,7 +1030,7 @@ spec:
           }} fails to consume write requests read from Kafka due to internal errors.
         runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka
       expr: |
-        sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m]) > 0
+        sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0
       for: 5m
       labels:
         severity: critical
diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
index 8eb31e31ee1..e438e9d32a2 100644
--- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml
+++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
@@ -1005,7 +1005,7 @@ groups:
         }} fails to consume write requests read from Kafka due to internal errors.
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka
     expr: |
-      sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m]) > 0
+      sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0
     for: 5m
     labels:
       severity: critical
diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml
index e2a2f156cbb..e78fded5b11 100644
--- a/operations/mimir-mixin-compiled/alerts.yaml
+++ b/operations/mimir-mixin-compiled/alerts.yaml
@@ -1018,7 +1018,7 @@ groups:
         }} fails to consume write requests read from Kafka due to internal errors.
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka
     expr: |
-      sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m]) > 0
+      sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0
     for: 5m
     labels:
       severity: critical
diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
index 0bd50d89f25..de03a51ae3f 100644
--- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet
+++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
@@ -90,7 +90,7 @@
           alert: $.alertName('IngesterFailsToProcessRecordsFromKafka'),
           'for': '5m',
           expr: |||
-            sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m]) > 0
+            sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0
           ||| % $._config,
           labels: {
             severity: 'critical',

From f41860bd13c017046ab93f5210a3373a0776e8f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Mon, 25 Mar 2024 13:51:50 +0100
Subject: [PATCH 13/19] Fix runbook name.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
---
 docs/sources/mimir/manage/mimir-runbooks/_index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md
index 646b0ef35eb..9325ccbaab2 100644
--- a/docs/sources/mimir/manage/mimir-runbooks/_index.md
+++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md
@@ -1332,7 +1332,7 @@ How to **investigate**:
 - Check ingester logs to find details about the error.
 - Check Kafka logs and health.
 
-### MimirIngesterIngesterFailedToReadRecordsFromKafka
+### MimirIngesterFailedToReadRecordsFromKafka
 
 This alert fires when an ingester is failing to read records from Kafka backend.
 

From 2b8f2b7ec491e90cd9367334cb9bd32e26fe276d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Mon, 25 Mar 2024 14:21:32 +0100
Subject: [PATCH 14/19] Update changelog entry.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
---
 CHANGELOG.md | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7539640db68..1cc77a6ad7b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,9 +24,15 @@
 
 * [ENHANCEMENT] Alerts: allow configuring alerts range interval via `_config.base_alerts_range_interval_minutes`. #7591
 * [ENHANCEMENT] Dashboards: Add panels for monitoring distributor and ingester when using ingest-storage. These panels are disabled by default, but can be enabled using `show_ingest_storage_panels: true` config option. Similarly existing panels used when distributors and ingesters use gRPC for forwarding requests can be disabled by setting `show_grpc_ingestion_panels: false`. #7670 #7699
-* [ENHANCEMENT] Alerts: add the following alerts when using ingest-storage: #7699
+* [ENHANCEMENT] Alerts: add the following alerts when using ingest-storage: #7699 #7702
   * `MimirIngesterLastConsumedOffsetCommitFailed`
-* [BUGFIX] Dashobards: Fix regular expression for matching read-path gRPC ingester methods to include querying of exemplars, label-related queries, or active series queries. #7676
+  * `MimirIngesterFailedToReadRecordsFromKafka`
+  * `MimirIngesterKafkaFetchErrorsRateTooHigh`
+  * `MimirStartingIngesterKafkaReceiveDelayIncreasing`
+  * `MimirRunningIngesterReceiveDelayTooHigh`
+  * `MimirIngesterFailsToProcessRecordsFromKafka`
+  * `MimirIngesterFailsEnforceStrongConsistencyOnReadPath`
+* [BUGFIX] Dashboards: Fix regular expression for matching read-path gRPC ingester methods to include querying of exemplars, label-related queries, or active series queries. #7676
 
 ### Jsonnet
 

From 99be6d40f00b46aeaab64079f5966520cd1de38d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Tue, 26 Mar 2024 16:06:02 +0100
Subject: [PATCH 15/19] Apply suggestions from code review

Co-authored-by: Marco Pracucci <marco@pracucci.com>
---
 docs/sources/mimir/manage/mimir-runbooks/_index.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md
index 9325ccbaab2..d8c3697d115 100644
--- a/docs/sources/mimir/manage/mimir-runbooks/_index.md
+++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md
@@ -1338,7 +1338,7 @@ This alert fires when an ingester is failing to read records from Kafka backend.
 
 How it **works**:
 
-- Ingester connects to Kafka brokers and reads records from it.
+- Ingester connects to Kafka brokers and reads records from it. Records contain write requests committed by distributors.
 - When ingester fails to read more records from Kafka due to error, ingester logs such error.
 - This can be normal if Kafka brokers are restarting, however if read errors continue for some time, alert is raised.
 
@@ -1353,7 +1353,7 @@ This alert fires when an ingester is receiving errors instead of "fetches" from
 
 How it **works**:
 
-- Ingester uses Kafka client to read records from Kafka.
+- Ingester uses Kafka client to read records (containing write requests) from Kafka.
 - Kafka client can return errors instead of more records.
 - If rate of returned errors compared to returned records is too high, alert is raised.
 - Kafka client can return errors [documented in the source code](https://github.com/grafana/mimir/blob/24591ae56cd7d6ef24a7cc1541a41405676773f4/vendor/github.com/twmb/franz-go/pkg/kgo/record_and_fetch.go#L332-L366).
@@ -1369,9 +1369,9 @@ This alert fires when "receive delay" reported by ingester during "starting" pha
 
 How it **works**:
 
-- When ingester is starting, it needs to fetch and process records from Kafka until preconfigured consumption lag is honored.
-- Each record has a timestamp when it was stored to Kafka. When ingester reads the record, it computes "receive delay" as a difference between current time (when record was read) and time when record was stored to Kafka. This receive delay is reported in metrics.
-- Under normal conditions when ingester is processing records faster than records are appearing, receive delay should be decreasing.
+- When ingester is starting, it needs to fetch and process records from Kafka until preconfigured consumption lag is honored. The maximum tolerated lag before an ingester is considered to have caught up reading from a partition at startup can be configured via `-ingest-storage.kafka.max-consumer-lag-at-startup`.
+- Each record has a timestamp when it was sent to Kafka by the distributor. When ingester reads the record, it computes "receive delay" as a difference between current time (when record was read) and time when record was sent to Kafka. This receive delay is reported in the metric `cortex_ingest_storage_reader_receive_delay_seconds`.
+- Under normal conditions when ingester is processing records faster than records are appearing, receive delay should be decreasing, until `-ingest-storage.kafka.max-consumer-lag-at-startup` is honored.
 - When ingester is starting, and observed "receive delay" is increasing, alert is raised.
 
 How to **investigate**:
@@ -1385,8 +1385,8 @@ This alert fires when "receive delay" reported by ingester while it's running re
 How it **works**:
 
 - After ingester start and catches up with records in Kafka, ingester switches to "running" mode.
-- In running mode, ingester continues to process incoming samples from Kafka and continues to report "receive delay". See [`MimirStartingIngesterKafkaReceiveDelayIncreasing`](#MimirStartingIngesterKafkaReceiveDelayIncreasing) runbook for details about this metric.
-- Under normal conditions when ingester is running and it is processing records faster than records are appearing, receive delay should be stable.
+- In running mode, ingester continues to process incoming records from Kafka and continues to report "receive delay". See [`MimirStartingIngesterKafkaReceiveDelayIncreasing`](#mimirstartingingesterkafkareceivedelayincreasing) runbook for details about this metric.
+- Under normal conditions when ingester is running and it is processing records faster than records are appearing, receive delay should be stable and low.
 - If observed "receive delay" increases and reaches certain threshold, alert is raised.
 
 How to **investigate**:

From c8b119a00cedd606e1f6c79da7b609ffded4bac7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Tue, 26 Mar 2024 18:02:38 +0100
Subject: [PATCH 16/19] Address review feedback.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
---
 docs/sources/mimir/manage/mimir-runbooks/_index.md | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md
index d8c3697d115..374790dc0ee 100644
--- a/docs/sources/mimir/manage/mimir-runbooks/_index.md
+++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md
@@ -1376,7 +1376,7 @@ How it **works**:
 
 How to **investigate**:
 
-- Check if ingester is fast enough to process all data in Kafka. If not, configure ingesters to start with later offset instead.
+- Check if ingester is fast enough to process all data in Kafka.
 
 ### MimirRunningIngesterReceiveDelayTooHigh
 
@@ -1392,7 +1392,7 @@ How it **works**:
 How to **investigate**:
 
 - Check if ingester is fast enough to process all data in Kafka.
-- If ingesters are too slow, consider scaling ingesters, either vertically (to make ingesters faster), or horizontally to spread incoming series between more ingesters.
+- If ingesters are too slow, consider scaling ingesters horizontally to spread incoming series between more ingesters.
 
 ### MimirIngesterFailsToProcessRecordsFromKafka
 
@@ -1401,7 +1401,7 @@ This alert fires when ingester is unable to process incoming records from Kafka
 How it **works**:
 
 - Ingester reads records from Kafka, and processes them locally. Processing means unmarshalling the data and handling write requests stored in records.
-- Write requests can fail due to "user" or "server" errors. Typical user error is too low limit for number of series. Server error can be for example ingester hitting an instance limit.
+- Write requests can fail due to "client" or "server" errors. An example of a client error is a limit for the number of series that is set too low. An example of a server error is an ingester hitting an instance limit.
 - If requests keep failing due to server errors, this alert is raised.
 
 How to **investigate**:
@@ -1420,9 +1420,8 @@ How it **works**:
 
 How to **investigate**:
 
-- Check wait latency of requests with strong-consistency.
+- Check the wait latency of requests with strong consistency on the `Mimir / Queries` dashboard.
 - Check if ingester needs to process too many records, and whether ingesters need to be scaled up (vertically or horizontally).
-- Consider increasing read-timeout of requests.
 
 ## Errors catalog
 

From f87ebacde6eb5811b1c1c1d202a868b7f78ec1c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Tue, 26 Mar 2024 18:07:38 +0100
Subject: [PATCH 17/19] Refer to dashboard.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
---
 docs/sources/mimir/manage/mimir-runbooks/_index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md
index 374790dc0ee..414bd340f88 100644
--- a/docs/sources/mimir/manage/mimir-runbooks/_index.md
+++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md
@@ -1370,7 +1370,7 @@ This alert fires when "receive delay" reported by ingester during "starting" pha
 How it **works**:
 
 - When ingester is starting, it needs to fetch and process records from Kafka until preconfigured consumption lag is honored. The maximum tolerated lag before an ingester is considered to have caught up reading from a partition at startup can be configured via `-ingest-storage.kafka.max-consumer-lag-at-startup`.
-- Each record has a timestamp when it was sent to Kafka by the distributor. When ingester reads the record, it computes "receive delay" as a difference between current time (when record was read) and time when record was sent to Kafka. This receive delay is reported in the metric `cortex_ingest_storage_reader_receive_delay_seconds`.
+- Each record has a timestamp of when it was sent to Kafka by the distributor. When an ingester reads the record, it computes the "receive delay" as the difference between the current time (when the record was read) and the time when the record was sent to Kafka. This receive delay is reported in the metric `cortex_ingest_storage_reader_receive_delay_seconds`. You can see the receive delay on the `Mimir / Writes` dashboard, in the section "Ingester (ingest storage – end-to-end latency)".
 - Under normal conditions when ingester is processing records faster than records are appearing, receive delay should be decreasing, until `-ingest-storage.kafka.max-consumer-lag-at-startup` is honored.
 - When ingester is starting, and observed "receive delay" is increasing, alert is raised.
 

From c91e214d4912a286ed06c626bb3a4beebf63ad8b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Wed, 27 Mar 2024 08:54:28 +0100
Subject: [PATCH 18/19] Use average instead of p99 for
 StartingIngesterKafkaReceiveDelayIncreasing and
 RunningIngesterReceiveDelayTooHigh alerts.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
---
 .../templates/metamonitoring/mixin-alerts.yaml       | 12 ++++++++++--
 .../mimir-mixin-compiled-baremetal/alerts.yaml       | 12 ++++++++++--
 operations/mimir-mixin-compiled/alerts.yaml          | 12 ++++++++++--
 .../mimir-mixin/alerts/ingest-storage.libsonnet      | 12 ++++++++++--
 4 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml
index fb9769c7c7a..775f12e2003 100644
--- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml
+++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml
@@ -1009,7 +1009,11 @@ spec:
           from Kafka.
         runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing
       expr: |
-        deriv(histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0
+        deriv((
+            histogram_sum(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+            /
+            histogram_count(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+        )[5m:1m]) > 0
       for: 5m
       labels:
         severity: warning
@@ -1020,7 +1024,11 @@ spec:
           from Kafka.
         runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
       expr: |
-        histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) > (10*60)
+        (
+          histogram_sum(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+          /
+          histogram_count(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+        ) > (10 * 60)
       for: 5m
       labels:
         severity: critical
diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
index e438e9d32a2..73b881dd39d 100644
--- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml
+++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
@@ -984,7 +984,11 @@ groups:
         from Kafka.
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing
     expr: |
-      deriv(histogram_quantile(0.99, sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0
+      deriv((
+          histogram_sum(sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+          /
+          histogram_count(sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+      )[5m:1m]) > 0
     for: 5m
     labels:
       severity: warning
@@ -995,7 +999,11 @@ groups:
         from Kafka.
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
     expr: |
-      histogram_quantile(0.99, sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) > (10*60)
+      (
+        histogram_sum(sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+        /
+        histogram_count(sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+      ) > (10 * 60)
     for: 5m
     labels:
       severity: critical
diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml
index e78fded5b11..4c6870d4877 100644
--- a/operations/mimir-mixin-compiled/alerts.yaml
+++ b/operations/mimir-mixin-compiled/alerts.yaml
@@ -997,7 +997,11 @@ groups:
         from Kafka.
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing
     expr: |
-      deriv(histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0
+      deriv((
+          histogram_sum(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+          /
+          histogram_count(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+      )[5m:1m]) > 0
     for: 5m
     labels:
       severity: warning
@@ -1008,7 +1012,11 @@ groups:
         from Kafka.
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
     expr: |
-      histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) > (10*60)
+      (
+        histogram_sum(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+        /
+        histogram_count(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+      ) > (10 * 60)
     for: 5m
     labels:
       severity: critical
diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
index de03a51ae3f..c8d1057534f 100644
--- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet
+++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
@@ -62,7 +62,11 @@
           alert: $.alertName('StartingIngesterKafkaReceiveDelayIncreasing'),
           'for': '5m',
           expr: |||
-            deriv(histogram_quantile(0.99, sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0
+            deriv((
+                histogram_sum(sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+                /
+                histogram_count(sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+            )[5m:1m]) > 0
           ||| % $._config,
           labels: {
             severity: 'warning',
@@ -76,7 +80,11 @@
           alert: $.alertName('RunningIngesterReceiveDelayTooHigh'),
           'for': '5m',
           expr: |||
-            histogram_quantile(0.99, sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) > (10*60)
+            (
+              histogram_sum(sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+              /
+              histogram_count(sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+            ) > (10 * 60)
           ||| % $._config,
           labels: {
             severity: 'critical',

From fe281bd31d492bc5c015179aab1282afa44d5f71 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= <pstibrany@gmail.com>
Date: Wed, 27 Mar 2024 09:49:58 +0100
Subject: [PATCH 19/19] Use series from classic histograms for
 StartingIngesterKafkaReceiveDelayIncreasing and
 RunningIngesterReceiveDelayTooHigh alerts, because mixtool can't handle
 native histogram functions.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
---
 .../templates/metamonitoring/mixin-alerts.yaml         |  8 ++++----
 operations/mimir-mixin-compiled-baremetal/alerts.yaml  |  8 ++++----
 operations/mimir-mixin-compiled/alerts.yaml            |  8 ++++----
 operations/mimir-mixin/alerts/ingest-storage.libsonnet | 10 ++++++----
 4 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml
index 775f12e2003..48d0638ffc0 100644
--- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml
+++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml
@@ -1010,9 +1010,9 @@ spec:
         runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing
       expr: |
         deriv((
-            histogram_sum(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+            sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m]))
             /
-            histogram_count(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+            sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m]))
         )[5m:1m]) > 0
       for: 5m
       labels:
@@ -1025,9 +1025,9 @@ spec:
         runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
       expr: |
         (
-          histogram_sum(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+          sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m]))
           /
-          histogram_count(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+          sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m]))
         ) > (10 * 60)
       for: 5m
       labels:
diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
index 73b881dd39d..12f92aea2e7 100644
--- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml
+++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
@@ -985,9 +985,9 @@ groups:
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing
     expr: |
       deriv((
-          histogram_sum(sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+          sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m]))
           /
-          histogram_count(sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+          sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m]))
       )[5m:1m]) > 0
     for: 5m
     labels:
@@ -1000,9 +1000,9 @@ groups:
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
     expr: |
       (
-        histogram_sum(sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+        sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m]))
         /
-        histogram_count(sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+        sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m]))
       ) > (10 * 60)
     for: 5m
     labels:
diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml
index 4c6870d4877..9fff3bc2888 100644
--- a/operations/mimir-mixin-compiled/alerts.yaml
+++ b/operations/mimir-mixin-compiled/alerts.yaml
@@ -998,9 +998,9 @@ groups:
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing
     expr: |
       deriv((
-          histogram_sum(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+          sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m]))
           /
-          histogram_count(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+          sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m]))
       )[5m:1m]) > 0
     for: 5m
     labels:
@@ -1013,9 +1013,9 @@ groups:
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
     expr: |
       (
-        histogram_sum(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+        sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m]))
         /
-        histogram_count(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+        sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m]))
       ) > (10 * 60)
     for: 5m
     labels:
diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
index c8d1057534f..81944eb040e 100644
--- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet
+++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet
@@ -61,11 +61,12 @@
         {
           alert: $.alertName('StartingIngesterKafkaReceiveDelayIncreasing'),
           'for': '5m',
+          // We're using series from classic histogram here, because mixtool lint doesn't support histogram_sum, histogram_count functions yet.
           expr: |||
             deriv((
-                histogram_sum(sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+                sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m]))
                 /
-                histogram_count(sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+                sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m]))
             )[5m:1m]) > 0
           ||| % $._config,
           labels: {
@@ -79,11 +80,12 @@
         {
           alert: $.alertName('RunningIngesterReceiveDelayTooHigh'),
           'for': '5m',
+          // We're using series from classic histogram here, because mixtool lint doesn't support histogram_sum, histogram_count functions yet.
           expr: |||
             (
-              histogram_sum(sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+              sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m]))
               /
-              histogram_count(sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+              sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m]))
             ) > (10 * 60)
           ||| % $._config,
           labels: {