From 97c4104148dba60a1b2755d56e349999ffe71dfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Fri, 22 Mar 2024 18:07:29 +0100 Subject: [PATCH 01/19] Add IngesterFailedToReadRecordsFromKafka and IngesterKafkaFetchErrorsRateTooHigh alerts. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- .../alerts.yaml | 24 ++++++++++++ operations/mimir-mixin-compiled/alerts.yaml | 24 ++++++++++++ .../alerts/ingest-storage.libsonnet | 37 +++++++++++++++++++ pkg/storage/ingest/reader.go | 12 ++++-- pkg/storage/ingest/reader_test.go | 35 +++++++++++++++++- 5 files changed, 126 insertions(+), 6 deletions(-) diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index da9d6919d7b..ca400150edd 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -953,6 +953,30 @@ groups: for: 15m labels: severity: critical + - alert: MimirIngesterFailedToReadRecordsFromKafka + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is failing to read records from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailedtoreadrecordsfromkafka + expr: | + sum by(cluster, namespace, instance, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m])) + > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterKafkaFetchErrorsRateTooHigh + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is receiving fetch errors when reading records from Kafka. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh + expr: | + sum by (cluster, namespace, instance) (rate (cortex_ingest_storage_reader_fetch_errors_total}[5m])) + / + sum by (cluster, namespace, instance) (rate (cortex_ingest_storage_reader_fetches_total[5m])) + > 0.1 + for: 15m + labels: + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index 7599b0d3700..a2cbe41dc92 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -966,6 +966,30 @@ groups: for: 15m labels: severity: critical + - alert: MimirIngesterFailedToReadRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is failing to read records from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailedtoreadrecordsfromkafka + expr: | + sum by(cluster, namespace, pod, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m])) + > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterKafkaFetchErrorsRateTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is receiving fetch errors when reading records from Kafka. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh + expr: | + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetch_errors_total}[5m])) + / + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetches_total[5m])) + > 0.1 + for: 15m + labels: + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet index db80497b50f..eb4e826cd91 100644 --- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet +++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet @@ -19,6 +19,43 @@ message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s is failing to commit the last consumed offset.' % $._config, }, }, + + { + alert: $.alertName('IngesterFailedToReadRecordsFromKafka'), + 'for': '5m', + + // Metric used by this alert is reported by Kafka client on read errors from connection to Kafka. + // We use node_id to only alert if problems to the same Kafka node are repeating. + // If problems are for different nodes (eg. during rollout), that is not a problem, and we don't need to trigger alert. + expr: ||| + sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m])) + > 0 + ||| % $._config, + labels: { + severity: 'critical', + }, + annotations: { + message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s is failing to read records from Kafka.' % $._config, + }, + }, + + { + alert: $.alertName('IngesterKafkaFetchErrorsRateTooHigh'), + 'for': '15m', + // See https://github.com/grafana/mimir/blob/24591ae56cd7d6ef24a7cc1541a41405676773f4/vendor/github.com/twmb/franz-go/pkg/kgo/record_and_fetch.go#L332-L366 for errors that can be reported here. 
+ expr: ||| + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate (cortex_ingest_storage_reader_fetch_errors_total}[5m])) + / + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate (cortex_ingest_storage_reader_fetches_total[5m])) + > 0.1 + ||| % $._config, + labels: { + severity: 'critical', + }, + annotations: { + message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s is receiving fetch errors when reading records from Kafka.' % $._config, + }, + }, ], }, ], diff --git a/pkg/storage/ingest/reader.go b/pkg/storage/ingest/reader.go index 1718adec583..42b5864ea01 100644 --- a/pkg/storage/ingest/reader.go +++ b/pkg/storage/ingest/reader.go @@ -191,7 +191,7 @@ func (r *PartitionReader) run(ctx context.Context) error { func (r *PartitionReader) processNextFetches(ctx context.Context, delayObserver prometheus.Observer) { fetches := r.client.PollFetches(ctx) r.recordFetchesMetrics(fetches, delayObserver) - r.logFetchErrs(fetches) + r.logFetchErrors(fetches) fetches = filterOutErrFetches(fetches) // TODO consumeFetches() may get interrupted in the middle because of ctx canceled due to PartitionReader stopped. @@ -274,12 +274,16 @@ func isErrFetch(fetch kgo.Fetch) bool { return false } -func (r *PartitionReader) logFetchErrs(fetches kgo.Fetches) { +func (r *PartitionReader) logFetchErrors(fetches kgo.Fetches) { mErr := multierror.New() - fetches.EachError(func(s string, i int32, err error) { + fetches.EachError(func(topic string, partition int32, err error) { + if errors.Is(err, context.Canceled) { + return + } + // kgo advises to "restart" the kafka client if the returned error is a kerr.Error. // Recreating the client would cause duplicate metrics registration, so we don't do it for now. 
- mErr.Add(fmt.Errorf("topic %q, partition %d: %w", s, i, err)) + mErr.Add(fmt.Errorf("topic %q, partition %d: %w", topic, partition, err)) }) if len(mErr) == 0 { return diff --git a/pkg/storage/ingest/reader_test.go b/pkg/storage/ingest/reader_test.go index ee49c76c715..3481dbe5c54 100644 --- a/pkg/storage/ingest/reader_test.go +++ b/pkg/storage/ingest/reader_test.go @@ -69,6 +69,37 @@ func TestPartitionReader(t *testing.T) { assert.Equal(t, [][]byte{content, content}, records) } +func TestPartitionReader_logFetchErrors(t *testing.T) { + const ( + topicName = "test" + partitionID = 1 + ) + + cfg := defaultReaderTestConfig(t, "", topicName, partitionID, nil) + reader, err := newPartitionReader(cfg.kafka, cfg.partitionID, "test-group", cfg.consumer, cfg.logger, cfg.registry) + require.NoError(t, err) + + reader.logFetchErrors(kgo.Fetches{ + kgo.Fetch{Topics: []kgo.FetchTopic{ + { + Topic: topicName, + Partitions: []kgo.FetchPartition{ + {Partition: partitionID, Err: nil}, + {Partition: partitionID, Err: context.Canceled}, // not counted in metrics + {Partition: partitionID, Err: fmt.Errorf("wrapped: %w", context.Canceled)}, // not counted in metrics + {Partition: partitionID, Err: fmt.Errorf("real error")}, // counted + }, + }, + }}, + }) + + assert.NoError(t, promtest.GatherAndCompare(cfg.registry, strings.NewReader(` + # HELP cortex_ingest_storage_reader_fetch_errors_total The number of fetch errors encountered by the consumer. 
+ # TYPE cortex_ingest_storage_reader_fetch_errors_total counter + cortex_ingest_storage_reader_fetch_errors_total 1 + `), "cortex_ingest_storage_reader_fetch_errors_total")) +} + func TestPartitionReader_ConsumerError(t *testing.T) { const ( topicName = "test" @@ -1114,7 +1145,7 @@ type readerTestCfg struct { kafka KafkaConfig partitionID int32 consumer recordConsumer - registry prometheus.Registerer + registry *prometheus.Registry logger log.Logger commitInterval time.Duration } @@ -1145,7 +1176,7 @@ func withConsumeFromPositionAtStartup(position string) func(cfg *readerTestCfg) } } -func withRegistry(reg prometheus.Registerer) func(cfg *readerTestCfg) { +func withRegistry(reg *prometheus.Registry) func(cfg *readerTestCfg) { return func(cfg *readerTestCfg) { cfg.registry = reg } From fa844446f5eda7d9bd6450b2f083b572e2900d7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 25 Mar 2024 12:04:17 +0100 Subject: [PATCH 02/19] Alerts for ingester kafka lag. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- .../alerts.yaml | 22 ++++++++++++++ operations/mimir-mixin-compiled/alerts.yaml | 22 ++++++++++++++ .../alerts/ingest-storage.libsonnet | 29 +++++++++++++++++++ 3 files changed, 73 insertions(+) diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index ca400150edd..2a5abaee3c4 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -977,6 +977,28 @@ groups: for: 15m labels: severity: critical + - alert: MimirStartingIngesterKafkaLagNotDecreasing + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace + }} in "starting" phase is not reducing consumption lag of write requests read + from Kafka. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkalagnotdecreasing + expr: | + deriv(histogram_quantile(0.99, sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0 + for: 5m + labels: + severity: warning + - alert: MimirRunningIngesterKafkaLagTooHigh + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace + }} in "running" phase is too far behind in its consumption of write requests + from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterkafkalagtoohigh + expr: | + histogram_quantile(0.99, sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[$__rate_interval]))) > (10*60) + for: 5m + labels: + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index a2cbe41dc92..e9e67a82718 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -990,6 +990,28 @@ groups: for: 15m labels: severity: critical + - alert: MimirStartingIngesterKafkaLagNotDecreasing + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} in "starting" phase is not reducing consumption lag of write requests read + from Kafka. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkalagnotdecreasing + expr: | + deriv(histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0 + for: 5m + labels: + severity: warning + - alert: MimirRunningIngesterKafkaLagTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} in "running" phase is too far behind in its consumption of write requests + from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterkafkalagtoohigh + expr: | + histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[$__rate_interval]))) > (10*60) + for: 5m + labels: + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet index eb4e826cd91..aba0b5b8644 100644 --- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet +++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet @@ -56,6 +56,35 @@ message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s is receiving fetch errors when reading records from Kafka.' % $._config, }, }, + + // This is an experiment. We compute derivatition (ie. rate of consumption lag change) over 5 minutes. If derivation is above 0, it means consumption lag is increasing, instead of decreasing. 
+ { + alert: $.alertName('StartingIngesterKafkaLagNotDecreasing'), + 'for': '5m', + expr: ||| + deriv(histogram_quantile(0.99, sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0 + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s in "starting" phase is not reducing consumption lag of write requests read from Kafka.' % $._config, + }, + }, + + { + alert: $.alertName('RunningIngesterKafkaLagTooHigh'), + 'for': '5m', + expr: ||| + histogram_quantile(0.99, sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[$__rate_interval]))) > (10*60) + ||| % $._config, + labels: { + severity: 'critical', + }, + annotations: { + message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s in "running" phase is too far behind in its consumption of write requests from Kafka.' % $._config, + }, + }, ], }, ], From b4134400c1461a63a0948edc55621797375a900e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 25 Mar 2024 12:08:28 +0100 Subject: [PATCH 03/19] Add alert for failures to consume write requests. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- .../mimir-mixin-compiled-baremetal/alerts.yaml | 10 ++++++++++ operations/mimir-mixin-compiled/alerts.yaml | 10 ++++++++++ .../mimir-mixin/alerts/ingest-storage.libsonnet | 14 ++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index 2a5abaee3c4..bd045ac16f3 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -999,6 +999,16 @@ groups: for: 5m labels: severity: critical + - alert: MimirIngesterFailsToProcessRecordsFromKafka + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace + }} fails to consume write requests read from Kafka due to internal errors. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka + expr: | + sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[5m]) > 0 + for: 5m + labels: + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index e9e67a82718..b9422ed687d 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -1012,6 +1012,16 @@ groups: for: 5m labels: severity: critical + - alert: MimirIngesterFailsToProcessRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} fails to consume write requests read from Kafka due to internal errors. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka + expr: | + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[5m]) > 0 + for: 5m + labels: + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet index aba0b5b8644..d2df0f9f5a4 100644 --- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet +++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet @@ -85,6 +85,20 @@ message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s in "running" phase is too far behind in its consumption of write requests from Kafka.' % $._config, }, }, + + { + alert: $.alertName('IngesterFailsToProcessRecordsFromKafka'), + 'for': '5m', + expr: ||| + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[5m]) > 0 + ||| % $._config, + labels: { + severity: 'critical', + }, + annotations: { + message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s fails to consume write requests read from Kafka due to internal errors.' % $._config, + }, + }, ], }, ], From 124d3aff87947a2ef9e25bccd709eaaaedff82c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 25 Mar 2024 12:26:20 +0100 Subject: [PATCH 04/19] Add alert for failures to enforce strong consistency. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- .../mimir-mixin-compiled-baremetal/alerts.yaml | 10 ++++++++++ operations/mimir-mixin-compiled/alerts.yaml | 10 ++++++++++ .../mimir-mixin/alerts/ingest-storage.libsonnet | 14 ++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index bd045ac16f3..c15f7529b44 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -1009,6 +1009,16 @@ groups: for: 5m labels: severity: critical + - alert: MimirIngesterFailsEnforceStrongConsistencyOnReadPath + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace + }} fails to enforce strong-consistency on read-path. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath + expr: | + sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_strong_consistency_failures_total[5m])) > 0 + for: 5m + labels: + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index b9422ed687d..2acea51bfa1 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -1022,6 +1022,16 @@ groups: for: 5m labels: severity: critical + - alert: MimirIngesterFailsEnforceStrongConsistencyOnReadPath + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} fails to enforce strong-consistency on read-path. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath + expr: | + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_strong_consistency_failures_total[5m])) > 0 + for: 5m + labels: + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet index d2df0f9f5a4..698b4df1241 100644 --- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet +++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet @@ -99,6 +99,20 @@ message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s fails to consume write requests read from Kafka due to internal errors.' % $._config, }, }, + + { + alert: $.alertName('IngesterFailsEnforceStrongConsistencyOnReadPath'), + 'for': '5m', + expr: ||| + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_strong_consistency_failures_total[5m])) > 0 + ||| % $._config, + labels: { + severity: 'critical', + }, + annotations: { + message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s fails to enforce strong-consistency on read-path.' % $._config, + }, + }, ], }, ], From 64ca93fec39d778d2036afd16833a2d9231e58f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 25 Mar 2024 12:45:48 +0100 Subject: [PATCH 05/19] Runbooks. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- .../mimir/manage/mimir-runbooks/_index.md | 46 +++++++++++++++++++ .../alerts.yaml | 8 ++-- operations/mimir-mixin-compiled/alerts.yaml | 8 ++-- .../alerts/ingest-storage.libsonnet | 4 +- 4 files changed, 56 insertions(+), 10 deletions(-) diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md index 294d3e98ee1..60d83fdf68b 100644 --- a/docs/sources/mimir/manage/mimir-runbooks/_index.md +++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md @@ -1327,6 +1327,52 @@ How to **investigate**: - Check ingester logs to find details about the error. - Check Kafka logs and health. +### MimirIngesterFailedToReadRecordsFromKafka + +This alert fires when an ingester is failing to read records from Kafka backend. + +How it **works**: + +- Ingester connects to Kafka brokers and reads records from them. +- When ingester fails to read more records from Kafka due to error, ingester logs such error. +- This can be normal if Kafka brokers are restarting, however if read errors continue for some time, alert is raised. + +How to **investigate**: + +- Check ingester logs to find details about the error. +- Check Kafka logs and health. + +### MimirIngesterKafkaFetchErrorsRateTooHigh + +This alert fires when an ingester is receiving errors instead of "fetches" from Kafka. + +How it **works**: + +- Ingester uses Kafka client to read records from Kafka. +- Kafka client can return errors instead of more records. +- If rate of returned errors compared to returned records is too high, alert is raised. +- Kafka client can return errors [documented in the source code](https://github.com/grafana/mimir/blob/24591ae56cd7d6ef24a7cc1541a41405676773f4/vendor/github.com/twmb/franz-go/pkg/kgo/record_and_fetch.go#L332-L366). + +How to **investigate**: + +- Check ingester logs to find details about the error.
+- Check Kafka logs and health. + +### MimirStartingIngesterKafkaReceiveDelayIncreasing + +This alert fires when consumption lag reported by ingester during "starting" phase is not decreasing. + +How it **works**: + +- When ingester is starting, it needs to fetch and process records from Kafka until preconfigured consumption lag is honored. +- Each record has a timestamp when it was stored to Kafka. When ingester reads the record, it computes "receive delay" as a difference between current time (when record was read) and time when record was stored to Kafka. This receive delay is reported in metrics. +- Under normal conditions when ingester is processing records faster than records are appearing, receive delay should be decreasing. +- When ingester is starting, and observed "receive delay" is increasing, alert is raised. + +How to **investigate**: + +- Check if ingester is fast enough to process all data in Kafka. If not, configure ingesters to start with later offset instead. + ## Errors catalog Mimir has some codified error IDs that you might see in HTTP responses or logs. diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index c15f7529b44..f365206028d 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -977,23 +977,23 @@ groups: for: 15m labels: severity: critical - - alert: MimirStartingIngesterKafkaLagNotDecreasing + - alert: MimirStartingIngesterKafkaReceiveDelayIncreasing annotations: message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "starting" phase is not reducing consumption lag of write requests read from Kafka. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkalagnotdecreasing + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing expr: | deriv(histogram_quantile(0.99, sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0 for: 5m labels: severity: warning - - alert: MimirRunningIngesterKafkaLagTooHigh + - alert: MimirRunningIngesterReceiveDelayTooHigh annotations: message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "running" phase is too far behind in its consumption of write requests from Kafka. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterkafkalagtoohigh + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh expr: | histogram_quantile(0.99, sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[$__rate_interval]))) > (10*60) for: 5m diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index 2acea51bfa1..7ed12bb6a68 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -990,23 +990,23 @@ groups: for: 15m labels: severity: critical - - alert: MimirStartingIngesterKafkaLagNotDecreasing + - alert: MimirStartingIngesterKafkaReceiveDelayIncreasing annotations: message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "starting" phase is not reducing consumption lag of write requests read from Kafka. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkalagnotdecreasing + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing expr: | deriv(histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0 for: 5m labels: severity: warning - - alert: MimirRunningIngesterKafkaLagTooHigh + - alert: MimirRunningIngesterReceiveDelayTooHigh annotations: message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "running" phase is too far behind in its consumption of write requests from Kafka. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterkafkalagtoohigh + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh expr: | histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[$__rate_interval]))) > (10*60) for: 5m diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet index 698b4df1241..15a7de8bc8a 100644 --- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet +++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet @@ -59,7 +59,7 @@ // This is an experiment. We compute derivatition (ie. rate of consumption lag change) over 5 minutes. If derivation is above 0, it means consumption lag is increasing, instead of decreasing. 
{ - alert: $.alertName('StartingIngesterKafkaLagNotDecreasing'), + alert: $.alertName('StartingIngesterKafkaReceiveDelayIncreasing'), 'for': '5m', expr: ||| deriv(histogram_quantile(0.99, sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0 @@ -73,7 +73,7 @@ }, { - alert: $.alertName('RunningIngesterKafkaLagTooHigh'), + alert: $.alertName('RunningIngesterReceiveDelayTooHigh'), 'for': '5m', expr: ||| histogram_quantile(0.99, sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[$__rate_interval]))) > (10*60) From 0d3eab3736ed960d028ff748e3e1e9ae6b07f5ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 25 Mar 2024 13:11:14 +0100 Subject: [PATCH 06/19] Runbooks. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- .../mimir/manage/mimir-runbooks/_index.md | 48 ++++++++++++++++++- .../alerts.yaml | 4 +- operations/mimir-mixin-compiled/alerts.yaml | 4 +- .../alerts/ingest-storage.libsonnet | 4 +- 4 files changed, 53 insertions(+), 7 deletions(-) diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md index 60d83fdf68b..92184579837 100644 --- a/docs/sources/mimir/manage/mimir-runbooks/_index.md +++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md @@ -1360,7 +1360,7 @@ How to **investigate**: ### MimirStartingIngesterKafkaReceiveDelayIncreasing -This alert fires when consumption lag reported by ingester during "starting" phase is not decreasing. +This alert fires when "receive delay" reported by ingester during "starting" phase is not decreasing. How it **works**: @@ -1373,6 +1373,52 @@ How to **investigate**: - Check if ingester is fast enough to process all data in Kafka. 
If not, configure ingesters to start with later offset instead. +### MimirRunningIngesterReceiveDelayTooHigh + +This alert fires when "receive delay" reported by ingester while it's running reaches alert threshold. + +How it **works**: + +- After ingester starts and catches up with records in Kafka, ingester switches to "running" mode. +- In running mode, ingester continues to process incoming samples from Kafka and continues to report "receive delay". See [`MimirStartingIngesterKafkaReceiveDelayIncreasing`](#mimirstartingingesterkafkareceivedelayincreasing) runbook for details about this metric. +- Under normal conditions when ingester is running and it is processing records faster than records are appearing, receive delay should be stable. +- If observed "receive delay" increases and reaches certain threshold, alert is raised. + +How to **investigate**: + +- Check if ingester is fast enough to process all data in Kafka. +- If ingesters are too slow, consider scaling ingesters, either vertically (to make ingesters faster), or horizontally to spread incoming series between more ingesters. + +### MimirIngesterFailsToProcessRecordsFromKafka + +This alert fires when ingester is unable to process incoming records from Kafka due to internal errors. If ingest-storage wasn't used, such push requests would end up with 5xx errors. + +How it **works**: + +- Ingester reads records from Kafka, and processes them locally. Processing means unmarshalling the data and handling write requests stored in records. +- Write requests can fail due to "user" or "server" errors. Typical user error is too low limit for number of series. Server error can be for example ingester hitting an instance limit. +- If requests keep failing due to server errors, this alert is raised. + +How to **investigate**: + +- Check ingester logs to see why requests are failing, and troubleshoot based on that.
+ +### MimirIngesterFailsEnforceStrongConsistencyOnReadPath + +This alert fires when too many read-requests with strong consistency are failing. + +How it **works**: + +- When read request asks for strong-consistency guarantee, ingester will read the last produced offset from Kafka, and wait until record with this offset is consumed. +- If read request times out during this wait, that is considered to be a failure of request with strong-consistency. +- If requests keep failing due to failure to enforce strong-consistency, this alert is raised. + +How to **investigate**: + +- Check wait latency of requests with strong-consistency. +- Check if ingester needs to process too many records, and whether ingesters need to be scaled up (vertically or horizontally). +- Consider increasing read-timeout of requests. + ## Errors catalog Mimir has some codified error IDs that you might see in HTTP responses or logs. diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index f365206028d..bd32e2f844e 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -1005,7 +1005,7 @@ groups: }} fails to consume write requests read from Kafka due to internal errors. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka expr: | - sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[5m]) > 0 + sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m]) > 0 for: 5m labels: severity: critical @@ -1015,7 +1015,7 @@ groups: }} fails to enforce strong-consistency on read-path. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath expr: | - sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_strong_consistency_failures_total[5m])) > 0 + sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0 for: 5m labels: severity: critical diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index 7ed12bb6a68..97402bd970a 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -1018,7 +1018,7 @@ groups: }} fails to consume write requests read from Kafka due to internal errors. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka expr: | - sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[5m]) > 0 + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m]) > 0 for: 5m labels: severity: critical @@ -1028,7 +1028,7 @@ groups: }} fails to enforce strong-consistency on read-path. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath expr: | - sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_strong_consistency_failures_total[5m])) > 0 + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0 for: 5m labels: severity: critical diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet index 15a7de8bc8a..fd915ee07bf 100644 --- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet +++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet @@ -90,7 +90,7 @@ alert: $.alertName('IngesterFailsToProcessRecordsFromKafka'), 'for': '5m', expr: ||| - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[5m]) > 0 + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m]) > 0 ||| % $._config, labels: { severity: 'critical', @@ -104,7 +104,7 @@ alert: $.alertName('IngesterFailsEnforceStrongConsistencyOnReadPath'), 'for': '5m', expr: ||| - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_strong_consistency_failures_total[5m])) > 0 + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0 ||| % $._config, labels: { severity: 'critical', From 9a6b3e60b7926ce63e5f71d49108cf9443c0ab35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Fri, 22 Mar 2024 16:03:48 +0100 Subject: [PATCH 07/19] Mention ingest-storage cases.
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- docs/sources/mimir/manage/mimir-runbooks/_index.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md index 92184579837..fd98180d95b 100644 --- a/docs/sources/mimir/manage/mimir-runbooks/_index.md +++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md @@ -190,7 +190,7 @@ How to **investigate**: - Check the `Mimir / Writes` dashboard - Looking at the dashboard you should see in which Mimir service the high latency originates - - The panels in the dashboard are vertically sorted by the network path (eg. gateway -> distributor -> ingester) + - The panels in the dashboard are vertically sorted by the network path (eg. gateway -> distributor -> ingester). When using ingest-storage, network path changes to gateway -> distributor -> Kafka instead. - Deduce where in the stack the latency is being introduced - **`gateway`** - Latency may be caused by the time taken for the gateway to receive the entire request from the client. There are a multitude of reasons this can occur, so communication with the user may be necessary. For example: @@ -201,6 +201,7 @@ How to **investigate**: - There could be a problem with authentication (eg. slow to run auth layer) - **`distributor`** - Typically, distributor p99 latency is in the range 50-100ms. If the distributor latency is higher than this, you may need to scale up the distributors. + - When using Mimir ingest-storage, distributors are writing requests to Kafka-compatible backend. Increased latency in distributor may also come from this backend. - **`ingester`** - Typically, ingester p99 latency is in the range 5-50ms. If the ingester latency is higher than this, you should investigate the root cause before scaling up ingesters. 
- Check out the following alerts and fix them if firing: @@ -243,6 +244,9 @@ How to **investigate**: - If queries are not waiting in queue - Consider [enabling query sharding]({{< relref "../../references/architecture/query-sharding#how-to-enable-query-sharding" >}}) if not already enabled, to increase query parallelism - If query sharding already enabled, consider increasing total number of query shards (`query_sharding_total_shards`) for tenants submitting slow queries, so their queries can be further parallelized + - **`ingester`** + - Check if ingesters are not overloaded. If they are and you can scale up ingesters vertically, that may be the best action. If that's not possible, scaling horizontally can help as well, but it can take several hours for ingesters to fully redistribute their series. + - When using ingest-storage, check ratio of queries using strong-consistency, and latency of queries using strong-consistency. #### Alertmanager @@ -278,6 +282,7 @@ How to **investigate**: - If the failing service is crashing / panicking: look for the stack trace in the logs and investigate from there - If crashing service is query-frontend, querier or store-gateway, and you have "activity tracker" feature enabled, look for `found unfinished activities from previous run` message and subsequent `activity` messages in the log file to see which queries caused the crash. - When using Memberlist as KV store for hash rings, ensure that Memberlist is working correctly. See instructions for the [`MimirGossipMembersTooHigh`](#MimirGossipMembersTooHigh) and [`MimirGossipMembersTooLow`](#MimirGossipMembersTooLow) alerts. +- When using ingest-storage and distributors are failing to write requests to Kafka, make sure that Kafka is up and running correctly. #### Alertmanager From 976a1627ec51b4a1f77e5fa0564126db57b598ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 25 Mar 2024 13:16:56 +0100 Subject: [PATCH 08/19] Fix query. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- operations/mimir-mixin-compiled-baremetal/alerts.yaml | 2 +- operations/mimir-mixin-compiled/alerts.yaml | 2 +- operations/mimir-mixin/alerts/ingest-storage.libsonnet | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index bd32e2f844e..bdf3b2e12b8 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -970,7 +970,7 @@ groups: }} is receiving fetch errors when reading records from Kafka. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh expr: | - sum by (cluster, namespace, instance) (rate (cortex_ingest_storage_reader_fetch_errors_total}[5m])) + sum by (cluster, namespace, instance) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m])) / sum by (cluster, namespace, instance) (rate (cortex_ingest_storage_reader_fetches_total[5m])) > 0.1 diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index 97402bd970a..e181079a9b4 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -983,7 +983,7 @@ groups: }} is receiving fetch errors when reading records from Kafka. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh expr: | - sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetch_errors_total}[5m])) + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m])) / sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetches_total[5m])) > 0.1 diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet index fd915ee07bf..26d0d7b6e5d 100644 --- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet +++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet @@ -44,7 +44,7 @@ 'for': '15m', // See https://github.com/grafana/mimir/blob/24591ae56cd7d6ef24a7cc1541a41405676773f4/vendor/github.com/twmb/franz-go/pkg/kgo/record_and_fetch.go#L332-L366 for errors that can be reported here. expr: ||| - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate (cortex_ingest_storage_reader_fetch_errors_total}[5m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m])) / sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate (cortex_ingest_storage_reader_fetches_total[5m])) > 0.1 From 24cd11c4a7e4105f65fe08c176e1ad0a58a7a0ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 25 Mar 2024 13:19:13 +0100 Subject: [PATCH 09/19] Add links to ingest storage. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- docs/sources/mimir/manage/mimir-runbooks/_index.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md index fd98180d95b..6542bd8842c 100644 --- a/docs/sources/mimir/manage/mimir-runbooks/_index.md +++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md @@ -190,7 +190,7 @@ How to **investigate**: - Check the `Mimir / Writes` dashboard - Looking at the dashboard you should see in which Mimir service the high latency originates - - The panels in the dashboard are vertically sorted by the network path (eg. gateway -> distributor -> ingester). When using ingest-storage, network path changes to gateway -> distributor -> Kafka instead. + - The panels in the dashboard are vertically sorted by the network path (eg. gateway -> distributor -> ingester). When using [ingest-storage](#mimir-ingest-storage-experimental), network path changes to gateway -> distributor -> Kafka instead. - Deduce where in the stack the latency is being introduced - **`gateway`** - Latency may be caused by the time taken for the gateway to receive the entire request from the client. There are a multitude of reasons this can occur, so communication with the user may be necessary. For example: @@ -201,7 +201,7 @@ How to **investigate**: - There could be a problem with authentication (eg. slow to run auth layer) - **`distributor`** - Typically, distributor p99 latency is in the range 50-100ms. If the distributor latency is higher than this, you may need to scale up the distributors. - - When using Mimir ingest-storage, distributors are writing requests to Kafka-compatible backend. Increased latency in distributor may also come from this backend. 
+ - When using Mimir [ingest-storage](#mimir-ingest-storage-experimental), distributors are writing requests to Kafka-compatible backend. Increased latency in distributor may also come from this backend. - **`ingester`** - Typically, ingester p99 latency is in the range 5-50ms. If the ingester latency is higher than this, you should investigate the root cause before scaling up ingesters. - Check out the following alerts and fix them if firing: @@ -246,7 +246,7 @@ How to **investigate**: - If query sharding already enabled, consider increasing total number of query shards (`query_sharding_total_shards`) for tenants submitting slow queries, so their queries can be further parallelized - **`ingester`** - Check if ingesters are not overloaded. If they are and you can scale up ingesters vertically, that may be the best action. If that's not possible, scaling horizontally can help as well, but it can take several hours for ingesters to fully redistribute their series. - - When using ingest-storage, check ratio of queries using strong-consistency, and latency of queries using strong-consistency. + - When using [ingest-storage](#mimir-ingest-storage-experimental), check ratio of queries using strong-consistency, and latency of queries using strong-consistency. #### Alertmanager @@ -282,7 +282,7 @@ How to **investigate**: - If the failing service is crashing / panicking: look for the stack trace in the logs and investigate from there - If crashing service is query-frontend, querier or store-gateway, and you have "activity tracker" feature enabled, look for `found unfinished activities from previous run` message and subsequent `activity` messages in the log file to see which queries caused the crash. - When using Memberlist as KV store for hash rings, ensure that Memberlist is working correctly. See instructions for the [`MimirGossipMembersTooHigh`](#MimirGossipMembersTooHigh) and [`MimirGossipMembersTooLow`](#MimirGossipMembersTooLow) alerts. 
-- When using ingest-storage and distributors are failing to write requests to Kafka, make sure that Kafka is up and running correctly. +- When using [ingest-storage](#mimir-ingest-storage-experimental) and distributors are failing to write requests to Kafka, make sure that Kafka is up and running correctly. #### Alertmanager From 4bf495f32d14353f48715e533862d0ec6fb2fa22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 25 Mar 2024 13:22:21 +0100 Subject: [PATCH 10/19] Fix range. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- operations/mimir-mixin-compiled-baremetal/alerts.yaml | 2 +- operations/mimir-mixin-compiled/alerts.yaml | 2 +- operations/mimir-mixin/alerts/ingest-storage.libsonnet | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index bdf3b2e12b8..8eb31e31ee1 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -995,7 +995,7 @@ groups: from Kafka. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh expr: | - histogram_quantile(0.99, sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[$__rate_interval]))) > (10*60) + histogram_quantile(0.99, sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) > (10*60) for: 5m labels: severity: critical diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index e181079a9b4..e2a2f156cbb 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -1008,7 +1008,7 @@ groups: from Kafka. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh expr: | - histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[$__rate_interval]))) > (10*60) + histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) > (10*60) for: 5m labels: severity: critical diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet index 26d0d7b6e5d..0bd50d89f25 100644 --- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet +++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet @@ -76,7 +76,7 @@ alert: $.alertName('RunningIngesterReceiveDelayTooHigh'), 'for': '5m', expr: ||| - histogram_quantile(0.99, sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[$__rate_interval]))) > (10*60) + histogram_quantile(0.99, sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) > (10*60) ||| % $._config, labels: { severity: 'critical', From aad59a4287a5c3ecc036ffcc73561c92e3398776 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 25 Mar 2024 13:22:27 +0100 Subject: [PATCH 11/19] Fix helm alerts. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- .../metamonitoring/mixin-alerts.yaml | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml index 72ad38a6c72..2edcf051d44 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml @@ -978,6 +978,72 @@ spec: for: 15m labels: severity: critical + - alert: MimirIngesterFailedToReadRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is failing to read records from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailedtoreadrecordsfromkafka + expr: | + sum by(cluster, namespace, pod, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m])) + > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterKafkaFetchErrorsRateTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is receiving fetch errors when reading records from Kafka. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh + expr: | + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m])) + / + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetches_total[5m])) + > 0.1 + for: 15m + labels: + severity: critical + - alert: MimirStartingIngesterKafkaReceiveDelayIncreasing + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} in "starting" phase is not reducing consumption lag of write requests read + from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing + expr: | + deriv(histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0 + for: 5m + labels: + severity: warning + - alert: MimirRunningIngesterReceiveDelayTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} in "running" phase is too far behind in its consumption of write requests + from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh + expr: | + histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) > (10*60) + for: 5m + labels: + severity: critical + - alert: MimirIngesterFailsToProcessRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} fails to consume write requests read from Kafka due to internal errors. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka + expr: | + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m]) > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterFailsEnforceStrongConsistencyOnReadPath + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} fails to enforce strong-consistency on read-path. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath + expr: | + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0 + for: 5m + labels: + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites From 56c6c2175e9dc1c4cb1990d504c283c96bf13d6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 25 Mar 2024 13:45:11 +0100 Subject: [PATCH 12/19] Fix typos. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- docs/sources/mimir/manage/mimir-runbooks/_index.md | 6 +++--- .../templates/metamonitoring/mixin-alerts.yaml | 2 +- operations/mimir-mixin-compiled-baremetal/alerts.yaml | 2 +- operations/mimir-mixin-compiled/alerts.yaml | 2 +- operations/mimir-mixin/alerts/ingest-storage.libsonnet | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md index 6542bd8842c..646b0ef35eb 100644 --- a/docs/sources/mimir/manage/mimir-runbooks/_index.md +++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md @@ -190,7 +190,7 @@ How to **investigate**: - Check the `Mimir / Writes` dashboard - Looking at the dashboard you should see in which Mimir service the high latency originates - - The panels in the dashboard are vertically sorted by the network path (eg. gateway -> distributor -> ingester). When using [ingest-storage](#mimir-ingest-storage-experimental), network path changes to gateway -> distributor -> Kafka instead. + - The panels in the dashboard are vertically sorted by the network path (eg. gateway -> distributor -> ingester). When using [ingest-storage](#mimir-ingest-storage-experimental), network path changes to gateway -> distributor -> Kafka instead. - Deduce where in the stack the latency is being introduced - **`gateway`** - Latency may be caused by the time taken for the gateway to receive the entire request from the client. There are a multitude of reasons this can occur, so communication with the user may be necessary. For example: @@ -1340,7 +1340,7 @@ How it **works**: - Ingester connects to Kafka brokers and reads records from it. - When ingester fails to read more records from Kafka due to error, ingester logs such error. 
-- This can be normal if Kafka brokers are restarting, however if read errors continue for some time, alert is raised. +- This can be normal if Kafka brokers are restarting, however if read errors continue for some time, alert is raised. How to **investigate**: @@ -1384,7 +1384,7 @@ This alert fires when "receive delay" reported by ingester while it's running re How it **works**: -- After ingester start and catches up with records in Kafka, ingester switches to "running" mode. +- After ingester start and catches up with records in Kafka, ingester switches to "running" mode. - In running mode, ingester continues to process incoming samples from Kafka and continues to report "receive delay". See [`MimirStartingIngesterKafkaReceiveDelayIncreasing`](#MimirStartingIngesterKafkaReceiveDelayIncreasing) runbook for details about this metric. - Under normal conditions when ingester is running and it is processing records faster than records are appearing, receive delay should be stable. - If observed "receive delay" increases and reaches certain threshold, alert is raised. diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml index 2edcf051d44..fb9769c7c7a 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml @@ -1030,7 +1030,7 @@ spec: }} fails to consume write requests read from Kafka due to internal errors. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka expr: | - sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m]) > 0 + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0 for: 5m labels: severity: critical diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index 8eb31e31ee1..e438e9d32a2 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -1005,7 +1005,7 @@ groups: }} fails to consume write requests read from Kafka due to internal errors. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka expr: | - sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m]) > 0 + sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0 for: 5m labels: severity: critical diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index e2a2f156cbb..e78fded5b11 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -1018,7 +1018,7 @@ groups: }} fails to consume write requests read from Kafka due to internal errors. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka expr: | - sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m]) > 0 + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0 for: 5m labels: severity: critical diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet index 0bd50d89f25..de03a51ae3f 100644 --- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet +++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet @@ -90,7 +90,7 @@ alert: $.alertName('IngesterFailsToProcessRecordsFromKafka'), 'for': '5m', expr: ||| - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m]) > 0 + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0 ||| % $._config, labels: { severity: 'critical', From f41860bd13c017046ab93f5210a3373a0776e8f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 25 Mar 2024 13:51:50 +0100 Subject: [PATCH 13/19] Fix runbook name. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- docs/sources/mimir/manage/mimir-runbooks/_index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md index 646b0ef35eb..9325ccbaab2 100644 --- a/docs/sources/mimir/manage/mimir-runbooks/_index.md +++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md @@ -1332,7 +1332,7 @@ How to **investigate**: - Check ingester logs to find details about the error. - Check Kafka logs and health. 
-### MimirIngesterIngesterFailedToReadRecordsFromKafka +### MimirIngesterFailedToReadRecordsFromKafka This alert fires when an ingester is failing to read records from Kafka backend. From 2b8f2b7ec491e90cd9367334cb9bd32e26fe276d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 25 Mar 2024 14:21:32 +0100 Subject: [PATCH 14/19] Update changelog entry. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- CHANGELOG.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7539640db68..1cc77a6ad7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,9 +24,15 @@ * [ENHANCEMENT] Alerts: allow configuring alerts range interval via `_config.base_alerts_range_interval_minutes`. #7591 * [ENHANCEMENT] Dashboards: Add panels for monitoring distributor and ingester when using ingest-storage. These panels are disabled by default, but can be enabled using `show_ingest_storage_panels: true` config option. Similarly existing panels used when distributors and ingesters use gRPC for forwarding requests can be disabled by setting `show_grpc_ingestion_panels: false`. #7670 #7699 -* [ENHANCEMENT] Alerts: add the following alerts when using ingest-storage: #7699 +* [ENHANCEMENT] Alerts: add the following alerts when using ingest-storage: #7699 #7702 * `MimirIngesterLastConsumedOffsetCommitFailed` -* [BUGFIX] Dashobards: Fix regular expression for matching read-path gRPC ingester methods to include querying of exemplars, label-related queries, or active series queries. 
#7676 + * `MimirIngesterFailedToReadRecordsFromKafka` + * `MimirIngesterKafkaFetchErrorsRateTooHigh` + * `MimirStartingIngesterKafkaReceiveDelayIncreasing` + * `MimirRunningIngesterReceiveDelayTooHigh` + * `MimirIngesterFailsToProcessRecordsFromKafka` + * `MimirIngesterFailsEnforceStrongConsistencyOnReadPath` +* [BUGFIX] Dashboards: Fix regular expression for matching read-path gRPC ingester methods to include querying of exemplars, label-related queries, or active series queries. #7676 ### Jsonnet From 99be6d40f00b46aeaab64079f5966520cd1de38d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Tue, 26 Mar 2024 16:06:02 +0100 Subject: [PATCH 15/19] Apply suggestions from code review Co-authored-by: Marco Pracucci --- docs/sources/mimir/manage/mimir-runbooks/_index.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md index 9325ccbaab2..d8c3697d115 100644 --- a/docs/sources/mimir/manage/mimir-runbooks/_index.md +++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md @@ -1338,7 +1338,7 @@ This alert fires when an ingester is failing to read records from Kafka backend. How it **works**: -- Ingester connects to Kafka brokers and reads records from it. +- Ingester connects to Kafka brokers and reads records from it. Records contain write requests committed by distributors. - When ingester fails to read more records from Kafka due to error, ingester logs such error. - This can be normal if Kafka brokers are restarting, however if read errors continue for some time, alert is raised. @@ -1353,7 +1353,7 @@ This alert fires when an ingester is receiving errors instead of "fetches" from How it **works**: -- Ingester uses Kafka client to read records from Kafka. +- Ingester uses Kafka client to read records (containing write requests) from Kafka. - Kafka client can return errors instead of more records. 
- If rate of returned errors compared to returned records is too high, alert is raised. - Kafka client can return errors [documented in the source code](https://github.com/grafana/mimir/blob/24591ae56cd7d6ef24a7cc1541a41405676773f4/vendor/github.com/twmb/franz-go/pkg/kgo/record_and_fetch.go#L332-L366). @@ -1369,9 +1369,9 @@ This alert fires when "receive delay" reported by ingester during "starting" pha How it **works**: -- When ingester is starting, it needs to fetch and process records from Kafka until preconfigured consumption lag is honored. -- Each record has a timestamp when it was stored to Kafka. When ingester reads the record, it computes "receive delay" as a difference between current time (when record was read) and time when record was stored to Kafka. This receive delay is reported in metrics. -- Under normal conditions when ingester is processing records faster than records are appearing, receive delay should be decreasing. +- When ingester is starting, it needs to fetch and process records from Kafka until preconfigured consumption lag is honored. The maximum tolerated lag before an ingester is considered to have caught up reading from a partition at startup can be configured via `-ingest-storage.kafka.max-consumer-lag-at-startup`. +- Each record has a timestamp when it was sent to Kafka by the distributor. When ingester reads the record, it computes "receive delay" as a difference between current time (when record was read) and time when record was sent to Kafka. This receive delay is reported in the metric `cortex_ingest_storage_reader_receive_delay_seconds`. +- Under normal conditions when ingester is processing records faster than records are appearing, receive delay should be decreasing, until `-ingest-storage.kafka.max-consumer-lag-at-startup` is honored. - When ingester is starting, and observed "receive delay" is increasing, alert is raised. 
How to **investigate**: @@ -1385,8 +1385,8 @@ This alert fires when "receive delay" reported by ingester while it's running re How it **works**: - After ingester start and catches up with records in Kafka, ingester switches to "running" mode. -- In running mode, ingester continues to process incoming samples from Kafka and continues to report "receive delay". See [`MimirStartingIngesterKafkaReceiveDelayIncreasing`](#MimirStartingIngesterKafkaReceiveDelayIncreasing) runbook for details about this metric. -- Under normal conditions when ingester is running and it is processing records faster than records are appearing, receive delay should be stable. +- In running mode, ingester continues to process incoming records from Kafka and continues to report "receive delay". See [`MimirStartingIngesterKafkaReceiveDelayIncreasing`](#MimirStartingIngesterKafkaReceiveDelayIncreasing) runbook for details about this metric. +- Under normal conditions when ingester is running and it is processing records faster than records are appearing, receive delay should be stable and low. - If observed "receive delay" increases and reaches certain threshold, alert is raised. How to **investigate**: From c8b119a00cedd606e1f6c79da7b609ffded4bac7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Tue, 26 Mar 2024 18:02:38 +0100 Subject: [PATCH 16/19] Address review feedback. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- docs/sources/mimir/manage/mimir-runbooks/_index.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md index d8c3697d115..374790dc0ee 100644 --- a/docs/sources/mimir/manage/mimir-runbooks/_index.md +++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md @@ -1376,7 +1376,7 @@ How it **works**: How to **investigate**: -- Check if ingester is fast enough to process all data in Kafka. If not, configure ingesters to start with later offset instead. +- Check if ingester is fast enough to process all data in Kafka. ### MimirRunningIngesterReceiveDelayTooHigh @@ -1392,7 +1392,7 @@ How it **works**: How to **investigate**: - Check if ingester is fast enough to process all data in Kafka. -- If ingesters are too slow, consider scaling ingesters, either vertically (to make ingesters faster), or horizontally to spread incoming series between more ingesters. +- If ingesters are too slow, consider scaling ingesters horizontally to spread incoming series between more ingesters. ### MimirIngesterFailsToProcessRecordsFromKafka @@ -1401,7 +1401,7 @@ This alert fires when ingester is unable to process incoming records from Kafka How it **works**: - Ingester reads records from Kafka, and processes them locally. Processing means unmarshalling the data and handling write requests stored in records. -- Write requests can fail due to "user" or "server" errors. Typical user error is too low limit for number of series. Server error can be for example ingester hitting an instance limit. +- Write requests can fail due to "client" or "server" errors. An example of client error is too low limit for number of series. Server error can be for example ingester hitting an instance limit. 
- If requests keep failing due to server errors, this alert is raised. How to **investigate**: @@ -1420,9 +1420,8 @@ How it **works**: How to **investigate**: -- Check wait latency of requests with strong-consistency. +- Check wait latency of requests with strong-consistency on `Mimir / Queries` dashboard. - Check if ingester needs to process too many records, and whether ingesters need to be scaled up (vertically or horizontally). -- Consider increasing read-timeout of requests. ## Errors catalog From f87ebacde6eb5811b1c1c1d202a868b7f78ec1c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Tue, 26 Mar 2024 18:07:38 +0100 Subject: [PATCH 17/19] Refer to dashboard. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- docs/sources/mimir/manage/mimir-runbooks/_index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md index 374790dc0ee..414bd340f88 100644 --- a/docs/sources/mimir/manage/mimir-runbooks/_index.md +++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md @@ -1370,7 +1370,7 @@ This alert fires when "receive delay" reported by ingester during "starting" pha How it **works**: - When ingester is starting, it needs to fetch and process records from Kafka until preconfigured consumption lag is honored. The maximum tolerated lag before an ingester is considered to have caught up reading from a partition at startup can be configured via `-ingest-storage.kafka.max-consumer-lag-at-startup`. -- Each record has a timestamp when it was sent to Kafka by the distributor. When ingester reads the record, it computes "receive delay" as a difference between current time (when record was read) and time when record was sent to Kafka. This receive delay is reported in the metric `cortex_ingest_storage_reader_receive_delay_seconds`. 
+- Each record has a timestamp when it was sent to Kafka by the distributor. When ingester reads the record, it computes "receive delay" as a difference between current time (when record was read) and time when record was sent to Kafka. This receive delay is reported in the metric `cortex_ingest_storage_reader_receive_delay_seconds`. You can see receive delay on `Mimir / Writes` dashboard, in section "Ingester (ingest storage – end-to-end latency)". - Under normal conditions when ingester is processing records faster than records are appearing, receive delay should be decreasing, until `-ingest-storage.kafka.max-consumer-lag-at-startup` is honored. - When ingester is starting, and observed "receive delay" is increasing, alert is raised. From c91e214d4912a286ed06c626bb3a4beebf63ad8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Wed, 27 Mar 2024 08:54:28 +0100 Subject: [PATCH 18/19] Use average instead of p99 for StartingIngesterKafkaReceiveDelayIncreasing and RunningIngesterReceiveDelayTooHigh alerts. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- .../templates/metamonitoring/mixin-alerts.yaml | 12 ++++++++++-- .../mimir-mixin-compiled-baremetal/alerts.yaml | 12 ++++++++++-- operations/mimir-mixin-compiled/alerts.yaml | 12 ++++++++++-- .../mimir-mixin/alerts/ingest-storage.libsonnet | 12 ++++++++++-- 4 files changed, 40 insertions(+), 8 deletions(-) diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml index fb9769c7c7a..775f12e2003 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml @@ -1009,7 +1009,11 @@ spec: from Kafka. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing expr: | - deriv(histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0 + deriv(( + histogram_sum(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m]))) + / + histogram_count(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m]))) + )[5m:1m]) > 0 for: 5m labels: severity: warning @@ -1020,7 +1024,11 @@ spec: from Kafka. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh expr: | - histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) > (10*60) + ( + histogram_sum(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) + / + histogram_count(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) + ) > (10 * 60) for: 5m labels: severity: critical diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index e438e9d32a2..73b881dd39d 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -984,7 +984,11 @@ groups: from Kafka. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing expr: | - deriv(histogram_quantile(0.99, sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0 + deriv(( + histogram_sum(sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m]))) + / + histogram_count(sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m]))) + )[5m:1m]) > 0 for: 5m labels: severity: warning @@ -995,7 +999,11 @@ groups: from Kafka. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh expr: | - histogram_quantile(0.99, sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) > (10*60) + ( + histogram_sum(sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) + / + histogram_count(sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) + ) > (10 * 60) for: 5m labels: severity: critical diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index e78fded5b11..4c6870d4877 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -997,7 +997,11 @@ groups: from Kafka. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing expr: | - deriv(histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0 + deriv(( + histogram_sum(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m]))) + / + histogram_count(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m]))) + )[5m:1m]) > 0 for: 5m labels: severity: warning @@ -1008,7 +1012,11 @@ groups: from Kafka. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh expr: | - histogram_quantile(0.99, sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) > (10*60) + ( + histogram_sum(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) + / + histogram_count(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) + ) > (10 * 60) for: 5m labels: severity: critical diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet index de03a51ae3f..c8d1057534f 100644 --- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet +++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet @@ -62,7 +62,11 @@ alert: $.alertName('StartingIngesterKafkaReceiveDelayIncreasing'), 'for': '5m', expr: ||| - deriv(histogram_quantile(0.99, sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))[5m:1m]) > 0 + deriv(( + histogram_sum(sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m]))) + / + histogram_count(sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m]))) + )[5m:1m]) > 0 ||| % $._config, labels: { severity: 'warning', @@ -76,7 +80,11 @@ alert: $.alertName('RunningIngesterReceiveDelayTooHigh'), 'for': '5m', expr: ||| - histogram_quantile(0.99, sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) > (10*60) + ( + histogram_sum(sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) 
(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) + / + histogram_count(sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) + ) > (10 * 60) ||| % $._config, labels: { severity: 'critical', From fe281bd31d492bc5c015179aab1282afa44d5f71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Wed, 27 Mar 2024 09:49:58 +0100 Subject: [PATCH 19/19] Use series from classic histograms for StartingIngesterKafkaReceiveDelayIncreasing and RunningIngesterReceiveDelayTooHigh alerts, because mixtool can't handle native histogram functions. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- .../templates/metamonitoring/mixin-alerts.yaml | 8 ++++---- operations/mimir-mixin-compiled-baremetal/alerts.yaml | 8 ++++---- operations/mimir-mixin-compiled/alerts.yaml | 8 ++++---- operations/mimir-mixin/alerts/ingest-storage.libsonnet | 10 ++++++---- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml index 775f12e2003..48d0638ffc0 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml @@ -1010,9 +1010,9 @@ spec: runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing expr: | deriv(( - histogram_sum(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m]))) + sum by (cluster, namespace, pod) 
(rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m])) / - histogram_count(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m]))) + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m])) )[5m:1m]) > 0 for: 5m labels: @@ -1025,9 +1025,9 @@ spec: runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh expr: | ( - histogram_sum(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) / - histogram_count(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) ) > (10 * 60) for: 5m labels: diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index 73b881dd39d..12f92aea2e7 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -985,9 +985,9 @@ groups: runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing expr: | deriv(( - histogram_sum(sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m]))) + sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m])) / - histogram_count(sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m]))) + sum by (cluster, namespace, instance) 
(rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m])) )[5m:1m]) > 0 for: 5m labels: @@ -1000,9 +1000,9 @@ groups: runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh expr: | ( - histogram_sum(sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) + sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) / - histogram_count(sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) + sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) ) > (10 * 60) for: 5m labels: diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index 4c6870d4877..9fff3bc2888 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -998,9 +998,9 @@ groups: runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing expr: | deriv(( - histogram_sum(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m]))) + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m])) / - histogram_count(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m]))) + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m])) )[5m:1m]) > 0 for: 5m labels: @@ -1013,9 +1013,9 @@ groups: runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh expr: | ( - histogram_sum(sum by (cluster, namespace, 
pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) / - histogram_count(sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) ) > (10 * 60) for: 5m labels: diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet index c8d1057534f..81944eb040e 100644 --- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet +++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet @@ -61,11 +61,12 @@ { alert: $.alertName('StartingIngesterKafkaReceiveDelayIncreasing'), 'for': '5m', + // We're using series from classic histogram here, because mixtool lint doesn't support histogram_sum, histogram_count functions yet. expr: ||| deriv(( - histogram_sum(sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m]))) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m])) / - histogram_count(sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m]))) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m])) )[5m:1m]) > 0 ||| % $._config, labels: { @@ -79,11 +80,12 @@ { alert: $.alertName('RunningIngesterReceiveDelayTooHigh'), 'for': '5m', + // We're using series from classic histogram here, because mixtool lint doesn't support histogram_sum, histogram_count functions yet. 
expr: ||| ( - histogram_sum(sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) / - histogram_count(sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m]))) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) ) > (10 * 60) ||| % $._config, labels: {