From e3f5cac9bcc2466f7b68f82d33cf08491cf1d19a Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 26 Jul 2023 16:26:55 +0200 Subject: [PATCH] Remove MimirProvisioningTooManyActiveSeries alert Signed-off-by: Marco Pracucci --- CHANGELOG.md | 1 + .../mimir/manage/mimir-runbooks/_index.md | 16 +--------------- .../templates/metamonitoring/mixin-alerts.yaml | 10 ---------- .../mimir-mixin-compiled-baremetal/alerts.yaml | 10 ---------- operations/mimir-mixin-compiled/alerts.yaml | 10 ---------- operations/mimir-mixin/alerts/alerts.libsonnet | 18 ------------------ 6 files changed, 2 insertions(+), 63 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b971e3137d2..549a181144f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -74,6 +74,7 @@ * [CHANGE] Dashboards: removed "Query results cache misses" panel on the "Mimir / Queries" dashboard. #5423 * [CHANGE] Dashboards: default to shared crosshair on all dashboards. #5489 * [CHANGE] Dashboards: sort variable drop-down lists from A to Z, rather than Z to A. #5490 +* [CHANGE] Alerts: removed `MimirProvisioningTooManyActiveSeries` alert. You should configure `-ingester.instance-limits.max-series` and rely on `MimirIngesterReachingSeriesLimit` alert instead. #5593 * [ENHANCEMENT] Dashboards: adjust layout of "rollout progress" dashboard panels so that the "rollout progress" panel doesn't require scrolling. #5113 * [ENHANCEMENT] Dashboards: show container name first in "pods count per version" panel on "rollout progress" dashboard. #5113 * [ENHANCEMENT] Dashboards: show time spend waiting for turn when lazy loading index headers in the "index-header lazy load gate latency" panel on the "queries" dashboard. #5313 diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md index 5d1620a1f47..ac1b3d2fa13 100644 --- a/docs/sources/mimir/manage/mimir-runbooks/_index.md +++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md @@ -204,7 +204,7 @@ How to **investigate**: - **`ingester`** - Typically, ingester p99 latency is in the range 5-50ms. If the ingester latency is higher than this, you should investigate the root cause before scaling up ingesters. - Check out the following alerts and fix them if firing: - - `MimirProvisioningTooManyActiveSeries` + - `MimirIngesterReachingSeriesLimit` - `MimirProvisioningTooManyWrites` #### Read Latency @@ -776,20 +776,6 @@ How to **investigate**: - `other` - Check both Mimir and cache logs to find more details -### MimirProvisioningTooManyActiveSeries - -This alert fires if the average number of in-memory series per ingester is above our target (1.5M). - -How to **fix** it: - -- Scale up ingesters - - To find out the Mimir clusters where ingesters should be scaled up and how many minimum replicas are expected: - ``` - ceil(sum by(cluster, namespace) (cortex_ingester_memory_series) / 1.5e6) > - count by(cluster, namespace) (cortex_ingester_memory_series) - ``` -- After the scale up, the in-memory series are expected to be reduced at the next TSDB head compaction (occurring every 2h) - ### MimirProvisioningTooManyWrites This alert fires if the average number of samples ingested / sec in ingesters is above our target. diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml index b187822742c..6fd92bd6a2a 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml @@ -364,16 +364,6 @@ spec: severity: critical - name: mimir-provisioning rules: - - alert: MimirProvisioningTooManyActiveSeries - annotations: - message: | - The number of in-memory series per ingester in {{ $labels.cluster }}/{{ $labels.namespace }} is too high. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirprovisioningtoomanyactiveseries - expr: | - avg by (cluster, namespace) (cortex_ingester_memory_series) > 1.6e6 - for: 2h - labels: - severity: warning - alert: MimirProvisioningTooManyWrites annotations: message: | diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index c937f8b57ce..1584b6e3f99 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -352,16 +352,6 @@ groups: severity: critical - name: mimir-provisioning rules: - - alert: MimirProvisioningTooManyActiveSeries - annotations: - message: | - The number of in-memory series per ingester in {{ $labels.cluster }}/{{ $labels.namespace }} is too high. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirprovisioningtoomanyactiveseries - expr: | - avg by (cluster, namespace) (cortex_ingester_memory_series) > 1.6e6 - for: 2h - labels: - severity: warning - alert: MimirProvisioningTooManyWrites annotations: message: | diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index 5496d5d4ea2..41107aa72c9 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -352,16 +352,6 @@ groups: severity: critical - name: mimir-provisioning rules: - - alert: MimirProvisioningTooManyActiveSeries - annotations: - message: | - The number of in-memory series per ingester in {{ $labels.cluster }}/{{ $labels.namespace }} is too high. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirprovisioningtoomanyactiveseries - expr: | - avg by (cluster, namespace) (cortex_ingester_memory_series) > 1.6e6 - for: 2h - labels: - severity: warning - alert: MimirProvisioningTooManyWrites annotations: message: | diff --git a/operations/mimir-mixin/alerts/alerts.libsonnet b/operations/mimir-mixin/alerts/alerts.libsonnet index 6b52403406d..ea103d2a22f 100644 --- a/operations/mimir-mixin/alerts/alerts.libsonnet +++ b/operations/mimir-mixin/alerts/alerts.libsonnet @@ -538,24 +538,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; { name: 'mimir-provisioning', rules: [ - { - alert: $.alertName('ProvisioningTooManyActiveSeries'), - // We target each ingester to 1.5M in-memory series. This alert fires if the average - // number of series / ingester in a Mimir cluster is > 1.6M for 2h (we compact - // the TSDB head every 2h). - expr: ||| - avg by (%s) (cortex_ingester_memory_series) > 1.6e6 - ||| % [$._config.alert_aggregation_labels], - 'for': '2h', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - The number of in-memory series per ingester in %(alert_aggregation_variables)s is too high. - ||| % $._config, - }, - }, { alert: $.alertName('ProvisioningTooManyWrites'), // 80k writes / s per ingester max.