From be759725301da7dba35d603692a5ee70f571b147 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Tue, 5 Apr 2022 09:51:45 +0200 Subject: [PATCH] Mixin: Fix "Failed evaluations rate" panel on Tenants dashboard. Use the correct metric (cortex_prometheus_rule_evaluation_failures_total instead of cortex_prometheus_rule_group_rules), and only show series which actually have failures (> 0), making the panel easier to read when there are many rule groups. --- CHANGELOG.md | 1 + operations/mimir-mixin-compiled/dashboards/mimir-tenants.json | 2 +- operations/mimir-mixin/dashboards/tenants.libsonnet | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 61f995692f7..e6ee6a72552 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,7 @@ * [CHANGE] Dashboards: Remove per-user series legends from Tenants dashboard. #1605 * [CHANGE] Dashboards: Show in-memory series and the per-user series limit on Tenants dashboard. #1613 +* [BUGFIX] Dashboards: Fix "Failed evaluations rate" panel on Tenants dashboard. #1629 ### Jsonnet diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-tenants.json b/operations/mimir-mixin-compiled/dashboards/mimir-tenants.json index bbc8faf0c3f..1c01242dc79 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-tenants.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-tenants.json @@ -1439,7 +1439,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (rule_group) (rate(cortex_prometheus_rule_group_rules{cluster=~\"$cluster\", job=~\"($namespace)/((ruler|cortex|mimir))\", user=\"$user\"}[$__rate_interval]))", + "expr": "sum by (rule_group) (rate(cortex_prometheus_rule_evaluation_failures_total{cluster=~\"$cluster\", job=~\"($namespace)/((ruler|cortex|mimir))\", user=\"$user\"}[$__rate_interval])) > 0", "format": "time_series", "interval": "15s", "intervalFactor": 2, diff --git a/operations/mimir-mixin/dashboards/tenants.libsonnet b/operations/mimir-mixin/dashboards/tenants.libsonnet index 9e2daa1aa16..e93e41b6ef5 100644 --- 
a/operations/mimir-mixin/dashboards/tenants.libsonnet +++ b/operations/mimir-mixin/dashboards/tenants.libsonnet @@ -400,7 +400,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; local title = 'Failed evaluations rate'; $.panel(title) + $.queryPanel( - 'sum by (rule_group) (rate(cortex_prometheus_rule_group_rules{%(job)s, user="$user"}[$__rate_interval]))' + 'sum by (rule_group) (rate(cortex_prometheus_rule_evaluation_failures_total{%(job)s, user="$user"}[$__rate_interval])) > 0' % { job: $.jobMatcher($._config.job_names.ruler) }, '{{ rule_group }}', ) + { stack: true },