diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a46fddfce9..20692c9073a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ * `MimirRunningIngesterReceiveDelayTooHigh` * `MimirIngesterFailsToProcessRecordsFromKafka` * `MimirIngesterFailsEnforceStrongConsistencyOnReadPath` +* [ENHANCEMENT] Dashboards: add in-flight queries scaling metric panel for ruler-querier. #7749 * [BUGFIX] Dashboards: Fix regular expression for matching read-path gRPC ingester methods to include querying of exemplars, label-related queries, or active series queries. #7676 * [BUGFIX] Dashboards: Fix user id abbreviations and column heads for Top Tenants dashboard. #7724 diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml index 54deb8bc60b..6de9e0aff24 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml @@ -24598,7 +24598,7 @@ data: "sort": "none" } }, - "span": 3, + "span": 6, "targets": [ { "expr": "max by (scaletargetref_name) (\n kube_horizontalpodautoscaler_spec_max_replicas{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"}\n # Add the scaletargetref_name label for readability\n + on (cluster, namespace, horizontalpodautoscaler) group_left (scaletargetref_name)\n 0*kube_horizontalpodautoscaler_info{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"}\n)\n", @@ -24624,7 +24624,7 @@ data: }, { "datasource": "$datasource", - "description": "### Scaling metric (CPU): Desired replicas\nThis panel shows the scaling metric exposed by KEDA divided by the target/threshold used.\nIt 
should represent the desired number of replicas, ignoring the min/max constraints applied later.\n\n", + "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler wouldn't work properly.\n\n", "fieldConfig": { "defaults": { "custom": { @@ -24659,7 +24659,68 @@ data: "sort": "none" } }, - "span": 3, + "span": 6, + "targets": [ + { + "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"} * 0,\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", + "format": "time_series", + "legendFormat": "{{scaler}} failures", + "legendLink": null + } + ], + "title": "Autoscaler failures rate", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Ruler-querier - autoscaling", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "datasource": "$datasource", + "description": "### Scaling metric (CPU): Desired replicas\nThis panel shows the scaling metric exposed by KEDA divided by the target/threshold used.\nIt should represent the desired number of replicas, ignoring the min/max constraints applied later.\n\n", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } 
+ }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 17, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, "targets": [ { "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*cpu.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", @@ -24697,7 +24758,7 @@ data: }, "overrides": [ ] }, - "id": 17, + "id": 18, "links": [ ], "options": { "legend": { @@ -24708,7 +24769,7 @@ data: "sort": "none" } }, - "span": 3, + "span": 4, "targets": [ { "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*memory.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", @@ -24722,7 +24783,7 @@ data: }, { "datasource": "$datasource", - "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. 
Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly.\n\n", + "description": "### Scaling metric (in-flight queries): Desired replicas\nThis panel shows the scaling metric exposed by KEDA divided by the target/threshold used.\nIt should represent the desired number of replicas, ignoring the min/max constraints applied later.\n\n", "fieldConfig": { "defaults": { "custom": { @@ -24746,7 +24807,7 @@ data: }, "overrides": [ ] }, - "id": 18, + "id": 19, "links": [ ], "options": { "legend": { @@ -24757,16 +24818,16 @@ data: "sort": "none" } }, - "span": 3, + "span": 4, "targets": [ { - "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"} * 0,\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", + "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*queries.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", "format": "time_series", - "legendFormat": "{{scaler}} failures", + "legendFormat": "{{ scaler }}", "legendLink": null } ], 
- "title": "Autoscaler failures rate", + "title": "Scaling metric (in-flight queries): Desired replicas", "type": "timeseries" } ], @@ -24774,7 +24835,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Ruler-Querier - autoscaling", + "title": "", "titleSize": "h6" } ], diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json index 9cb3fbabb50..dea6b507db8 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json @@ -1335,7 +1335,7 @@ "sort": "none" } }, - "span": 3, + "span": 6, "targets": [ { "expr": "max by (scaletargetref_name) (\n kube_horizontalpodautoscaler_spec_max_replicas{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"}\n # Add the scaletargetref_name label for readability\n + on (cluster, namespace, horizontalpodautoscaler) group_left (scaletargetref_name)\n 0*kube_horizontalpodautoscaler_info{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"}\n)\n", @@ -1361,7 +1361,7 @@ }, { "datasource": "$datasource", - "description": "### Scaling metric (CPU): Desired replicas\nThis panel shows the scaling metric exposed by KEDA divided by the target/threshold used.\nIt should represent the desired number of replicas, ignoring the min/max constraints applied later.\n\n", + "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. 
Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler wouldn't work properly.\n\n", "fieldConfig": { "defaults": { "custom": { @@ -1396,7 +1396,68 @@ "sort": "none" } }, - "span": 3, + "span": 6, + "targets": [ + { + "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"} * 0,\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", + "format": "time_series", + "legendFormat": "{{scaler}} failures", + "legendLink": null + } + ], + "title": "Autoscaler failures rate", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Ruler-querier - autoscaling", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "datasource": "$datasource", + "description": "### Scaling metric (CPU): Desired replicas\nThis panel shows the scaling metric exposed by KEDA divided by the target/threshold used.\nIt should represent the desired number of replicas, ignoring the min/max constraints applied later.\n\n", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 17, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + 
"tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, "targets": [ { "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*cpu.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", @@ -1434,7 +1495,7 @@ }, "overrides": [ ] }, - "id": 17, + "id": 18, "links": [ ], "options": { "legend": { @@ -1445,7 +1506,7 @@ "sort": "none" } }, - "span": 3, + "span": 4, "targets": [ { "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*memory.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", @@ -1459,7 +1520,7 @@ }, { "datasource": "$datasource", - "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. 
Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly.\n\n", + "description": "### Scaling metric (in-flight queries): Desired replicas\nThis panel shows the scaling metric exposed by KEDA divided by the target/threshold used.\nIt should represent the desired number of replicas, ignoring the min/max constraints applied later.\n\n", "fieldConfig": { "defaults": { "custom": { @@ -1483,7 +1544,7 @@ }, "overrides": [ ] }, - "id": 18, + "id": 19, "links": [ ], "options": { "legend": { @@ -1494,16 +1555,16 @@ "sort": "none" } }, - "span": 3, + "span": 4, "targets": [ { - "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"} * 0,\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", + "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*queries.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", "format": "time_series", - "legendFormat": "{{scaler}} failures", + "legendFormat": "{{ scaler }}", "legendLink": null } ], - "title": 
"Autoscaler failures rate", + "title": "Scaling metric (in-flight queries): Desired replicas", "type": "timeseries" } ], @@ -1511,7 +1572,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Ruler-Querier - autoscaling", + "title": "", "titleSize": "h6" } ], diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json b/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json index 07dd84baf75..0152e21f991 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json @@ -1335,7 +1335,7 @@ "sort": "none" } }, - "span": 3, + "span": 6, "targets": [ { "expr": "max by (scaletargetref_name) (\n kube_horizontalpodautoscaler_spec_max_replicas{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"}\n # Add the scaletargetref_name label for readability\n + on (cluster, namespace, horizontalpodautoscaler) group_left (scaletargetref_name)\n 0*kube_horizontalpodautoscaler_info{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"}\n)\n", @@ -1361,7 +1361,7 @@ }, { "datasource": "$datasource", - "description": "### Scaling metric (CPU): Desired replicas\nThis panel shows the scaling metric exposed by KEDA divided by the target/threshold used.\nIt should represent the desired number of replicas, ignoring the min/max constraints applied later.\n\n", + "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. 
Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler wouldn't work properly.\n\n", "fieldConfig": { "defaults": { "custom": { @@ -1396,7 +1396,68 @@ "sort": "none" } }, - "span": 3, + "span": 6, + "targets": [ + { + "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"} * 0,\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", + "format": "time_series", + "legendFormat": "{{scaler}} failures", + "legendLink": null + } + ], + "title": "Autoscaler failures rate", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Ruler-querier - autoscaling", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "datasource": "$datasource", + "description": "### Scaling metric (CPU): Desired replicas\nThis panel shows the scaling metric exposed by KEDA divided by the target/threshold used.\nIt should represent the desired number of replicas, ignoring the min/max constraints applied later.\n\n", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 17, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + 
"tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, "targets": [ { "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*cpu.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", @@ -1434,7 +1495,7 @@ }, "overrides": [ ] }, - "id": 17, + "id": 18, "links": [ ], "options": { "legend": { @@ -1445,7 +1506,7 @@ "sort": "none" } }, - "span": 3, + "span": 4, "targets": [ { "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*memory.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", @@ -1459,7 +1520,7 @@ }, { "datasource": "$datasource", - "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. 
Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly.\n\n", + "description": "### Scaling metric (in-flight queries): Desired replicas\nThis panel shows the scaling metric exposed by KEDA divided by the target/threshold used.\nIt should represent the desired number of replicas, ignoring the min/max constraints applied later.\n\n", "fieldConfig": { "defaults": { "custom": { @@ -1483,7 +1544,7 @@ }, "overrides": [ ] }, - "id": 18, + "id": 19, "links": [ ], "options": { "legend": { @@ -1494,16 +1555,16 @@ "sort": "none" } }, - "span": 3, + "span": 4, "targets": [ { - "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"} * 0,\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", + "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*queries.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", "format": "time_series", - "legendFormat": "{{scaler}} failures", + "legendFormat": "{{ scaler }}", "legendLink": null } ], - "title": 
"Autoscaler failures rate", + "title": "Scaling metric (in-flight queries): Desired replicas", "type": "timeseries" } ], @@ -1511,7 +1572,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Ruler-Querier - autoscaling", + "title": "", "titleSize": "h6" } ], diff --git a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet index 47a347cb1a2..90e76b5c19c 100644 --- a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -596,179 +596,159 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.latencyPanel('cortex_kv_request_duration_seconds', '{%s, kv_name=~"%s"}' % [$.jobMatcher($._config.job_names[jobName]), kvName]) ), - cpuAndMemoryBasedAutoScalingRow(componentTitle):: - local component = std.asciiLower(componentTitle); - local field = std.strReplace(component, '-', '_'); - super.row('%s - autoscaling' % [componentTitle]) - .addPanel( - local title = 'Replicas'; - $.timeseriesPanel(title) + - $.queryPanel( - [ - ||| - max by (scaletargetref_name) ( - kube_horizontalpodautoscaler_spec_max_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} - # Add the scaletargetref_name label for readability - + on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name) - 0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} - ) - ||| % { - namespace_matcher: $.namespaceMatcher(), - hpa_name: $._config.autoscaling[field].hpa_name, - cluster_labels: std.join(', ', $._config.cluster_labels), - }, - ||| - max by (scaletargetref_name) ( - kube_horizontalpodautoscaler_status_current_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} - # HPA doesn't go to 0 replicas, so we multiply by 0 if the HPA is not active - * on (%(cluster_labels)s, horizontalpodautoscaler) - 
kube_horizontalpodautoscaler_status_condition{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s", condition="ScalingActive", status="true"} - # Add the scaletargetref_name label for readability - + on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name) - 0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} - ) - ||| % { - namespace_matcher: $.namespaceMatcher(), - hpa_name: $._config.autoscaling[field].hpa_name, - cluster_labels: std.join(', ', $._config.cluster_labels), - }, - ||| - max by (scaletargetref_name) ( - kube_horizontalpodautoscaler_spec_min_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} - # Add the scaletargetref_name label for readability - + on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name) - 0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} - ) - ||| % { - namespace_matcher: $.namespaceMatcher(), - hpa_name: $._config.autoscaling[field].hpa_name, - cluster_labels: std.join(', ', $._config.cluster_labels), - }, - ], - [ - 'Max {{ scaletargetref_name }}', - 'Current {{ scaletargetref_name }}', - 'Min {{ scaletargetref_name }}', - ], - ) + - $.panelDescription( - title, + // The provided componentName should be the name of a component among the ones defined in $._config.autoscaling. + autoScalingActualReplicas(componentName):: + local title = 'Replicas'; + local componentTitle = std.strReplace(componentName, '_', '-'); + + $.timeseriesPanel(title) + + $.queryPanel( + [ ||| - The maximum and current number of %s replicas. - Note: The current number of replicas can still show 1 replica even when scaled to 0. - Because HPA never reports 0 replicas, the query will report 0 only if the HPA is not active. 
- ||| % [component] - ) + - { - fieldConfig+: { - overrides: [ - $.overrideField('byRegexp', '/Max .+/', [ - $.overrideProperty('custom.fillOpacity', 0), - $.overrideProperty('custom.lineStyle', { fill: 'dash' }), - ]), - $.overrideField('byRegexp', '/Current .+/', [ - $.overrideProperty('custom.fillOpacity', 0), - ]), - $.overrideField('byRegexp', '/Min .+/', [ - $.overrideProperty('custom.fillOpacity', 0), - $.overrideProperty('custom.lineStyle', { fill: 'dash' }), - ]), - ], + max by (scaletargetref_name) ( + kube_horizontalpodautoscaler_spec_max_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} + # Add the scaletargetref_name label for readability + + on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name) + 0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} + ) + ||| % { + namespace_matcher: $.namespaceMatcher(), + hpa_name: $._config.autoscaling[componentName].hpa_name, + cluster_labels: std.join(', ', $._config.cluster_labels), + }, + ||| + max by (scaletargetref_name) ( + kube_horizontalpodautoscaler_status_current_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} + # HPA doesn't go to 0 replicas, so we multiply by 0 if the HPA is not active + * on (%(cluster_labels)s, horizontalpodautoscaler) + kube_horizontalpodautoscaler_status_condition{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s", condition="ScalingActive", status="true"} + # Add the scaletargetref_name label for readability + + on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name) + 0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} + ) + ||| % { + namespace_matcher: $.namespaceMatcher(), + hpa_name: $._config.autoscaling[componentName].hpa_name, + cluster_labels: std.join(', ', $._config.cluster_labels), + }, + ||| + max by (scaletargetref_name) ( + 
kube_horizontalpodautoscaler_spec_min_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} + # Add the scaletargetref_name label for readability + + on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name) + 0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} + ) + ||| % { + namespace_matcher: $.namespaceMatcher(), + hpa_name: $._config.autoscaling[componentName].hpa_name, + cluster_labels: std.join(', ', $._config.cluster_labels), }, + ], + [ + 'Max {{ scaletargetref_name }}', + 'Current {{ scaletargetref_name }}', + 'Min {{ scaletargetref_name }}', + ], + ) + + $.panelDescription( + title, + ||| + The maximum and current number of %s replicas. + Note: The current number of replicas can still show 1 replica even when scaled to 0. + Because HPA never reports 0 replicas, the query will report 0 only if the HPA is not active. + ||| % [componentTitle] + ) + + { + fieldConfig+: { + overrides: [ + $.overrideField('byRegexp', '/Max .+/', [ + $.overrideProperty('custom.fillOpacity', 0), + $.overrideProperty('custom.lineStyle', { fill: 'dash' }), + ]), + $.overrideField('byRegexp', '/Current .+/', [ + $.overrideProperty('custom.fillOpacity', 0), + ]), + $.overrideField('byRegexp', '/Min .+/', [ + $.overrideProperty('custom.fillOpacity', 0), + $.overrideProperty('custom.lineStyle', { fill: 'dash' }), + ]), + ], }, - ) - .addPanel( - local title = 'Scaling metric (CPU): Desired replicas'; - $.timeseriesPanel(title) + - $.queryPanel( - [ - ||| - sum by (scaler) ( + }, + + // The provided componentName should be the name of a component among the ones defined in $._config.autoscaling. 
+ autoScalingDesiredReplicasByScalingMetricPanel(componentName, scalingMetricName, scalingMetricID):: + local title = 'Scaling metric (%s): Desired replicas' % scalingMetricName; + + $.timeseriesPanel(title) + + $.queryPanel( + [ + ||| + sum by (scaler) ( + label_replace( + keda_scaler_metrics_value{%(cluster_label)s=~"$cluster", exported_namespace=~"$namespace", scaler=~".*%(scaling_metric_id)s.*"}, + "namespace", "$1", "exported_namespace", "(.*)" + ) + / + on(%(aggregation_labels)s, scaledObject, metric) group_left label_replace( label_replace( - keda_scaler_metrics_value{%(cluster_label)s=~"$cluster", exported_namespace=~"$namespace", scaler=~".*cpu.*"}, - "namespace", "$1", "exported_namespace", "(.*)" - ) - / - on(%(aggregation_labels)s, scaledObject, metric) group_left label_replace( - label_replace( - kube_horizontalpodautoscaler_spec_target_metric{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"}, - "metric", "$1", "metric_name", "(.+)" - ), - "scaledObject", "$1", "horizontalpodautoscaler", "%(hpa_prefix)s(.*)" - ) + kube_horizontalpodautoscaler_spec_target_metric{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"}, + "metric", "$1", "metric_name", "(.+)" + ), + "scaledObject", "$1", "horizontalpodautoscaler", "%(hpa_prefix)s(.*)" ) - ||| % { - aggregation_labels: $._config.alert_aggregation_labels, - cluster_label: $._config.per_cluster_label, - hpa_prefix: $._config.autoscaling_hpa_prefix, - hpa_name: $._config.autoscaling[field].hpa_name, - namespace: $.namespaceMatcher(), - }, - ], [ - '{{ scaler }}', - ] - ) + - $.panelDescription( - title, - ||| - This panel shows the scaling metric exposed by KEDA divided by the target/threshold used. - It should represent the desired number of replicas, ignoring the min/max constraints applied later. 
- ||| - ), + ) + ||| % { + aggregation_labels: $._config.alert_aggregation_labels, + cluster_label: $._config.per_cluster_label, + hpa_prefix: $._config.autoscaling_hpa_prefix, + hpa_name: $._config.autoscaling[componentName].hpa_name, + namespace: $.namespaceMatcher(), + scaling_metric_id: scalingMetricID, + }, + ], [ + '{{ scaler }}', + ] + ) + + $.panelDescription( + title, + ||| + This panel shows the scaling metric exposed by KEDA divided by the target/threshold used. + It should represent the desired number of replicas, ignoring the min/max constraints applied later. + ||| + ), + + // The provided componentName should be the name of a component among the ones defined in $._config.autoscaling. + autoScalingFailuresPanel(componentName):: + local title = 'Autoscaler failures rate'; + + $.timeseriesPanel(title) + + $.queryPanel( + $.filterKedaScalerErrorsByHPA($._config.autoscaling[componentName].hpa_name), + '{{scaler}} failures' + ) + + $.panelDescription( + title, + ||| + The rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom + metrics server is unable to query the scaling metric from Prometheus so the autoscaler wouldn't work properly. 
+ ||| + ), + + cpuAndMemoryBasedAutoScalingRow(componentTitle):: + local componentName = std.strReplace(std.asciiLower(componentTitle), '-', '_'); + super.row('%s - autoscaling' % [componentTitle]) + .addPanel( + $.autoScalingActualReplicas(componentName) ) .addPanel( - local title = 'Scaling metric (memory): Desired replicas'; - $.timeseriesPanel(title) + - $.queryPanel( - [ - ||| - sum by (scaler) ( - label_replace( - keda_scaler_metrics_value{%(cluster_label)s=~"$cluster", exported_namespace=~"$namespace", scaler=~".*memory.*"}, - "namespace", "$1", "exported_namespace", "(.*)" - ) - / - on(%(aggregation_labels)s, scaledObject, metric) group_left label_replace( - label_replace( - kube_horizontalpodautoscaler_spec_target_metric{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"}, - "metric", "$1", "metric_name", "(.+)" - ), - "scaledObject", "$1", "horizontalpodautoscaler", "%(hpa_prefix)s(.*)" - ) - ) - ||| % { - aggregation_labels: $._config.alert_aggregation_labels, - cluster_label: $._config.per_cluster_label, - hpa_prefix: $._config.autoscaling_hpa_prefix, - hpa_name: $._config.autoscaling[field].hpa_name, - namespace: $.namespaceMatcher(), - }, - ], [ - '{{ scaler }}', - ] - ) + - $.panelDescription( - title, - ||| - This panel shows the scaling metric exposed by KEDA divided by the target/threshold used. - It should represent the desired number of replicas, ignoring the min/max constraints applied later. - ||| - ), + $.autoScalingDesiredReplicasByScalingMetricPanel(componentName, 'CPU', 'cpu') ) .addPanel( - local title = 'Autoscaler failures rate'; - $.timeseriesPanel(title) + - $.queryPanel( - $.filterKedaScalerErrorsByHPA($._config.autoscaling[field].hpa_name), - '{{scaler}} failures' - ) + - $.panelDescription( - title, - ||| - The rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom - metrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly. 
- ||| - ), + $.autoScalingDesiredReplicasByScalingMetricPanel(componentName, 'memory', 'memory') + ) + .addPanel( + $.autoScalingFailuresPanel(componentName) ), newStatPanel(queries, legends='', unit='percentunit', decimals=1, thresholds=[], instant=false, novalue=''):: diff --git a/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet b/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet index df1f48f4aa8..cce695137f9 100644 --- a/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet +++ b/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet @@ -187,6 +187,25 @@ local filename = 'mimir-remote-ruler-reads.json'; ) .addRowIf( $._config.autoscaling.ruler_querier.enabled, - $.cpuAndMemoryBasedAutoScalingRow('Ruler-Querier'), + $.row('Ruler-querier - autoscaling') + .addPanel( + $.autoScalingActualReplicas('ruler_querier') + ) + .addPanel( + $.autoScalingFailuresPanel('ruler_querier') + ) + ) + .addRowIf( + $._config.autoscaling.ruler_querier.enabled, + $.row('') + .addPanel( + $.autoScalingDesiredReplicasByScalingMetricPanel('ruler_querier', 'CPU', 'cpu') + ) + .addPanel( + $.autoScalingDesiredReplicasByScalingMetricPanel('ruler_querier', 'memory', 'memory') + ) + .addPanel( + $.autoScalingDesiredReplicasByScalingMetricPanel('ruler_querier', 'in-flight queries', 'queries') + ) ), }