Add user label to ingested samples metrics #1533

Merged · 5 commits · Apr 5, 2022
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@
- `-alertmanager.alertmanager-client.grpc-max-recv-msg-size` now defaults to 100 MiB (previously was not configurable and set to 16 MiB)
- `-alertmanager.alertmanager-client.grpc-max-send-msg-size` now defaults to 100 MiB (previously was not configurable and set to 4 MiB)
- `-alertmanager.max-recv-msg-size` now defaults to 100 MiB (previously was 16 MiB)
+* [CHANGE] Ingester: Add `user` label to metrics `cortex_ingester_ingested_samples_total` and `cortex_ingester_ingested_samples_failures_total`. #1533
* [FEATURE] Ruler: Allow setting `evaluation_delay` for each rule group via rules group configuration file. #1474
* [FEATURE] Distributor: Added the ability to forward specifics metrics to alternative remote_write API endpoints. #1052
* [ENHANCEMENT] Alertmanager API: Concurrency limit for GET requests is now configurable using `-alertmanager.max-concurrent-get-requests-per-tenant`. #1547
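To illustrate what the CHANGELOG entry above implies: adding a `user` label means the plain counters become counter vectors keyed by tenant. The metric definitions themselves are not part of this diff, so the sketch below is an assumption about their shape, using the standard client_golang API rather than Mimir's actual code; only the metric names and the `user` label come from the PR.

```go
package ingestermetrics

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// newSampleCounters is an illustrative sketch (helper and variable names
// are assumptions, not Mimir's source): each counter becomes a CounterVec
// keyed by the "user" label, so every tenant gets its own child series.
func newSampleCounters(r prometheus.Registerer) (ingested, failed *prometheus.CounterVec) {
	ingested = promauto.With(r).NewCounterVec(prometheus.CounterOpts{
		Name: "cortex_ingester_ingested_samples_total",
		Help: "The total number of samples ingested, per user.",
	}, []string{"user"})
	failed = promauto.With(r).NewCounterVec(prometheus.CounterOpts{
		Name: "cortex_ingester_ingested_samples_failures_total",
		Help: "The total number of samples that failed ingestion, per user.",
	}, []string{"user"})
	return ingested, failed
}
```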
18 changes: 9 additions & 9 deletions operations/mimir-mixin-compiled/alerts.yaml
@@ -264,7 +264,7 @@ groups:
message: |
Ingesters in {{ $labels.cluster }}/{{ $labels.namespace }} ingest too many samples per second.
expr: |
-avg by (cluster, namespace) (rate(cortex_ingester_ingested_samples_total[1m])) > 80e3
+avg by (cluster, namespace) (cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m) > 80e3
for: 15m
labels:
severity: warning
@@ -466,32 +466,32 @@ groups:
rules:
- alert: MimirIngesterHasNotShippedBlocks
annotations:
-message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
+message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
}} has not shipped any block in the last 4 hours.
expr: |
-(min by(cluster, namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
+(min by(cluster, namespace, pod) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
and
-(max by(cluster, namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
+(max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
and
# Only if the ingester has ingested samples over the last 4h.
-(max by(cluster, namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
+(max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
and
# Only if the ingester was ingesting samples 4h ago. This protects against the case where the ingester instance
# had ingested samples in the past, then no traffic was received for a long period, and then it starts
# receiving samples again. Without this check, the alert would fire as soon as it resumes receiving
# samples, while a block shipment is expected within the next 4h.
-(max by(cluster, namespace, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0)
+(max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0)
for: 15m
labels:
severity: critical
- alert: MimirIngesterHasNotShippedBlocksSinceStart
annotations:
-message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
+message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
}} has not shipped any block in the last 4 hours.
expr: |
-(max by(cluster, namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
+(max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
and
-(max by(cluster, namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
+(max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
for: 4h
labels:
severity: critical
5 changes: 5 additions & 0 deletions operations/mimir-mixin-compiled/rules.yaml
@@ -548,3 +548,8 @@ groups:
- expr: |
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))
record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m
+- name: mimir_ingester_rules
+  rules:
+  - expr: |
+      sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m]))
+    record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m
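The point of this new recording rule: now that the raw metric carries a `user` label, a per-pod rate would otherwise have to be computed across every tenant's series at each alert evaluation, so the alerts above read the pre-aggregated series instead. As a hedged sketch of how an operator might consume the rule's output (the endpoint address is a placeholder, not from this PR), a query via the standard Prometheus Go client could look like this:

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// Placeholder address; point this at whatever serves the rule's output.
	client, err := api.NewClient(api.Config{Address: "http://localhost:9090"})
	if err != nil {
		panic(err)
	}
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Read the pre-aggregated per-pod ingestion rate instead of
	// rate()-ing the raw per-user metric at query time.
	result, warnings, err := v1.NewAPI(client).Query(ctx,
		`avg by (cluster, namespace) (cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m)`,
		time.Now())
	if err != nil {
		panic(err)
	}
	if len(warnings) > 0 {
		fmt.Println("warnings:", warnings)
	}
	fmt.Println(result)
}
```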
4 changes: 2 additions & 2 deletions operations/mimir-mixin/alerts/alerts.libsonnet
@@ -435,8 +435,8 @@
alert: $.alertName('ProvisioningTooManyWrites'),
// 80k writes / s per ingester max.
expr: |||
-avg by (%s) (rate(cortex_ingester_ingested_samples_total[1m])) > 80e3
-||| % $._config.alert_aggregation_labels,
+avg by (%(alert_aggregation_labels)s) (cluster_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m) > 80e3
+||| % $._config,
'for': '15m',
labels: {
severity: 'warning',
16 changes: 8 additions & 8 deletions operations/mimir-mixin/alerts/blocks.libsonnet
@@ -9,24 +9,24 @@
alert: $.alertName('IngesterHasNotShippedBlocks'),
'for': '15m',
expr: |||
-(min by(%(alert_aggregation_labels)s, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
+(min by(%(alert_aggregation_labels)s, %(per_instance_label)s) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
and
-(max by(%(alert_aggregation_labels)s, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
+(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
and
# Only if the ingester has ingested samples over the last 4h.
-(max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
+(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(cluster_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
and
# Only if the ingester was ingesting samples 4h ago. This protects against the case where the ingester instance
# had ingested samples in the past, then no traffic was received for a long period, and then it starts
# receiving samples again. Without this check, the alert would fire as soon as it resumes receiving
# samples, while a block shipment is expected within the next 4h.
-(max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0)
+(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(cluster_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0)
||| % $._config,
labels: {
severity: 'critical',
},
annotations: {
-message: '%(product)s Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
+message: '%(product)s Ingester {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
},
},
{
@@ -35,15 +35,15 @@
alert: $.alertName('IngesterHasNotShippedBlocksSinceStart'),
'for': '4h',
expr: |||
-(max by(%(alert_aggregation_labels)s, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
+(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
and
-(max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
+(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(cluster_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
||| % $._config,
labels: {
severity: 'critical',
},
annotations: {
-message: '%(product)s Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
+message: '%(product)s Ingester {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
},
},
{
12 changes: 12 additions & 0 deletions operations/mimir-mixin/recording_rules.libsonnet
@@ -474,6 +474,18 @@ local utils = import 'mixin-utils/utils.libsonnet';
},
],
},
+{
+  name: 'mimir_ingester_rules',
+  rules: [
+    {
+      // cortex_ingester_ingested_samples_total is per user; this rule aggregates the sum per cluster/namespace/instance.
+      record: 'cluster_namespace_%s:cortex_ingester_ingested_samples_total:rate1m' % $._config.per_instance_label,
+      expr: |||
+        sum by(%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingester_ingested_samples_total[1m]))
+      ||| % $._config,
+    },
+  ],
+},
],
},
}
4 changes: 2 additions & 2 deletions pkg/ingester/ingester.go
@@ -775,8 +775,8 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteRequest
// Increment metrics only if the samples have been successfully committed.
// If the code didn't reach this point, it means that we returned an error
// which will be converted into an HTTP 5xx and the client should/will retry.
-i.metrics.ingestedSamples.Add(float64(succeededSamplesCount))
-i.metrics.ingestedSamplesFail.Add(float64(failedSamplesCount))
+i.metrics.ingestedSamples.WithLabelValues(userID).Add(float64(succeededSamplesCount))
+i.metrics.ingestedSamplesFail.WithLabelValues(userID).Add(float64(failedSamplesCount))
i.metrics.ingestedExemplars.Add(float64(succeededExemplarsCount))
i.metrics.ingestedExemplarsFail.Add(float64(failedExemplarsCount))

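One operational note on the ingester change above: a counter vector keyed by tenant grows one child series per user, so codebases that add per-user labels usually also delete a tenant's series when that tenant goes away. Whether this PR wires up such cleanup is not visible in this diff; the helper below is a hypothetical sketch of the standard client_golang call that would do it.

```go
package ingestermetrics

import "github.com/prometheus/client_golang/prometheus"

// deletePerUserSeries is a hypothetical cleanup helper, not code from this
// PR: dropping a departed tenant's child series keeps the new "user" label
// from leaking cardinality indefinitely.
func deletePerUserSeries(ingested, failed *prometheus.CounterVec, userID string) {
	ingested.DeleteLabelValues(userID) // no-op (returns false) if the series never existed
	failed.DeleteLabelValues(userID)
}
```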