Add user label to ingested samples metrics (#1533)
* ingester: Add user label to cortex_ingester_ingested_samples_*total
* operations/mimir-mixin/alerts: Add ingester recording rule

Co-authored-by: Marco Pracucci <marco@pracucci.com>
Signed-off-by: Arve Knudsen <arve.knudsen@gmail.com>
aknuds1 and pracucci committed Apr 5, 2022
1 parent 7c6e1ba commit adc4626
Showing 9 changed files with 87 additions and 71 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@
- `-alertmanager.alertmanager-client.grpc-max-recv-msg-size` now defaults to 100 MiB (previously was not configurable and set to 16 MiB)
- `-alertmanager.alertmanager-client.grpc-max-send-msg-size` now defaults to 100 MiB (previously was not configurable and set to 4 MiB)
- `-alertmanager.max-recv-msg-size` now defaults to 100 MiB (previously was 16 MiB)
+ * [CHANGE] Ingester: Add `user` label to metrics `cortex_ingester_ingested_samples_total` and `cortex_ingester_ingested_samples_failures_total`. #1533
* [FEATURE] Ruler: Allow setting `evaluation_delay` for each rule group via rules group configuration file. #1474
* [FEATURE] Distributor: Added the ability to forward specific metrics to alternative remote_write API endpoints. #1052
* [ENHANCEMENT] Alertmanager API: Concurrency limit for GET requests is now configurable using `-alertmanager.max-concurrent-get-requests-per-tenant`. #1547
18 changes: 9 additions & 9 deletions operations/mimir-mixin-compiled/alerts.yaml
@@ -264,7 +264,7 @@ groups:
message: |
Ingesters in {{ $labels.cluster }}/{{ $labels.namespace }} ingest too many samples per second.
expr: |
- avg by (cluster, namespace) (rate(cortex_ingester_ingested_samples_total[1m])) > 80e3
+ avg by (cluster, namespace) (cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m) > 80e3
for: 15m
labels:
severity: warning
@@ -466,32 +466,32 @@ groups:
rules:
- alert: MimirIngesterHasNotShippedBlocks
annotations:
- message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
+ message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
}} has not shipped any block in the last 4 hours.
expr: |
- (min by(cluster, namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
+ (min by(cluster, namespace, pod) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
and
- (max by(cluster, namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
+ (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
and
# Only if the ingester has ingested samples over the last 4h.
- (max by(cluster, namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
+ (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
and
# Only if the ingester was ingesting samples 4h ago. This protects against the case where the ingester instance
# had ingested samples in the past, then received no traffic for a long period, and then started
# receiving samples again. Without this check, the alert would fire as soon as the instance resumed
# receiving samples, even though a block shipment is only expected within the next 4h.
- (max by(cluster, namespace, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0)
+ (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0)
for: 15m
labels:
severity: critical
- alert: MimirIngesterHasNotShippedBlocksSinceStart
annotations:
- message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
+ message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
}} has not shipped any block in the last 4 hours.
expr: |
- (max by(cluster, namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
+ (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
and
- (max by(cluster, namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
+ (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
for: 4h
labels:
severity: critical
5 changes: 5 additions & 0 deletions operations/mimir-mixin-compiled/rules.yaml
@@ -548,3 +548,8 @@ groups:
- expr: |
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))
record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m
+ - name: mimir_ingester_rules
+   rules:
+   - expr: |
+       sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m]))
+     record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m
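With the `user` label on the raw counter and the pod-level recording rule above, per-tenant and per-pod ingestion rates become straightforward to query. The following Go sketch runs both queries through the Prometheus HTTP API client; the Prometheus address is a placeholder and the query windows are illustrative assumptions, not part of this commit.

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// Placeholder address; point this at the Prometheus/Mimir instance scraping the ingesters.
	client, err := api.NewClient(api.Config{Address: "http://prometheus.example:9090"})
	if err != nil {
		panic(err)
	}
	promAPI := v1.NewAPI(client)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	queries := []string{
		// Per-tenant ingestion rate, enabled by the new `user` label.
		`sum by (user) (rate(cortex_ingester_ingested_samples_total[5m]))`,
		// Per-pod ingestion rate via the recording rule added in this commit.
		`cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m`,
	}

	for _, q := range queries {
		result, warnings, err := promAPI.Query(ctx, q, time.Now())
		if err != nil {
			panic(err)
		}
		if len(warnings) > 0 {
			fmt.Println("warnings:", warnings)
		}
		fmt.Printf("%s =>\n%v\n\n", q, result)
	}
}
```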
4 changes: 2 additions & 2 deletions operations/mimir-mixin/alerts/alerts.libsonnet
@@ -435,8 +435,8 @@
alert: $.alertName('ProvisioningTooManyWrites'),
// 80k writes / s per ingester max.
expr: |||
- avg by (%s) (rate(cortex_ingester_ingested_samples_total[1m])) > 80e3
- ||| % $._config.alert_aggregation_labels,
+ avg by (%(alert_aggregation_labels)s) (cluster_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m) > 80e3
+ ||| % $._config,
'for': '15m',
labels: {
severity: 'warning',
16 changes: 8 additions & 8 deletions operations/mimir-mixin/alerts/blocks.libsonnet
@@ -9,24 +9,24 @@
alert: $.alertName('IngesterHasNotShippedBlocks'),
'for': '15m',
expr: |||
- (min by(%(alert_aggregation_labels)s, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
+ (min by(%(alert_aggregation_labels)s, %(per_instance_label)s) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
and
- (max by(%(alert_aggregation_labels)s, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
+ (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
and
# Only if the ingester has ingested samples over the last 4h.
- (max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
+ (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(cluster_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
and
# Only if the ingester was ingesting samples 4h ago. This protects against the case where the ingester instance
# had ingested samples in the past, then received no traffic for a long period, and then started
# receiving samples again. Without this check, the alert would fire as soon as the instance resumed
# receiving samples, even though a block shipment is only expected within the next 4h.
- (max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0)
+ (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(cluster_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0)
||| % $._config,
labels: {
severity: 'critical',
},
annotations: {
- message: '%(product)s Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
+ message: '%(product)s Ingester {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
},
},
{
@@ -35,15 +35,15 @@
alert: $.alertName('IngesterHasNotShippedBlocksSinceStart'),
'for': '4h',
expr: |||
- (max by(%(alert_aggregation_labels)s, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
+ (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
and
- (max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
+ (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(cluster_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
||| % $._config,
labels: {
severity: 'critical',
},
annotations: {
- message: '%(product)s Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
+ message: '%(product)s Ingester {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
},
},
{
12 changes: 12 additions & 0 deletions operations/mimir-mixin/recording_rules.libsonnet
@@ -474,6 +474,18 @@ local utils = import 'mixin-utils/utils.libsonnet';
},
],
},
+ {
+   name: 'mimir_ingester_rules',
+   rules: [
+     {
+       // cortex_ingester_ingested_samples_total is per user; this rule records the sum per cluster/namespace/instance.
+       record: 'cluster_namespace_%s:cortex_ingester_ingested_samples_total:rate1m' % $._config.per_instance_label,
+       expr: |||
+         sum by(%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingester_ingested_samples_total[1m]))
+       ||| % $._config,
+     },
+   ],
+ },
],
},
}
4 changes: 2 additions & 2 deletions pkg/ingester/ingester.go
@@ -775,8 +775,8 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques
// Increment metrics only if the samples have been successfully committed.
// If the code didn't reach this point, it means that we returned an error
// which will be converted into an HTTP 5xx and the client should/will retry.
- i.metrics.ingestedSamples.Add(float64(succeededSamplesCount))
- i.metrics.ingestedSamplesFail.Add(float64(failedSamplesCount))
+ i.metrics.ingestedSamples.WithLabelValues(userID).Add(float64(succeededSamplesCount))
+ i.metrics.ingestedSamplesFail.WithLabelValues(userID).Add(float64(failedSamplesCount))
i.metrics.ingestedExemplars.Add(float64(succeededExemplarsCount))
i.metrics.ingestedExemplarsFail.Add(float64(failedExemplarsCount))
