Skip to content

Commit

Permalink
Use parameterized per instance label
Browse files Browse the repository at this point in the history
Signed-off-by: Arve Knudsen <arve.knudsen@gmail.com>
  • Loading branch information
aknuds1 committed Mar 30, 2022
1 parent 8f7aead commit 69afed2
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 24 deletions.
18 changes: 9 additions & 9 deletions operations/mimir-mixin-compiled/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ groups:
message: |
Ingesters in {{ $labels.cluster }}/{{ $labels.namespace }} ingest too many samples per second.
expr: |
avg by (cluster, namespace) (cluster_job_pod:cortex_ingester_ingested_samples_total:rate1m) > 80e3
avg by (cluster, namespace) (cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m) > 80e3
for: 15m
labels:
severity: warning
Expand Down Expand Up @@ -466,32 +466,32 @@ groups:
rules:
- alert: MimirIngesterHasNotShippedBlocks
annotations:
message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
}} has not shipped any block in the last 4 hours.
expr: |
(min by(cluster, namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
(min by(cluster, namespace, pod) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
and
(max by(cluster, namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
(max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
and
# Only if the ingester has ingested samples over the last 4h.
(max by(cluster, namespace, instance) (max_over_time(cluster_job_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
(max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
and
# Only if the ingester was ingesting samples 4h ago. This protects from the case the ingester instance
# had ingested samples in the past, then no traffic was received for a long period and then it starts
# receiving samples again. Without this check, the alert would fire as soon as it gets back receiving
# samples, while the a block shipping is expected within the next 4h.
(max by(cluster, namespace, instance) (max_over_time(cluster_job_pod:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0)
(max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0)
for: 15m
labels:
severity: critical
- alert: MimirIngesterHasNotShippedBlocksSinceStart
annotations:
message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace
}} has not shipped any block in the last 4 hours.
expr: |
(max by(cluster, namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
(max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
and
(max by(cluster, namespace, instance) (max_over_time(cluster_job_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
(max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
for: 4h
labels:
severity: critical
Expand Down
4 changes: 2 additions & 2 deletions operations/mimir-mixin-compiled/rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -551,5 +551,5 @@ groups:
- name: mimir_ingester_rules
rules:
- expr: |
sum by(cluster, job, instance) (rate(cortex_ingester_ingested_samples_total[1m]))
record: cluster_job_instance:cortex_ingester_ingested_samples_total:rate1m
sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m]))
record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m
2 changes: 1 addition & 1 deletion operations/mimir-mixin/alerts/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,7 @@
alert: $.alertName('ProvisioningTooManyWrites'),
// 80k writes / s per ingester max.
expr: |||
avg by (%(alert_aggregation_labels)s) (cluster_job_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m) > 80e3
avg by (%(alert_aggregation_labels)s) (cluster_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m) > 80e3
||| % $._config,
'for': '15m',
labels: {
Expand Down
16 changes: 8 additions & 8 deletions operations/mimir-mixin/alerts/blocks.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,24 @@
alert: $.alertName('IngesterHasNotShippedBlocks'),
'for': '15m',
expr: |||
(min by(%(alert_aggregation_labels)s, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
(min by(%(alert_aggregation_labels)s, %(per_instance_label)s) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
and
(max by(%(alert_aggregation_labels)s, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
and
# Only if the ingester has ingested samples over the last 4h.
(max by(%(alert_aggregation_labels)s, instance) (max_over_time(cluster_job_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(cluster_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
and
# Only if the ingester was ingesting samples 4h ago. This protects from the case the ingester instance
# had ingested samples in the past, then no traffic was received for a long period and then it starts
# receiving samples again. Without this check, the alert would fire as soon as it gets back receiving
# samples, while the a block shipping is expected within the next 4h.
(max by(%(alert_aggregation_labels)s, instance) (max_over_time(cluster_job_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0)
(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(cluster_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0)
||| % $._config,
labels: {
severity: 'critical',
},
annotations: {
message: '%(product)s Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
message: '%(product)s Ingester {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
},
},
{
Expand All @@ -35,15 +35,15 @@
alert: $.alertName('IngesterHasNotShippedBlocksSinceStart'),
'for': '4h',
expr: |||
(max by(%(alert_aggregation_labels)s, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
and
(max by(%(alert_aggregation_labels)s, instance) (max_over_time(cluster_job_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(cluster_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
||| % $._config,
labels: {
severity: 'critical',
},
annotations: {
message: '%(product)s Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
message: '%(product)s Ingester {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
},
},
{
Expand Down
8 changes: 4 additions & 4 deletions operations/mimir-mixin/recording_rules.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -478,11 +478,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
name: 'mimir_ingester_rules',
rules: [
{
// cortex_ingester_ingested_samples_total is per user, in this rule we want to see the sum per cluster/job/instance
record: 'cluster_job_instance:cortex_ingester_ingested_samples_total:rate1m',
// cortex_ingester_ingested_samples_total is per user, in this rule we want to see the sum per cluster/namespace/instance
record: 'cluster_namespace_%s:cortex_ingester_ingested_samples_total:rate1m' % $._config.per_instance_label,
expr: |||
sum by(cluster, job, instance) (rate(cortex_ingester_ingested_samples_total[1m]))
|||,
sum by(%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingester_ingested_samples_total[1m]))
||| % $._config,
},
],
},
Expand Down

0 comments on commit 69afed2

Please sign in to comment.