-
Notifications
You must be signed in to change notification settings - Fork 512
/
blocks.libsonnet
250 lines (249 loc) · 12 KB
/
blocks.libsonnet
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
// Alerting rules for the Mimir blocks storage engine: ingester block shipping,
// TSDB head compaction/truncation, WAL health, checkpoints, and the
// querier/store-gateway bucket sync path. Each rule's `expr` is a PromQL
// template; aggregation labels and per-instance labels come from $._config so
// the mixin can be adapted to different label schemes.
// NOTE: the `#` lines inside the ||| blocks are PromQL comments and are part
// of the emitted rule expression — do not treat them as Jsonnet comments.
(import 'alerts-utils.libsonnet') {
local alertGroups = [
{
name: 'mimir_blocks_alerts',
rules: [
{
// Alert if the ingester has not shipped any block in the last 4h. It also checks cortex_ingester_ingested_samples_total
// to avoid false positives on ingesters not receiving any traffic yet (eg. a newly created cluster).
alert: $.alertName('IngesterHasNotShippedBlocks'),
'for': '15m',
expr: |||
(min by(%(alert_aggregation_labels)s, %(per_instance_label)s) (time() - cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 60 * 60 * 4)
and
(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 0)
and
# Only if the ingester has ingested samples over the last 4h.
(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
and
# Only if the ingester was ingesting samples 4h ago. This protects against the case where the ingester replica
# had ingested samples in the past, then no traffic was received for a long period and then it starts
# receiving samples again. Without this check, the alert would fire as soon as it gets back receiving
# samples, while the a block shipping is expected within the next 4h.
(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0)
||| % {
alert_aggregation_labels: $._config.alert_aggregation_labels,
per_instance_label: $._config.per_instance_label,
alert_aggregation_rule_prefix: $._config.alert_aggregation_rule_prefix,
},
labels: {
severity: 'critical',
},
annotations: {
message: '%(product)s Ingester {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
},
},
{
// Alert if the ingester has not shipped any block since start. It also checks cortex_ingester_ingested_samples_total
// to avoid false positives on ingesters not receiving any traffic yet (eg. a newly created cluster).
alert: $.alertName('IngesterHasNotShippedBlocksSinceStart'),
'for': '4h',
expr: |||
(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) == 0)
and
(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
||| % {
alert_aggregation_labels: $._config.alert_aggregation_labels,
per_instance_label: $._config.per_instance_label,
alert_aggregation_rule_prefix: $._config.alert_aggregation_rule_prefix,
},
labels: {
severity: 'critical',
},
annotations: {
// NOTE(review): this message looks copy-pasted from the previous alert — it says
// "in the last 4 hours" although this rule fires when no block has EVER been
// shipped since startup. Consider rewording (runtime string, so left unchanged here).
message: '%(product)s Ingester {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
},
},
{
// Alert if the ingester has compacted some blocks that haven't been successfully uploaded to the storage yet since
// more than 1 hour. The metric tracks the time of the oldest unshipped block, measured as the time when the
// TSDB head has been compacted to a block. The metric is 0 if all blocks have been shipped.
alert: $.alertName('IngesterHasUnshippedBlocks'),
'for': '15m',
expr: |||
(time() - cortex_ingester_oldest_unshipped_block_timestamp_seconds > 3600)
and
(cortex_ingester_oldest_unshipped_block_timestamp_seconds > 0)
|||,
labels: {
severity: 'critical',
},
annotations: {
message: "%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet." % $._config,
},
},
{
// Alert if the ingester is failing to compact TSDB head into a block, for any opened TSDB. Once the TSDB head is
// compactable, the ingester will try to compact it every 1 minute. Repeatedly failing it is a critical condition
// that should never happen.
alert: $.alertName('IngesterTSDBHeadCompactionFailed'),
'for': '15m',
expr: |||
rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0
|||,
labels: {
severity: 'critical',
},
annotations: {
message: '%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s is failing to compact TSDB head.' % $._config,
},
},
{
// Alert on any TSDB head truncation failure. No 'for' clause: a single
// observed failure rate fires the alert immediately.
alert: $.alertName('IngesterTSDBHeadTruncationFailed'),
expr: |||
rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0
|||,
labels: {
severity: 'critical',
},
annotations: {
message: '%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s is failing to truncate TSDB head.' % $._config,
},
},
{
// Alert on any failure to create a TSDB WAL checkpoint (fires immediately, no 'for').
alert: $.alertName('IngesterTSDBCheckpointCreationFailed'),
expr: |||
rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0
|||,
labels: {
severity: 'critical',
},
annotations: {
message: '%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s is failing to create TSDB checkpoint.' % $._config,
},
},
{
// Alert on any failure to delete an old TSDB WAL checkpoint (fires immediately, no 'for').
alert: $.alertName('IngesterTSDBCheckpointDeletionFailed'),
expr: |||
rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0
|||,
labels: {
severity: 'critical',
},
annotations: {
message: '%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s is failing to delete TSDB checkpoint.' % $._config,
},
},
{
// Alert on TSDB WAL truncation failures. Severity is 'warning' (not critical,
// unlike the other TSDB alerts in this group).
alert: $.alertName('IngesterTSDBWALTruncationFailed'),
expr: |||
rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0
|||,
labels: {
severity: 'warning',
},
annotations: {
message: '%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s is failing to truncate TSDB WAL.' % $._config,
},
},
{
// Single-zone variant of the WAL corruption alert: fires only when more than one
// ingester reports corruptions AND exactly one zone (per_job_label group) exists.
// Distinguished from the multi-zone variant below via the 'deployment' label.
alert: $.alertName('IngesterTSDBWALCorrupted'),
expr: |||
# alert when there are more than one corruptions
count by (%(alert_aggregation_labels)s) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0) > 1
and
# and there is only one zone
count by (%(alert_aggregation_labels)s) (group by (%(alert_aggregation_labels)s, %(per_job_label)s) (cortex_ingester_tsdb_wal_corruptions_total)) == 1
||| % $._config,
labels: {
severity: 'critical',
deployment: 'single-zone',
},
annotations: {
message: '%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s got a corrupted TSDB WAL.' % $._config,
},
},
{
// Multi-zone variant: same alert name, but corruptions are first summed per zone
// and the rule only fires when more than one zone exists. The differing
// 'deployment' label keeps the two rules distinct despite the shared name.
alert: $.alertName('IngesterTSDBWALCorrupted'),
expr: |||
# alert when there are more than one corruptions
count by (%(alert_aggregation_labels)s) (sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0)) > 1
and
# and there are multiple zones
count by (%(alert_aggregation_labels)s) (group by (%(alert_aggregation_labels)s, %(per_job_label)s) (cortex_ingester_tsdb_wal_corruptions_total)) > 1
||| % $._config,
labels: {
severity: 'critical',
deployment: 'multi-zone',
},
annotations: {
message: '%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s got a corrupted TSDB WAL.' % $._config,
},
},
{
// Alert if WAL writes keep failing for 3 minutes. Uses a short 1m rate window
// so the failure is detected quickly.
alert: $.alertName('IngesterTSDBWALWritesFailed'),
'for': '3m',
expr: |||
rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0
|||,
labels: {
severity: 'critical',
},
annotations: {
message: '%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s is failing to write to TSDB WAL.' % $._config,
},
},
{
// Alert if the querier is not successfully scanning the bucket.
// The "> 0" guard avoids firing before the first successful scan ever happened.
alert: $.alertName('QuerierHasNotScanTheBucket'),
'for': '5m',
expr: |||
(time() - cortex_querier_blocks_last_successful_scan_timestamp_seconds > 60 * 30)
and
cortex_querier_blocks_last_successful_scan_timestamp_seconds > 0
|||,
labels: {
severity: 'critical',
},
annotations: {
message: '%(product)s Querier %(alert_instance_variable)s in %(alert_aggregation_variables)s has not successfully scanned the bucket since {{ $value | humanizeDuration }}.' % $._config,
},
},
{
// Alert if the store-gateway is not successfully synching the bucket.
// Same shape as the querier alert above: stale-for-30m plus a "has ever synced" guard.
alert: $.alertName('StoreGatewayHasNotSyncTheBucket'),
'for': '5m',
expr: |||
(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30)
and
cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0
|||,
labels: {
severity: 'critical',
},
annotations: {
message: '%(product)s store-gateway %(alert_instance_variable)s in %(alert_aggregation_variables)s has not successfully synched the bucket since {{ $value | humanizeDuration }}.' % $._config,
},
},
{
// Alert if the store-gateway is not owning any tenant.
alert: $.alertName('StoreGatewayNoSyncedTenants'),
'for': '1h',
expr: |||
min by(%(alert_aggregation_labels)s, %(per_instance_label)s) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: '%(product)s store-gateway %(alert_instance_variable)s in %(alert_aggregation_variables)s is not syncing any blocks for any tenant.' % $._config,
},
},
{
// Alert if the bucket index has not been updated for a given user.
// 7200s = 2h staleness threshold, aggregated per tenant via the 'user' label.
alert: $.alertName('BucketIndexNotUpdated'),
expr: |||
min by(%(alert_aggregation_labels)s, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200
||| % $._config,
labels: {
severity: 'critical',
},
annotations: {
message: '%(product)s bucket index for tenant {{ $labels.user }} in %(alert_aggregation_variables)s has not been updated since {{ $value | humanizeDuration }}.' % $._config,
},
},
],
},
],
// Attach a per-alert runbook URL (anchored by alert name) to every group.
groups+: $.withRunbookURL('https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#%s', alertGroups),
}