Skip to content

Commit

Permalink
Fix cortex_ingester_ingested_exemplars_failures_total metric tracking…
Browse files Browse the repository at this point in the history
… when exemplars are discarded because they are out of order (OOO) (#7948)

Signed-off-by: Marco Pracucci <marco@pracucci.com>
  • Loading branch information
pracucci committed Apr 23, 2024
1 parent 1064c1a commit 38ef803
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 2 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
* [BUGFIX] querier: Don't cache context.Canceled errors for bucket index. #7620
* [BUGFIX] Store-gateway: account for `"other"` time in LabelValues and LabelNames requests. #7622
* [BUGFIX] Query-frontend: Don't panic when using the `-query-frontend.downstream-url` flag. #7651
* [BUGFIX] Ingester: when receiving multiple exemplars for a native histogram via remote write, sort them and only report an error if all are older than the latest exemplar as this could be a partial update. #7640
* [BUGFIX] Ingester: when receiving multiple exemplars for a native histogram via remote write, sort them and only report an error if all are older than the latest exemplar as this could be a partial update. #7640 #7948
* [BUGFIX] Ingester: don't retain blocks if they finish exactly on the boundary of the retention window. #7656
* [BUGFIX] Bug-fixes and improvements to experimental native histograms. #7744 #7813
* [BUGFIX] Querier: return an error when a query uses `label_join` with an invalid destination label name. #7744
Expand Down
5 changes: 4 additions & 1 deletion pkg/ingester/ingester.go
Original file line number Diff line number Diff line change
Expand Up @@ -1522,6 +1522,10 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre
continue
}

// We track every failed exemplar ingestion, regardless of the reason. This way, the sum of successfully
// ingested exemplars and failed exemplars is equal to the total number of processed ones.
stats.failedExemplarsCount++

if errors.Is(err, storage.ErrOutOfOrderExemplar) {
outOfOrderExemplars++
// Only report out of order exemplars if all are out of order, otherwise this was a partial update
Expand All @@ -1535,7 +1539,6 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre
updateFirstPartial(nil, func() softError {
return newTSDBIngestExemplarErr(err, model.Time(ex.TimestampMs), ts.Labels, ex.Labels)
})
stats.failedExemplarsCount++
}
}
}
Expand Down
100 changes: 100 additions & 0 deletions pkg/ingester/ingester_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,8 @@ func TestIngester_Push(t *testing.T) {
"cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage",
"cortex_ingester_tsdb_exemplar_last_exemplars_timestamp_seconds",
"cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total",
"cortex_ingester_ingested_exemplars_total",
"cortex_ingester_ingested_exemplars_failures_total",
},
expectedMetrics: `
# HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user.
Expand Down Expand Up @@ -376,6 +378,14 @@ func TestIngester_Push(t *testing.T) {
# HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants.
# TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge
cortex_ingester_tsdb_head_max_timestamp_seconds 0.009
# HELP cortex_ingester_ingested_exemplars_total The total number of exemplars ingested.
# TYPE cortex_ingester_ingested_exemplars_total counter
cortex_ingester_ingested_exemplars_total 1
# HELP cortex_ingester_ingested_exemplars_failures_total The total number of exemplars that errored on ingestion.
# TYPE cortex_ingester_ingested_exemplars_failures_total counter
cortex_ingester_ingested_exemplars_failures_total 0
`,
},
"should succeed on new float series and an exemplar": {
Expand Down Expand Up @@ -527,6 +537,8 @@ func TestIngester_Push(t *testing.T) {
"cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage",
"cortex_ingester_tsdb_exemplar_last_exemplars_timestamp_seconds",
"cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total",
"cortex_ingester_ingested_exemplars_total",
"cortex_ingester_ingested_exemplars_failures_total",
},
expectedMetrics: `
# HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user.
Expand Down Expand Up @@ -578,6 +590,14 @@ func TestIngester_Push(t *testing.T) {
# HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants.
# TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge
cortex_ingester_tsdb_head_max_timestamp_seconds 0.01
# HELP cortex_ingester_ingested_exemplars_total The total number of exemplars ingested.
# TYPE cortex_ingester_ingested_exemplars_total counter
cortex_ingester_ingested_exemplars_total 1
# HELP cortex_ingester_ingested_exemplars_failures_total The total number of exemplars that errored on ingestion.
# TYPE cortex_ingester_ingested_exemplars_failures_total counter
cortex_ingester_ingested_exemplars_failures_total 0
`,
},
"should succeed on new histogram series with an exemplar": {
Expand Down Expand Up @@ -619,6 +639,8 @@ func TestIngester_Push(t *testing.T) {
"cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage",
"cortex_ingester_tsdb_exemplar_last_exemplars_timestamp_seconds",
"cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total",
"cortex_ingester_ingested_exemplars_total",
"cortex_ingester_ingested_exemplars_failures_total",
},
expectedMetrics: `
# HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user.
Expand Down Expand Up @@ -676,6 +698,14 @@ func TestIngester_Push(t *testing.T) {
# HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants.
# TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge
cortex_ingester_tsdb_head_max_timestamp_seconds 0.009
# HELP cortex_ingester_ingested_exemplars_total The total number of exemplars ingested.
# TYPE cortex_ingester_ingested_exemplars_total counter
cortex_ingester_ingested_exemplars_total 1
# HELP cortex_ingester_ingested_exemplars_failures_total The total number of exemplars that errored on ingestion.
# TYPE cortex_ingester_ingested_exemplars_failures_total counter
cortex_ingester_ingested_exemplars_failures_total 0
`,
},
"should succeed on new histogram series and an exemplar": {
Expand Down Expand Up @@ -730,6 +760,8 @@ func TestIngester_Push(t *testing.T) {
"cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage",
"cortex_ingester_tsdb_exemplar_last_exemplars_timestamp_seconds",
"cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total",
"cortex_ingester_ingested_exemplars_total",
"cortex_ingester_ingested_exemplars_failures_total",
},
expectedMetrics: `
# HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user.
Expand Down Expand Up @@ -787,6 +819,14 @@ func TestIngester_Push(t *testing.T) {
# HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants.
# TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge
cortex_ingester_tsdb_head_max_timestamp_seconds 0.009
# HELP cortex_ingester_ingested_exemplars_total The total number of exemplars ingested.
# TYPE cortex_ingester_ingested_exemplars_total counter
cortex_ingester_ingested_exemplars_total 1
# HELP cortex_ingester_ingested_exemplars_failures_total The total number of exemplars that errored on ingestion.
# TYPE cortex_ingester_ingested_exemplars_failures_total counter
cortex_ingester_ingested_exemplars_failures_total 0
`,
},
"should succeed on existing histogram series with an exemplar": {
Expand Down Expand Up @@ -836,6 +876,8 @@ func TestIngester_Push(t *testing.T) {
"cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage",
"cortex_ingester_tsdb_exemplar_last_exemplars_timestamp_seconds",
"cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total",
"cortex_ingester_ingested_exemplars_total",
"cortex_ingester_ingested_exemplars_failures_total",
},
expectedMetrics: `
# HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user.
Expand Down Expand Up @@ -893,6 +935,14 @@ func TestIngester_Push(t *testing.T) {
# HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants.
# TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge
cortex_ingester_tsdb_head_max_timestamp_seconds 0.01
# HELP cortex_ingester_ingested_exemplars_total The total number of exemplars ingested.
# TYPE cortex_ingester_ingested_exemplars_total counter
cortex_ingester_ingested_exemplars_total 1
# HELP cortex_ingester_ingested_exemplars_failures_total The total number of exemplars that errored on ingestion.
# TYPE cortex_ingester_ingested_exemplars_failures_total counter
cortex_ingester_ingested_exemplars_failures_total 0
`,
},
"should succeed on existing histogram series with multiple exemplars": {
Expand Down Expand Up @@ -955,6 +1005,8 @@ func TestIngester_Push(t *testing.T) {
"cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage",
"cortex_ingester_tsdb_exemplar_last_exemplars_timestamp_seconds",
"cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total",
"cortex_ingester_ingested_exemplars_total",
"cortex_ingester_ingested_exemplars_failures_total",
},
expectedMetrics: `
# HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user.
Expand Down Expand Up @@ -1012,6 +1064,14 @@ func TestIngester_Push(t *testing.T) {
# HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants.
# TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge
cortex_ingester_tsdb_head_max_timestamp_seconds 0.01
# HELP cortex_ingester_ingested_exemplars_total The total number of exemplars ingested.
# TYPE cortex_ingester_ingested_exemplars_total counter
cortex_ingester_ingested_exemplars_total 2
# HELP cortex_ingester_ingested_exemplars_failures_total The total number of exemplars that errored on ingestion.
# TYPE cortex_ingester_ingested_exemplars_failures_total counter
cortex_ingester_ingested_exemplars_failures_total 0
`,
},
"should succeed on existing histogram series with partial updated exemplars": {
Expand Down Expand Up @@ -1113,6 +1173,8 @@ func TestIngester_Push(t *testing.T) {
"cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage",
"cortex_ingester_tsdb_exemplar_last_exemplars_timestamp_seconds",
"cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total",
"cortex_ingester_ingested_exemplars_total",
"cortex_ingester_ingested_exemplars_failures_total",
},
expectedMetrics: `
# HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user.
Expand Down Expand Up @@ -1170,6 +1232,14 @@ func TestIngester_Push(t *testing.T) {
# HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants.
# TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge
cortex_ingester_tsdb_head_max_timestamp_seconds 0.01
# HELP cortex_ingester_ingested_exemplars_total The total number of exemplars ingested.
# TYPE cortex_ingester_ingested_exemplars_total counter
cortex_ingester_ingested_exemplars_total 4
# HELP cortex_ingester_ingested_exemplars_failures_total The total number of exemplars that errored on ingestion.
# TYPE cortex_ingester_ingested_exemplars_failures_total counter
cortex_ingester_ingested_exemplars_failures_total 1
`,
},
"should soft fail if histogram has a bucket count vs overall count mismatch": {
Expand Down Expand Up @@ -1451,6 +1521,8 @@ func TestIngester_Push(t *testing.T) {
"cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage",
"cortex_ingester_tsdb_exemplar_last_exemplars_timestamp_seconds",
"cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total",
"cortex_ingester_ingested_exemplars_total",
"cortex_ingester_ingested_exemplars_failures_total",
},
expectedMetrics: `
# HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user.
Expand Down Expand Up @@ -1508,6 +1580,14 @@ func TestIngester_Push(t *testing.T) {
# HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants.
# TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge
cortex_ingester_tsdb_head_max_timestamp_seconds 0.01
# HELP cortex_ingester_ingested_exemplars_total The total number of exemplars ingested.
# TYPE cortex_ingester_ingested_exemplars_total counter
cortex_ingester_ingested_exemplars_total 2
# HELP cortex_ingester_ingested_exemplars_failures_total The total number of exemplars that errored on ingestion.
# TYPE cortex_ingester_ingested_exemplars_failures_total counter
cortex_ingester_ingested_exemplars_failures_total 2
`,
},
"successful push, active series disabled": {
Expand Down Expand Up @@ -2131,6 +2211,8 @@ func TestIngester_Push(t *testing.T) {
"cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage",
"cortex_ingester_tsdb_exemplar_last_exemplars_timestamp_seconds",
"cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total",
"cortex_ingester_ingested_exemplars_total",
"cortex_ingester_ingested_exemplars_failures_total",
},
expectedMetrics: `
# HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user.
Expand Down Expand Up @@ -2179,6 +2261,14 @@ func TestIngester_Push(t *testing.T) {
# HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants.
# TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge
cortex_ingester_tsdb_head_max_timestamp_seconds 0
# HELP cortex_ingester_ingested_exemplars_total The total number of exemplars ingested.
# TYPE cortex_ingester_ingested_exemplars_total counter
cortex_ingester_ingested_exemplars_total 0
# HELP cortex_ingester_ingested_exemplars_failures_total The total number of exemplars that errored on ingestion.
# TYPE cortex_ingester_ingested_exemplars_failures_total counter
cortex_ingester_ingested_exemplars_failures_total 1
`,
},
"should soft fail on exemplar with series later in the same write request": {
Expand Down Expand Up @@ -2220,6 +2310,8 @@ func TestIngester_Push(t *testing.T) {
"cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage",
"cortex_ingester_tsdb_exemplar_last_exemplars_timestamp_seconds",
"cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total",
"cortex_ingester_ingested_exemplars_total",
"cortex_ingester_ingested_exemplars_failures_total",
},
expectedMetrics: `
# HELP cortex_ingester_active_series Number of currently active series per user.
Expand Down Expand Up @@ -2271,6 +2363,14 @@ func TestIngester_Push(t *testing.T) {
# HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants.
# TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge
cortex_ingester_tsdb_head_max_timestamp_seconds 0.009
# HELP cortex_ingester_ingested_exemplars_total The total number of exemplars ingested.
# TYPE cortex_ingester_ingested_exemplars_total counter
cortex_ingester_ingested_exemplars_total 0
# HELP cortex_ingester_ingested_exemplars_failures_total The total number of exemplars that errored on ingestion.
# TYPE cortex_ingester_ingested_exemplars_failures_total counter
cortex_ingester_ingested_exemplars_failures_total 1
`,
},
"should succeed with a request containing only metadata": {
Expand Down

0 comments on commit 38ef803

Please sign in to comment.