From 521be6a89e88407b4e83d6ea704109c9d0fd0f18 Mon Sep 17 00:00:00 2001 From: Mauro Stettler Date: Mon, 3 Jun 2024 13:47:35 +0200 Subject: [PATCH] add metric to track out-of-space errors (#8237) * add metric for out-of-space errors Signed-off-by: Mauro Stettler * syntax Signed-off-by: Mauro Stettler * better comment Signed-off-by: Mauro Stettler * PR feedback Co-authored-by: Marco Pracucci * add CHANGELOG entry Signed-off-by: Mauro Stettler --------- Signed-off-by: Mauro Stettler Co-authored-by: Marco Pracucci --- CHANGELOG.md | 1 + pkg/compactor/compactor.go | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b8431c43b8e..44fbcebba27 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ * Query blocking can no longer be circumvented with an equivalent query in a different format; see [Configure queries to block](https://grafana.com/docs/mimir/latest/configure/configure-blocked-queries/) * [CHANGE] Query-frontend: stop using `-validation.create-grace-period` to clamp how far into the future a query can span. * [CHANGE] Clamp [`GOMAXPROCS`](https://pkg.go.dev/runtime#GOMAXPROCS) to [`runtime.NumCPU`](https://pkg.go.dev/runtime#NumCPU). #8201 +* [CHANGE] Added new metric `cortex_compactor_disk_out_of_space_errors_total` which counts how many times a compaction failed due to the compactor being out of disk. #8237 * [FEATURE] Continuous-test: now runable as a module with `mimir -target=continuous-test`. #7747 * [FEATURE] Store-gateway: Allow specific tenants to be enabled or disabled via `-store-gateway.enabled-tenants` or `-store-gateway.disabled-tenants` CLI flags or their corresponding YAML settings. #7653 * [FEATURE] New `-.s3.bucket-lookup-type` flag configures lookup style type, used to access bucket in s3 compatible providers. 
#7684
diff --git a/pkg/compactor/compactor.go b/pkg/compactor/compactor.go
index 8f7bcfd0282..ca9246f6558 100644
--- a/pkg/compactor/compactor.go
+++ b/pkg/compactor/compactor.go
@@ -16,6 +16,7 @@ import (
 	"path/filepath"
 	"slices"
 	"strings"
+	"syscall"
 	"time"
 
 	"github.com/go-kit/log"
@@ -285,6 +286,10 @@ type MultitenantCompactor struct {
 	compactionRunInterval   prometheus.Gauge
 	blocksMarkedForDeletion prometheus.Counter
 
+	// outOfSpace is a separate metric for out-of-space errors because this is a common issue which often requires an operator to investigate,
+	// so alerts need to be able to treat it with higher priority than other compaction errors.
+	outOfSpace prometheus.Counter
+
 	// Metrics shared across all BucketCompactor instances.
 	bucketCompactorMetrics *BucketCompactorMetrics
 
@@ -384,6 +389,10 @@ func newMultitenantCompactor(
 			Name: "cortex_compactor_compaction_interval_seconds",
 			Help: "The configured interval on which compaction is run in seconds. Useful when compared to the last successful run metric to accurately detect multiple failed compaction runs.",
 		}),
+		outOfSpace: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
+			Name: "cortex_compactor_disk_out_of_space_errors_total",
+			Help: "Number of times a compaction failed because the compactor disk was out of space.",
+		}),
 		blocksMarkedForDeletion: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
 			Name: blocksMarkedForDeletionName,
 			Help: blocksMarkedForDeletionHelp,
@@ -665,6 +674,9 @@ func (c *MultitenantCompactor) compactUsers(ctx context.Context) {
 			// We don't want to count shutdowns as failed compactions because we will pick up with the rest of the compaction after the restart.
 			level.Info(c.logger).Log("msg", "compaction for user was interrupted by a shutdown", "user", userID)
 			return
+		case errors.Is(err, syscall.ENOSPC):
+			c.outOfSpace.Inc()
+			fallthrough
 		default:
 			c.compactionRunFailedTenants.Inc()
 			compactionErrorCount++