diff --git a/CHANGELOG.md b/CHANGELOG.md index b8431c43b8e..44fbcebba27 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ * Query blocking can no longer be circumvented with an equivalent query in a different format; see [Configure queries to block](https://grafana.com/docs/mimir/latest/configure/configure-blocked-queries/) * [CHANGE] Query-frontend: stop using `-validation.create-grace-period` to clamp how far into the future a query can span. * [CHANGE] Clamp [`GOMAXPROCS`](https://pkg.go.dev/runtime#GOMAXPROCS) to [`runtime.NumCPU`](https://pkg.go.dev/runtime#NumCPU). #8201 +* [CHANGE] Compactor: added new metric `cortex_compactor_disk_out_of_space_errors_total`, which counts how many times a compaction failed because the compactor ran out of disk space. #8237 * [FEATURE] Continuous-test: now runable as a module with `mimir -target=continuous-test`. #7747 * [FEATURE] Store-gateway: Allow specific tenants to be enabled or disabled via `-store-gateway.enabled-tenants` or `-store-gateway.disabled-tenants` CLI flags or their corresponding YAML settings. #7653 * [FEATURE] New `-.s3.bucket-lookup-type` flag configures lookup style type, used to access bucket in s3 compatible providers. #7684 diff --git a/pkg/compactor/compactor.go b/pkg/compactor/compactor.go index 8f7bcfd0282..ca9246f6558 100644 --- a/pkg/compactor/compactor.go +++ b/pkg/compactor/compactor.go @@ -16,6 +16,7 @@ import ( "path/filepath" "slices" "strings" + "syscall" "time" "github.com/go-kit/log" @@ -285,6 +286,10 @@ type MultitenantCompactor struct { compactionRunInterval prometheus.Gauge blocksMarkedForDeletion prometheus.Counter + // outOfSpace is a separate metric for out-of-space errors because this is a common issue which often requires an operator to investigate, + // so alerts need to be able to treat it with higher priority than other compaction errors. + outOfSpace prometheus.Counter + // Metrics shared across all BucketCompactor instances. 
bucketCompactorMetrics *BucketCompactorMetrics @@ -384,6 +389,10 @@ func newMultitenantCompactor( Name: "cortex_compactor_compaction_interval_seconds", Help: "The configured interval on which compaction is run in seconds. Useful when compared to the last successful run metric to accurately detect multiple failed compaction runs.", }), + outOfSpace: promauto.With(registerer).NewCounter(prometheus.CounterOpts{ + Name: "cortex_compactor_disk_out_of_space_errors_total", + Help: "Number of times a compaction failed because the compactor disk was out of space.", + }), blocksMarkedForDeletion: promauto.With(registerer).NewCounter(prometheus.CounterOpts{ Name: blocksMarkedForDeletionName, Help: blocksMarkedForDeletionHelp, @@ -665,6 +674,9 @@ func (c *MultitenantCompactor) compactUsers(ctx context.Context) { // We don't want to count shutdowns as failed compactions because we will pick up with the rest of the compaction after the restart. level.Info(c.logger).Log("msg", "compaction for user was interrupted by a shutdown", "user", userID) return + case errors.Is(err, syscall.ENOSPC): + c.outOfSpace.Inc() + fallthrough default: c.compactionRunFailedTenants.Inc() compactionErrorCount++