From 521be6a89e88407b4e83d6ea704109c9d0fd0f18 Mon Sep 17 00:00:00 2001
From: Mauro Stettler <mauro.stettler@gmail.com>
Date: Mon, 3 Jun 2024 13:47:35 +0200
Subject: [PATCH] add metric to track out-of-space errors (#8237)

* add metric for out-of-space errors

Signed-off-by: Mauro Stettler <mauro.stettler@gmail.com>

* syntax

Signed-off-by: Mauro Stettler <mauro.stettler@gmail.com>

* better comment

Signed-off-by: Mauro Stettler <mauro.stettler@gmail.com>

* PR feedback

Co-authored-by: Marco Pracucci <marco@pracucci.com>

* add CHANGELOG entry

Signed-off-by: Mauro Stettler <mauro.stettler@gmail.com>

---------

Signed-off-by: Mauro Stettler <mauro.stettler@gmail.com>
Co-authored-by: Marco Pracucci <marco@pracucci.com>
---
 CHANGELOG.md               |  1 +
 pkg/compactor/compactor.go | 12 ++++++++++++
 2 files changed, 13 insertions(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b8431c43b8e..44fbcebba27 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@
   * Query blocking can no longer be circumvented with an equivalent query in a different format; see [Configure queries to block](https://grafana.com/docs/mimir/latest/configure/configure-blocked-queries/)
 * [CHANGE] Query-frontend: stop using `-validation.create-grace-period` to clamp how far into the future a query can span.
 * [CHANGE] Clamp [`GOMAXPROCS`](https://pkg.go.dev/runtime#GOMAXPROCS) to [`runtime.NumCPU`](https://pkg.go.dev/runtime#NumCPU). #8201
+* [CHANGE] Added new metric `cortex_compactor_disk_out_of_space_errors_total` which counts how many times a compaction failed due to the compactor being out of disk space. #8237
 * [FEATURE] Continuous-test: now runable as a module with `mimir -target=continuous-test`. #7747
 * [FEATURE] Store-gateway: Allow specific tenants to be enabled or disabled via `-store-gateway.enabled-tenants` or `-store-gateway.disabled-tenants` CLI flags or their corresponding YAML settings. #7653
 * [FEATURE] New `-<prefix>.s3.bucket-lookup-type` flag configures lookup style type, used to access bucket in s3 compatible providers. #7684
diff --git a/pkg/compactor/compactor.go b/pkg/compactor/compactor.go
index 8f7bcfd0282..ca9246f6558 100644
--- a/pkg/compactor/compactor.go
+++ b/pkg/compactor/compactor.go
@@ -16,6 +16,7 @@ import (
 	"path/filepath"
 	"slices"
 	"strings"
+	"syscall"
 	"time"
 
 	"github.com/go-kit/log"
@@ -285,6 +286,10 @@ type MultitenantCompactor struct {
 	compactionRunInterval          prometheus.Gauge
 	blocksMarkedForDeletion        prometheus.Counter
 
+	// outOfSpace is a separate metric for out-of-space errors because this is a common issue which often requires an operator to investigate,
+	// so alerts need to be able to treat it with higher priority than other compaction errors.
+	outOfSpace prometheus.Counter
+
 	// Metrics shared across all BucketCompactor instances.
 	bucketCompactorMetrics *BucketCompactorMetrics
 
@@ -384,6 +389,10 @@ func newMultitenantCompactor(
 			Name: "cortex_compactor_compaction_interval_seconds",
 			Help: "The configured interval on which compaction is run in seconds. Useful when compared to the last successful run metric to accurately detect multiple failed compaction runs.",
 		}),
+		outOfSpace: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
+			Name: "cortex_compactor_disk_out_of_space_errors_total",
+			Help: "Number of times a compaction failed because the compactor disk was out of space.",
+		}),
 		blocksMarkedForDeletion: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
 			Name:        blocksMarkedForDeletionName,
 			Help:        blocksMarkedForDeletionHelp,
@@ -665,6 +674,9 @@ func (c *MultitenantCompactor) compactUsers(ctx context.Context) {
 				// We don't want to count shutdowns as failed compactions because we will pick up with the rest of the compaction after the restart.
 				level.Info(c.logger).Log("msg", "compaction for user was interrupted by a shutdown", "user", userID)
 				return
+			case errors.Is(err, syscall.ENOSPC):
+				c.outOfSpace.Inc()
+				fallthrough
 			default:
 				c.compactionRunFailedTenants.Inc()
 				compactionErrorCount++