From 1435abf0da58dbcbed22947867ee8e1e9946b4cc Mon Sep 17 00:00:00 2001 From: Dimitar Dimitrov Date: Wed, 3 Apr 2024 12:05:40 +0200 Subject: [PATCH] instrumented gate: observe permitted and non-permitted queries separately (#512) * instrumented gate: observe permitted and non-permitted queries separately This distinction helps when debugging increased congestion on a gate. * Add separation between cancelled and deadline exceeded errors * Add changelog entry * Remove timeout gate * Remove ErrTimeout --- CHANGELOG.md | 1 + gate/gate.go | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ceaebd6f..d57f5a4b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -205,6 +205,7 @@ * `gate_duration_seconds` * `kv_request_duration_seconds` * `operation_duration_seconds` +* [ENHANCEMENT] Add `outcome` label to `gate_duration_seconds` metric. Possible values are `rejected_canceled`, `rejected_deadline_exceeded`, `rejected_other`, and `permitted`. #512 * [BUGFIX] spanlogger: Support multiple tenant IDs. #59 * [BUGFIX] Memberlist: fixed corrupted packets when sending compound messages with more than 255 messages or messages bigger than 64KB. #85 * [BUGFIX] Ring: `ring_member_ownership_percent` and `ring_tokens_owned` metrics are not updated on scale down. #109 diff --git a/gate/gate.go b/gate/gate.go index 1f915ecba..fe050edad 100644 --- a/gate/gate.go +++ b/gate/gate.go @@ -64,7 +64,7 @@ func NewInstrumented(reg prometheus.Registerer, maxConcurrent int, gate Gate) Ga Name: "gate_queries_in_flight", Help: "Number of queries that are currently in flight.", }), - duration: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ + duration: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ Name: "gate_duration_seconds", Help: "How many seconds it took for queries to wait at the gate.", Buckets: []float64{0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120, 240, 360, 720}, @@ -72,7 +72,7 @@ func NewInstrumented(reg prometheus.Registerer, maxConcurrent int, gate Gate) Ga NativeHistogramBucketFactor: 1.1, NativeHistogramMaxBucketNumber: 100, NativeHistogramMinResetDuration: time.Hour, - }), + }, []string{"outcome"}), } g.max.Set(float64(maxConcurrent)) @@ -84,20 +84,28 @@ type instrumentedGate struct { max prometheus.Gauge inflight prometheus.Gauge - duration prometheus.Histogram + duration *prometheus.HistogramVec } func (g *instrumentedGate) Start(ctx context.Context) error { start := time.Now() - defer func() { - g.duration.Observe(time.Since(start).Seconds()) - }() err := g.gate.Start(ctx) if err != nil { + var reason string + switch { + case errors.Is(err, context.Canceled): + reason = "rejected_canceled" + case errors.Is(err, context.DeadlineExceeded): + reason = "rejected_deadline_exceeded" + default: + reason = "rejected_other" + } + g.duration.WithLabelValues(reason).Observe(time.Since(start).Seconds()) return err } + g.duration.WithLabelValues("permitted").Observe(time.Since(start).Seconds()) g.inflight.Inc() return nil }