From 51b2a854465dd3c10abc63ae612eb80c56e1d0b3 Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Fri, 9 Jul 2021 02:03:54 -0500 Subject: [PATCH] Add a new config and metric for reporting ruler query execution wall time. (#4317) * Add a new config and metric for reporting ruler query execution wall time. Signed-off-by: Tyler Reid * Spacing and PR number fixup Signed-off-by: Tyler Reid * Wrap the defer in a function to make it defer after the return rather than after the if block. Add a unit test to validate we're tracking time correctly. Signed-off-by: Tyler Reid * Use seconds for our duration rather than nanoseconds Signed-off-by: Tyler Reid * Review comment fixes Signed-off-by: Tyler Reid * Update config flag in the config docs Signed-off-by: Tyler Reid * Pass counter rather than counter vector for metrics query function Signed-off-by: Tyler Reid * Fix comment in MetricsQueryFunction Signed-off-by: Tyler Reid * Move query metric and log to separate function. Add log message for ruler query time. Signed-off-by: Tyler Reid * Update config file and change log to show this a per user metric Signed-off-by: Tyler Reid * code review fixes Signed-off-by: Tyler Reid * update log message for ruler query metrics Signed-off-by: Tyler Reid * Remove append and just use the array for key values in the log messag Signed-off-by: Tyler Reid * Add query-frontend component to front end log message Signed-off-by: Tyler Reid Signed-off-by: Alvin Lin --- CHANGELOG.md | 1 + docs/configuration/config-file-reference.md | 5 +++ pkg/frontend/transport/handler.go | 1 + pkg/ruler/compat.go | 45 +++++++++++++++++++-- pkg/ruler/compat_test.go | 15 ++++++- pkg/ruler/ruler.go | 4 ++ pkg/ruler/ruler_test.go | 1 + 7 files changed, 68 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 95bd7af7b8..7ce523859c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog ## master / unreleased +* [FEATURE] Ruler: Add new `-ruler.query-stats-enabled` which when enabled will report the `cortex_ruler_query_seconds_total` as a per-user metric that tracks the sum of the wall time of executing queries in the ruler in seconds. #4317 * [CHANGE] Querier / ruler: Change `-querier.max-fetched-chunks-per-query` configuration to limit to maximum number of chunks that can be fetched in a single query. The number of chunks fetched by ingesters AND long-term storare combined should not exceed the value configured on `-querier.max-fetched-chunks-per-query`. #4260 * [ENHANCEMENT] Add timeout for waiting on compactor to become ACTIVE in the ring. #4262 diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index cb148bed35..de7ea120f5 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -1617,6 +1617,11 @@ ring: # processing will ignore them instead. Subject to sharding. # CLI flag: -ruler.disabled-tenants [disabled_tenants: | default = ""] + +# Report the wall time for ruler queries to complete as a per user metric and as +# an info level log message. +# CLI flag: -ruler.query-stats-enabled +[query_stats_enabled: | default = false] ``` ### `ruler_storage_config` diff --git a/pkg/frontend/transport/handler.go b/pkg/frontend/transport/handler.go index 8fcf8e9630..435a022748 100644 --- a/pkg/frontend/transport/handler.go +++ b/pkg/frontend/transport/handler.go @@ -173,6 +173,7 @@ func (f *Handler) reportQueryStats(r *http.Request, queryString url.Values, quer // Log stats. logMessage := append([]interface{}{ "msg", "query stats", + "component", "query-frontend", "method", r.Method, "path", r.URL.Path, "response_time", queryResponseTime, diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index 3c02695a9c..c95cfcd944 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -6,6 +6,7 @@ import ( "time" "github.com/go-kit/kit/log" + "github.com/go-kit/kit/log/level" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" "github.com/prometheus/prometheus/notifier" @@ -20,6 +21,7 @@ import ( "github.com/cortexproject/cortex/pkg/cortexpb" "github.com/cortexproject/cortex/pkg/querier" + util_log "github.com/cortexproject/cortex/pkg/util/log" ) // Pusher is an ingester server that accepts pushes. @@ -146,9 +148,7 @@ func EngineQueryFunc(engine *promql.Engine, q storage.Queryable, overrides Rules func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter) rules.QueryFunc { return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) { queries.Inc() - result, err := qf(ctx, qs, t) - // We rely on TranslateToPromqlApiError to do its job here... it returns nil, if err is nil. // It returns promql.ErrStorage, if error should be reported back as 500. // Other errors it returns are either for canceled or timed-out queriers (we're not reporting those as failures), @@ -163,6 +163,33 @@ func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Coun } } +func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, queryTime prometheus.Counter, logger log.Logger) rules.QueryFunc { + if queryTime == nil { + return qf + } + + return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) { + // If we've been passed a counter we want to record the wall time spent executing this request. + timer := prometheus.NewTimer(nil) + defer func() { + querySeconds := timer.ObserveDuration().Seconds() + queryTime.Add(querySeconds) + + // Log ruler query stats. + logMessage := []interface{}{ + "msg", "query stats", + "component", "ruler", + "cortex_ruler_query_seconds_total", querySeconds, + "query", qs, + } + level.Info(util_log.WithContext(ctx, logger)).Log(logMessage...) + }() + + result, err := qf(ctx, qs, t) + return result, err + } +} + // This interface mimicks rules.Manager API. Interface is used to simplify tests. type RulesManager interface { // Starts rules manager. Blocks until Stop is called. @@ -199,12 +226,24 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi Name: "cortex_ruler_queries_failed_total", Help: "Number of failed queries by ruler.", }) + var rulerQuerySeconds *prometheus.CounterVec + if cfg.EnableQueryStats { + rulerQuerySeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_ruler_query_seconds_total", + Help: "Total amount of wall clock time spent processing queries by the ruler.", + }, []string{"user"}) + } return func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager { + var queryTime prometheus.Counter = nil + if rulerQuerySeconds != nil { + queryTime = rulerQuerySeconds.WithLabelValues(userID) + } + return rules.NewManager(&rules.ManagerOptions{ Appendable: NewPusherAppendable(p, userID, overrides, totalWrites, failedWrites), Queryable: q, - QueryFunc: MetricsQueryFunc(EngineQueryFunc(engine, q, overrides, userID), totalQueries, failedQueries), + QueryFunc: RecordAndReportRuleQueryMetrics(MetricsQueryFunc(EngineQueryFunc(engine, q, overrides, userID), totalQueries, failedQueries), queryTime, logger), Context: user.InjectOrgID(ctx, userID), ExternalURL: cfg.ExternalURL.URL, NotifyFunc: SendAlerts(notifier, cfg.ExternalURL.URL.String()), diff --git a/pkg/ruler/compat_test.go b/pkg/ruler/compat_test.go index 1c5a9fe17b..dfcb251803 100644 --- a/pkg/ruler/compat_test.go +++ b/pkg/ruler/compat_test.go @@ -8,6 +8,7 @@ import ( "testing" "time" + "github.com/go-kit/kit/log" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/common/model" @@ -230,7 +231,6 @@ func TestMetricsQueryFuncErrors(t *testing.T) { mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) { return promql.Vector{}, tc.returnedError } - qf := MetricsQueryFunc(mockFunc, queries, failures) _, err := qf(context.Background(), "test", time.Now()) @@ -241,3 +241,16 @@ func TestMetricsQueryFuncErrors(t *testing.T) { }) } } + +func TestRecordAndReportRuleQueryMetrics(t *testing.T) { + queryTime := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user"}) + + mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) { + time.Sleep(1 * time.Second) + return promql.Vector{}, nil + } + qf := RecordAndReportRuleQueryMetrics(mockFunc, queryTime.WithLabelValues("userID"), log.NewNopLogger()) + _, _ = qf(context.Background(), "test", time.Now()) + + require.GreaterOrEqual(t, testutil.ToFloat64(queryTime.WithLabelValues("userID")), float64(1)) +} diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 25061a8d35..57b51ed103 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -115,6 +115,8 @@ type Config struct { DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"` RingCheckPeriod time.Duration `yaml:"-"` + + EnableQueryStats bool `yaml:"query_stats_enabled"` } // Validate config and returns error on failure @@ -173,6 +175,8 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.Var(&cfg.EnabledTenants, "ruler.enabled-tenants", "Comma separated list of tenants whose rules this ruler can evaluate. If specified, only these tenants will be handled by ruler, otherwise this ruler can process rules from all tenants. Subject to sharding.") f.Var(&cfg.DisabledTenants, "ruler.disabled-tenants", "Comma separated list of tenants whose rules this ruler cannot evaluate. If specified, a ruler that would normally pick the specified tenant(s) for processing will ignore them instead. Subject to sharding.") + f.BoolVar(&cfg.EnableQueryStats, "ruler.query-stats-enabled", false, "Report the wall time for ruler queries to complete as a per user metric and as an info level log message.") + cfg.RingCheckPeriod = 5 * time.Second } diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index a6e89918b6..503474afd8 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -60,6 +60,7 @@ func defaultRulerConfig(store rulestore.RuleStore) (Config, func()) { cfg.Ring.ListenPort = 0 cfg.Ring.InstanceAddr = "localhost" cfg.Ring.InstanceID = "localhost" + cfg.EnableQueryStats = false // Create a cleanup function that will be called at the end of the test cleanup := func() {