From 595b7e2f28a4f6f0afdb4025166a5f24e1533935 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Fri, 16 Sep 2022 18:29:48 +0200 Subject: [PATCH 01/31] Implement granular query performance metrics for Thanos Query These are grabbed from the data returned by multiple Store APIs after execution of a query. Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- cmd/thanos/query.go | 17 +++++ pkg/api/query/grpc.go | 3 + pkg/api/query/v1.go | 92 +++++++++++++++++++++--- pkg/query/querier.go | 86 ++++++++++++++++------- pkg/query/querier_test.go | 39 +++++++++-- pkg/query/query_bench_test.go | 13 ++-- pkg/query/query_test.go | 11 ++- pkg/store/metrics/metrics.go | 75 ++++++++++++++++++++ test/e2e/query_frontend_test.go | 12 ++-- test/e2e/query_test.go | 120 ++++++++++++++++++++++++++++++++ 10 files changed, 415 insertions(+), 53 deletions(-) create mode 100644 pkg/store/metrics/metrics.go diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index c8c355475e..7e7e04ba3e 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -25,6 +25,7 @@ import ( "github.com/prometheus/prometheus/discovery/targetgroup" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/promql" + "github.com/thanos-io/thanos/pkg/store/metrics" "google.golang.org/grpc" apiv1 "github.com/thanos-io/thanos/pkg/api/query" @@ -184,6 +185,10 @@ func registerQuery(app *extkingpin.App) { alertQueryURL := cmd.Flag("alert.query-url", "The external Thanos Query URL that would be set in all alerts 'Source' field.").String() grpcProxyStrategy := cmd.Flag("grpc.proxy-strategy", "Strategy to use when proxying Series requests to leaf nodes. Hidden and only used for testing, will be removed after lazy becomes the default.").Default(string(store.EagerRetrieval)).Hidden().Enum(string(store.EagerRetrieval), string(store.LazyRetrieval)) + queryTelemetryDurationQuantiles := cmd.Flag("query.telemetry.request-duration-seconds-quantiles", "The quantiles for exporting metrics about the request duration quantiles.").Default("0.1", "0.25", "0.75", "1.25", "1.75", "2.5", "3", "5", "10").Float64List() + queryTelemetrySamplesQuantiles := cmd.Flag("query.telemetry.request-samples-quantiles", "The quantiles for exporting metrics about the samples count quantiles.").Default("100", "1000", "10000", "100000", "1000000").Int64List() + queryTelemetrySeriesQuantiles := cmd.Flag("query.telemetry.request-series-seconds-quantiles", "The quantiles for exporting metrics about the series count quantiles.").Default("10", "100", "1000", "10000", "100000").Int64List() + cmd.Setup(func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, _ bool) error { selectorLset, err := parseFlagLabels(*selectorLabels) if err != nil { @@ -295,6 +300,9 @@ func registerQuery(app *extkingpin.App) { *alertQueryURL, *grpcProxyStrategy, component.Query, + *queryTelemetryDurationQuantiles, + *queryTelemetrySamplesQuantiles, + *queryTelemetrySeriesQuantiles, ) }) } @@ -366,6 +374,9 @@ func runQuery( alertQueryURL string, grpcProxyStrategy string, comp component.Component, + queryTelemetryDurationQuantiles []float64, + queryTelemetrySamplesQuantiles []int64, + queryTelemetrySeriesQuantiles []int64, ) error { if alertQueryURL == "" { lastColon := strings.LastIndex(httpBindAddr, ":") @@ -658,6 +669,12 @@ func runQuery( extprom.WrapRegistererWithPrefix("thanos_query_concurrent_", reg), maxConcurrentQueries, ), + metrics.NewSeriesQueryPerformanceMetricsAggregator( + reg, + queryTelemetryDurationQuantiles, + queryTelemetrySamplesQuantiles, + queryTelemetrySeriesQuantiles, + ), reg, ) diff --git a/pkg/api/query/grpc.go b/pkg/api/query/grpc.go index 4b2dde66ae..183057b04b 100644 --- a/pkg/api/query/grpc.go +++ b/pkg/api/query/grpc.go @@ -86,6 +86,7 @@ func (g *GRPCAPI) Query(request *querypb.QueryRequest, server querypb.Query_Quer request.EnableQueryPushdown, false, request.ShardInfo, + query.NoopSeriesStatsReporter, ) qry, err := g.queryEngine.NewInstantQuery(queryable, &promql.QueryOpts{LookbackDelta: lookbackDelta}, request.Query, ts) if err != nil { @@ -160,6 +161,8 @@ func (g *GRPCAPI) QueryRange(request *querypb.QueryRangeRequest, srv querypb.Que request.EnableQueryPushdown, false, request.ShardInfo, + // TODO(douglascamata): do we need to do something here? Is there GRPC and HTTP? + query.NoopSeriesStatsReporter, ) startTime := time.Unix(request.StartTimeSeconds, 0) diff --git a/pkg/api/query/v1.go b/pkg/api/query/v1.go index ad9bbb9c94..5092cbd737 100644 --- a/pkg/api/query/v1.go +++ b/pkg/api/query/v1.go @@ -30,6 +30,7 @@ import ( "time" "github.com/go-kit/log" + "github.com/go-kit/log/level" "github.com/opentracing/opentracing-go" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" @@ -41,6 +42,7 @@ import ( "github.com/prometheus/prometheus/promql" "github.com/prometheus/prometheus/promql/parser" "github.com/prometheus/prometheus/storage" + "github.com/thanos-io/thanos/pkg/store/metrics" "github.com/prometheus/prometheus/util/stats" @@ -106,6 +108,8 @@ type QueryAPI struct { defaultMetadataTimeRange time.Duration queryRangeHist prometheus.Histogram + + seriesStatsAggregator *metrics.SeriesQueryPerformanceMetricsAggregator } // NewQueryAPI returns an initialized QueryAPI type. @@ -133,6 +137,7 @@ func NewQueryAPI( defaultMetadataTimeRange time.Duration, disableCORS bool, gate gate.Gate, + statsAggregator *metrics.SeriesQueryPerformanceMetricsAggregator, reg *prometheus.Registry, ) *QueryAPI { return &QueryAPI{ @@ -159,6 +164,7 @@ func NewQueryAPI( defaultInstantQueryMaxSourceResolution: defaultInstantQueryMaxSourceResolution, defaultMetadataTimeRange: defaultMetadataTimeRange, disableCORS: disableCORS, + seriesStatsAggregator: statsAggregator, queryRangeHist: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ Name: "thanos_query_range_requested_timespan_duration_seconds", @@ -395,7 +401,24 @@ func (qapi *QueryAPI) query(r *http.Request) (interface{}, []error, *api.ApiErro span, ctx := tracing.StartSpan(ctx, "promql_instant_query") defer span.Finish() - qry, err := qapi.queryEngine.NewInstantQuery(qapi.queryableCreate(enableDedup, replicaLabels, storeDebugMatchers, maxSourceResolution, enablePartialResponse, qapi.enableQueryPushdown, false, shardInfo), &promql.QueryOpts{LookbackDelta: lookbackDelta}, r.FormValue("query"), ts) + var seriesStats []storepb.SeriesStatsCounter + qry, err := qapi.queryEngine.NewInstantQuery( + qapi.queryableCreate( + enableDedup, + replicaLabels, + storeDebugMatchers, + maxSourceResolution, + enablePartialResponse, + qapi.enableQueryPushdown, + false, + shardInfo, + query.NewAggregateStatsReporter(&seriesStats), + ), + &promql.QueryOpts{LookbackDelta: lookbackDelta}, + r.FormValue("query"), + ts, + ) + if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorBadData, Err: err}, func() {} } @@ -408,6 +431,7 @@ func (qapi *QueryAPI) query(r *http.Request) (interface{}, []error, *api.ApiErro } defer qapi.gate.Done() + beforeRange := time.Now() res := qry.Exec(ctx) if res.Err != nil { switch res.Err.(type) { @@ -420,6 +444,12 @@ func (qapi *QueryAPI) query(r *http.Request) (interface{}, []error, *api.ApiErro } return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: res.Err}, qry.Close } + level.Info(qapi.logger).Log("totalStats", len(seriesStats)) + for i := range seriesStats { + level.Info(qapi.logger).Log("series", seriesStats[i].Series, "samples", seriesStats[i].Samples) + qapi.seriesStatsAggregator.Aggregate(seriesStats[i]) + } + qapi.seriesStatsAggregator.Observe(time.Now().Sub(beforeRange).Seconds()) // Optional stats field in response if parameter "stats" is not empty. var qs stats.QueryStats @@ -524,8 +554,19 @@ func (qapi *QueryAPI) queryRange(r *http.Request) (interface{}, []error, *api.Ap span, ctx := tracing.StartSpan(ctx, "promql_range_query") defer span.Finish() + var seriesStats []storepb.SeriesStatsCounter qry, err := qapi.queryEngine.NewRangeQuery( - qapi.queryableCreate(enableDedup, replicaLabels, storeDebugMatchers, maxSourceResolution, enablePartialResponse, qapi.enableQueryPushdown, false, shardInfo), + qapi.queryableCreate( + enableDedup, + replicaLabels, + storeDebugMatchers, + maxSourceResolution, + enablePartialResponse, + qapi.enableQueryPushdown, + false, + shardInfo, + query.NewAggregateStatsReporter(&seriesStats), + ), &promql.QueryOpts{LookbackDelta: lookbackDelta}, r.FormValue("query"), start, @@ -544,6 +585,7 @@ func (qapi *QueryAPI) queryRange(r *http.Request) (interface{}, []error, *api.Ap } defer qapi.gate.Done() + beforeRange := time.Now() res := qry.Exec(ctx) if res.Err != nil { switch res.Err.(type) { @@ -554,6 +596,12 @@ func (qapi *QueryAPI) queryRange(r *http.Request) (interface{}, []error, *api.Ap } return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: res.Err}, qry.Close } + level.Info(qapi.logger).Log("totalStats", len(seriesStats)) + for i := range seriesStats { + qapi.seriesStatsAggregator.Aggregate(seriesStats[i]) + level.Info(qapi.logger).Log("series", seriesStats[i].Series, "samples", seriesStats[i].Samples) + } + qapi.seriesStatsAggregator.Observe(time.Now().Sub(beforeRange).Seconds()) // Optional stats field in response if parameter "stats" is not empty. var qs stats.QueryStats @@ -599,8 +647,17 @@ func (qapi *QueryAPI) labelValues(r *http.Request) (interface{}, []error, *api.A matcherSets = append(matcherSets, matchers) } - q, err := qapi.queryableCreate(true, nil, storeDebugMatchers, 0, enablePartialResponse, qapi.enableQueryPushdown, true, nil). - Querier(ctx, timestamp.FromTime(start), timestamp.FromTime(end)) + q, err := qapi.queryableCreate( + true, + nil, + storeDebugMatchers, + 0, + enablePartialResponse, + qapi.enableQueryPushdown, + true, + nil, + query.NoopSeriesStatsReporter, + ).Querier(ctx, timestamp.FromTime(start), timestamp.FromTime(end)) if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: err}, func() {} } @@ -686,8 +743,18 @@ func (qapi *QueryAPI) series(r *http.Request) (interface{}, []error, *api.ApiErr return nil, nil, apiErr, func() {} } - q, err := qapi.queryableCreate(enableDedup, replicaLabels, storeDebugMatchers, math.MaxInt64, enablePartialResponse, qapi.enableQueryPushdown, true, nil). - Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) + q, err := qapi.queryableCreate( + enableDedup, + replicaLabels, + storeDebugMatchers, + math.MaxInt64, + enablePartialResponse, + qapi.enableQueryPushdown, + true, + nil, + query.NoopSeriesStatsReporter, + ).Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) + if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: err}, func() {} } @@ -736,8 +803,17 @@ func (qapi *QueryAPI) labelNames(r *http.Request) (interface{}, []error, *api.Ap matcherSets = append(matcherSets, matchers) } - q, err := qapi.queryableCreate(true, nil, storeDebugMatchers, 0, enablePartialResponse, qapi.enableQueryPushdown, true, nil). - Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) + q, err := qapi.queryableCreate( + true, + nil, + storeDebugMatchers, + 0, + enablePartialResponse, + qapi.enableQueryPushdown, + true, + nil, + query.NoopSeriesStatsReporter, + ).Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: err}, func() {} } diff --git a/pkg/query/querier.go b/pkg/query/querier.go index 361834c07d..d971e29002 100644 --- a/pkg/query/querier.go +++ b/pkg/query/querier.go @@ -7,6 +7,7 @@ import ( "context" "sort" "strings" + "sync" "time" "github.com/go-kit/log" @@ -28,21 +29,60 @@ import ( "github.com/thanos-io/thanos/pkg/tracing" ) +type SeriesStatsReporter func(seriesStats storepb.SeriesStatsCounter) + +var NoopSeriesStatsReporter SeriesStatsReporter = func(_ storepb.SeriesStatsCounter) {} + +func NewAggregateStatsReporter(stats *[]storepb.SeriesStatsCounter) SeriesStatsReporter { + var mutex sync.Mutex + return func(s storepb.SeriesStatsCounter) { + mutex.Lock() + defer mutex.Unlock() + *stats = append(*stats, s) + } +} + // QueryableCreator returns implementation of promql.Queryable that fetches data from the proxy store API endpoints. // If deduplication is enabled, all data retrieved from it will be deduplicated along all replicaLabels by default. // When the replicaLabels argument is not empty it overwrites the global replicaLabels flag. This allows specifying // replicaLabels at query time. // maxResolutionMillis controls downsampling resolution that is allowed (specified in milliseconds). // partialResponse controls `partialResponseDisabled` option of StoreAPI and partial response behavior of proxy. -type QueryableCreator func(deduplicate bool, replicaLabels []string, storeDebugMatchers [][]*labels.Matcher, maxResolutionMillis int64, partialResponse, enableQueryPushdown, skipChunks bool, shardInfo *storepb.ShardInfo) storage.Queryable +type QueryableCreator func( + deduplicate bool, + replicaLabels []string, + storeDebugMatchers [][]*labels.Matcher, + maxResolutionMillis int64, + partialResponse, + enableQueryPushdown, + skipChunks bool, + shardInfo *storepb.ShardInfo, + seriesStatsReporter SeriesStatsReporter, +) storage.Queryable // NewQueryableCreator creates QueryableCreator. -func NewQueryableCreator(logger log.Logger, reg prometheus.Registerer, proxy storepb.StoreServer, maxConcurrentSelects int, selectTimeout time.Duration) QueryableCreator { +func NewQueryableCreator( + logger log.Logger, + reg prometheus.Registerer, + proxy storepb.StoreServer, + maxConcurrentSelects int, + selectTimeout time.Duration, +) QueryableCreator { duration := promauto.With( extprom.WrapRegistererWithPrefix("concurrent_selects_", reg), ).NewHistogram(gate.DurationHistogramOpts) - return func(deduplicate bool, replicaLabels []string, storeDebugMatchers [][]*labels.Matcher, maxResolutionMillis int64, partialResponse, enableQueryPushdown, skipChunks bool, shardInfo *storepb.ShardInfo) storage.Queryable { + return func( + deduplicate bool, + replicaLabels []string, + storeDebugMatchers [][]*labels.Matcher, + maxResolutionMillis int64, + partialResponse, + enableQueryPushdown, + skipChunks bool, + shardInfo *storepb.ShardInfo, + seriesStatsReporter SeriesStatsReporter, + ) storage.Queryable { return &queryable{ logger: logger, replicaLabels: replicaLabels, @@ -59,6 +99,7 @@ func NewQueryableCreator(logger log.Logger, reg prometheus.Registerer, proxy sto selectTimeout: selectTimeout, enableQueryPushdown: enableQueryPushdown, shardInfo: shardInfo, + seriesStatsReporter: seriesStatsReporter, } } } @@ -77,11 +118,12 @@ type queryable struct { selectTimeout time.Duration enableQueryPushdown bool shardInfo *storepb.ShardInfo + seriesStatsReporter SeriesStatsReporter } // Querier returns a new storage querier against the underlying proxy store API. func (q *queryable) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) { - return newQuerier(ctx, q.logger, mint, maxt, q.replicaLabels, q.storeDebugMatchers, q.proxy, q.deduplicate, q.maxResolutionMillis, q.partialResponse, q.enableQueryPushdown, q.skipChunks, q.gateProviderFn(), q.selectTimeout, q.shardInfo), nil + return newQuerier(ctx, q.logger, mint, maxt, q.replicaLabels, q.storeDebugMatchers, q.proxy, q.deduplicate, q.maxResolutionMillis, q.partialResponse, q.enableQueryPushdown, q.skipChunks, q.gateProviderFn(), q.selectTimeout, q.shardInfo, q.seriesStatsReporter), nil } type querier struct { @@ -100,24 +142,12 @@ type querier struct { selectGate gate.Gate selectTimeout time.Duration shardInfo *storepb.ShardInfo + seriesStatsReporter SeriesStatsReporter } // newQuerier creates implementation of storage.Querier that fetches data from the proxy // store API endpoints. -func newQuerier( - ctx context.Context, - logger log.Logger, - mint, maxt int64, - replicaLabels []string, - storeDebugMatchers [][]*labels.Matcher, - proxy storepb.StoreServer, - deduplicate bool, - maxResolutionMillis int64, - partialResponse, enableQueryPushdown bool, skipChunks bool, - selectGate gate.Gate, - selectTimeout time.Duration, - shardInfo *storepb.ShardInfo, -) *querier { +func newQuerier(ctx context.Context, logger log.Logger, mint, maxt int64, replicaLabels []string, storeDebugMatchers [][]*labels.Matcher, proxy storepb.StoreServer, deduplicate bool, maxResolutionMillis int64, partialResponse, enableQueryPushdown, skipChunks bool, selectGate gate.Gate, selectTimeout time.Duration, shardInfo *storepb.ShardInfo, seriesStatsReporter SeriesStatsReporter) *querier { if logger == nil { logger = log.NewNopLogger() } @@ -145,6 +175,7 @@ func newQuerier( skipChunks: skipChunks, enableQueryPushdown: enableQueryPushdown, shardInfo: shardInfo, + seriesStatsReporter: seriesStatsReporter, } } @@ -157,8 +188,9 @@ type seriesServer struct { storepb.Store_SeriesServer ctx context.Context - seriesSet []storepb.Series - warnings []string + seriesSet []storepb.Series + seriesSetStats storepb.SeriesStatsCounter + warnings []string } func (s *seriesServer) Send(r *storepb.SeriesResponse) error { @@ -169,6 +201,7 @@ func (s *seriesServer) Send(r *storepb.SeriesResponse) error { if r.GetSeries() != nil { s.seriesSet = append(s.seriesSet, *r.GetSeries()) + s.seriesSetStats.Count(r.GetSeries()) return nil } @@ -257,11 +290,12 @@ func (q *querier) Select(_ bool, hints *storage.SelectHints, ms ...*labels.Match span, ctx := tracing.StartSpan(ctx, "querier_select_select_fn") defer span.Finish() - set, err := q.selectFn(ctx, hints, ms...) + set, stats, err := q.selectFn(ctx, hints, ms...) if err != nil { promise <- storage.ErrSeriesSet(err) return } + q.seriesStatsReporter(stats) promise <- set }() @@ -279,10 +313,10 @@ func (q *querier) Select(_ bool, hints *storage.SelectHints, ms ...*labels.Match }} } -func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms ...*labels.Matcher) (storage.SeriesSet, error) { +func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms ...*labels.Matcher) (storage.SeriesSet, storepb.SeriesStatsCounter, error) { sms, err := storepb.PromMatchersToMatchers(ms...) if err != nil { - return nil, errors.Wrap(err, "convert matchers") + return nil, storepb.SeriesStatsCounter{}, errors.Wrap(err, "convert matchers") } aggrs := aggrsFromFunc(hints.Func) @@ -310,7 +344,7 @@ func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms . Step: hints.Step, Range: hints.Range, }, resp); err != nil { - return nil, errors.Wrap(err, "proxy Series()") + return nil, storepb.SeriesStatsCounter{}, errors.Wrap(err, "proxy Series()") } var warns storage.Warnings @@ -342,7 +376,7 @@ func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms . set: newStoreSeriesSet(resp.seriesSet), aggrs: aggrs, warns: warns, - }, nil + }, resp.seriesSetStats, nil } // TODO(fabxc): this could potentially pushed further down into the store API to make true streaming possible. @@ -357,7 +391,7 @@ func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms . // The merged series set assembles all potentially-overlapping time ranges of the same series into a single one. // TODO(bwplotka): We could potentially dedup on chunk level, use chunk iterator for that when available. - return dedup.NewSeriesSet(set, q.replicaLabels, hints.Func, q.enableQueryPushdown), nil + return dedup.NewSeriesSet(set, q.replicaLabels, hints.Func, q.enableQueryPushdown), resp.seriesSetStats, nil } // sortDedupLabels re-sorts the set so that the same series with different replica diff --git a/pkg/query/querier_test.go b/pkg/query/querier_test.go index fe6ba688d2..2e31fa65a0 100644 --- a/pkg/query/querier_test.go +++ b/pkg/query/querier_test.go @@ -44,7 +44,17 @@ func TestQueryableCreator_MaxResolution(t *testing.T) { queryableCreator := NewQueryableCreator(nil, nil, testProxy, 2, 5*time.Second) oneHourMillis := int64(1*time.Hour) / int64(time.Millisecond) - queryable := queryableCreator(false, nil, nil, oneHourMillis, false, false, false, nil) + queryable := queryableCreator( + false, + nil, + nil, + oneHourMillis, + false, + false, + false, + nil, + NoopSeriesStatsReporter, + ) q, err := queryable.Querier(context.Background(), 0, 42) testutil.Ok(t, err) @@ -71,7 +81,22 @@ func TestQuerier_DownsampledData(t *testing.T) { } timeout := 10 * time.Second - q := NewQueryableCreator(nil, nil, testProxy, 2, timeout)(false, nil, nil, 9999999, false, false, false, nil) + q := NewQueryableCreator( + nil, + nil, + testProxy, + 2, + timeout, + )(false, + nil, + nil, + 9999999, + false, + false, + false, + nil, + NoopSeriesStatsReporter, + ) engine := promql.NewEngine( promql.EngineOpts{ MaxSamples: math.MaxInt32, @@ -299,10 +324,12 @@ func (s series) Iterator() chunkenc.Iterator { // To test with real data: // Collect the expected results from Prometheus or Thanos through "/api/v1/query_range" and save to a file. // Collect raw data to be used for local storage: +// // scripts/insecure_grpcurl_series.sh querierGrpcIP:port '[{"name":"type","value":"current"},{"name":"_id","value":"xxx"}]' 1597823000000 1597824600000 > localStorage.json // Remove all white space from the file and put each series in a new line. // When collecting the raw data mint should be Prometheus query time minus the default look back delta(default is 5min or 300000ms) // For example if the Prometheus query mint is 1597823700000 the grpccurl query mint should be 1597823400000. +// // This is because when promql displays data for a given range it looks back 5min before the requested time window. func TestQuerier_Select_AfterPromQL(t *testing.T) { logger := log.NewLogfmtLogger(os.Stderr) @@ -363,7 +390,7 @@ func TestQuerier_Select_AfterPromQL(t *testing.T) { g := gate.New(2) mq := &mockedQueryable{ Creator: func(mint, maxt int64) storage.Querier { - return newQuerier(context.Background(), nil, mint, maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil) + return newQuerier(context.Background(), nil, mint, maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil, NoopSeriesStatsReporter) }, } t.Cleanup(func() { @@ -607,7 +634,7 @@ func TestQuerier_Select(t *testing.T) { {dedup: true, expected: []series{tcase.expectedAfterDedup}}, } { g := gate.New(2) - q := newQuerier(context.Background(), nil, tcase.mint, tcase.maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil) + q := newQuerier(context.Background(), nil, tcase.mint, tcase.maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil, func(i storepb.SeriesStatsCounter) {}) t.Cleanup(func() { testutil.Ok(t, q.Close()) }) t.Run(fmt.Sprintf("dedup=%v", sc.dedup), func(t *testing.T) { @@ -836,7 +863,7 @@ func TestQuerierWithDedupUnderstoodByPromQL_Rate(t *testing.T) { timeout := 100 * time.Second g := gate.New(2) - q := newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, false, 0, true, false, false, g, timeout, nil) + q := newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, false, 0, true, false, false, g, timeout, nil, NoopSeriesStatsReporter) t.Cleanup(func() { testutil.Ok(t, q.Close()) }) @@ -906,7 +933,7 @@ func TestQuerierWithDedupUnderstoodByPromQL_Rate(t *testing.T) { timeout := 5 * time.Second g := gate.New(2) - q := newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, true, 0, true, false, false, g, timeout, nil) + q := newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, true, 0, true, false, false, g, timeout, nil, NoopSeriesStatsReporter) t.Cleanup(func() { testutil.Ok(t, q.Close()) }) diff --git a/pkg/query/query_bench_test.go b/pkg/query/query_bench_test.go index 301c880877..84efb46820 100644 --- a/pkg/query/query_bench_test.go +++ b/pkg/query/query_bench_test.go @@ -80,12 +80,13 @@ func benchQuerySelect(t testutil.TB, totalSamples, totalSeries int, dedup bool) logger := log.NewNopLogger() q := &querier{ - ctx: context.Background(), - logger: logger, - proxy: &mockedStoreServer{responses: resps}, - replicaLabels: map[string]struct{}{"a_replica": {}}, - deduplicate: dedup, - selectGate: gate.NewNoop(), + ctx: context.Background(), + logger: logger, + proxy: &mockedStoreServer{responses: resps}, + replicaLabels: map[string]struct{}{"a_replica": {}}, + deduplicate: dedup, + selectGate: gate.NewNoop(), + seriesStatsReporter: NoopSeriesStatsReporter, } testSelect(t, q, expectedSeries) } diff --git a/pkg/query/query_test.go b/pkg/query/query_test.go index 99e29be66f..060571fc70 100644 --- a/pkg/query/query_test.go +++ b/pkg/query/query_test.go @@ -54,7 +54,16 @@ func TestQuerier_Proxy(t *testing.T) { name: fmt.Sprintf("store number %v", i), }) } - return q(true, nil, nil, 0, false, false, false, nil) + return q(true, + nil, + nil, + 0, + false, + false, + false, + nil, + NoopSeriesStatsReporter, + ) } for _, fn := range files { diff --git a/pkg/store/metrics/metrics.go b/pkg/store/metrics/metrics.go new file mode 100644 index 0000000000..3738da25f4 --- /dev/null +++ b/pkg/store/metrics/metrics.go @@ -0,0 +1,75 @@ +package metrics + +import ( + "strconv" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/thanos-io/thanos/pkg/store/storepb" +) + +// SeriesQueryPerformanceMetricsAggregator aggregates results from fanned-out queries into a histogram given their +// response's shape. +type SeriesQueryPerformanceMetricsAggregator struct { + QueryDuration *prometheus.HistogramVec + + SeriesLeBuckets []int64 + SamplesLeBuckets []int64 + SeriesStats storepb.SeriesStatsCounter +} + +// NewSeriesQueryPerformanceMetricsAggregator is a constructor for SeriesQueryPerformanceMetricsAggregator. +func NewSeriesQueryPerformanceMetricsAggregator( + reg prometheus.Registerer, + durationQuantiles []float64, + sampleQuantiles []int64, + seriesQuantiles []int64, +) *SeriesQueryPerformanceMetricsAggregator { + return &SeriesQueryPerformanceMetricsAggregator{ + QueryDuration: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ + Name: "thanos_store_api_query_duration_seconds", + Help: "Duration of the Thanos Store API select phase for a query.", + Buckets: durationQuantiles, + }, []string{"series_le", "samples_le"}), + SeriesLeBuckets: seriesQuantiles, + SamplesLeBuckets: sampleQuantiles, + SeriesStats: storepb.SeriesStatsCounter{}, + } +} + +// Aggregate is an aggregator for merging `storepb.SeriesStatsCounter` for each incoming fanned out query. +func (s *SeriesQueryPerformanceMetricsAggregator) Aggregate(stats storepb.SeriesStatsCounter) { + s.SeriesStats.Series += stats.Series + s.SeriesStats.Samples += stats.Samples + s.SeriesStats.Chunks += stats.Chunks +} + +// Observe commits the aggregated SeriesStatsCounter as an observation. +func (s *SeriesQueryPerformanceMetricsAggregator) Observe(duration float64) { + // Bucket matching for series/labels matchSeriesBucket/matchSamplesBucket => float64, float64 + seriesLeBucket := s.findBucket(float64(s.SeriesStats.Series), s.SeriesLeBuckets) + samplesLeBucket := s.findBucket(float64(s.SeriesStats.Samples), s.SamplesLeBuckets) + s.QueryDuration.With(prometheus.Labels{ + "series_le": strconv.Itoa(int(seriesLeBucket)), + "samples_le": strconv.Itoa(int(samplesLeBucket)), + }).Observe(duration) + s.reset() +} + +func (s *SeriesQueryPerformanceMetricsAggregator) reset() { + s.SeriesStats = storepb.SeriesStatsCounter{} +} + +func (s *SeriesQueryPerformanceMetricsAggregator) findBucket(value float64, quantiles []int64) int64 { + if len(quantiles) == 0 { + return 0 + } + var foundBucket int64 + for _, bucket := range quantiles { + foundBucket = bucket + if value < float64(bucket) { + break + } + } + return foundBucket +} diff --git a/test/e2e/query_frontend_test.go b/test/e2e/query_frontend_test.go index 54d81168b2..267e2a8bfd 100644 --- a/test/e2e/query_frontend_test.go +++ b/test/e2e/query_frontend_test.go @@ -214,8 +214,8 @@ func TestQueryFrontend(t *testing.T) { testutil.Ok(t, queryFrontend.WaitSumMetricsWithOptions( e2e.Equals(3), []string{"thanos_query_frontend_queries_total"}, - e2e.WithLabelMatchers(matchers.MustNewMatcher(matchers.MatchEqual, "op", "query_range"))), - ) + e2e.WithLabelMatchers(matchers.MustNewMatcher(matchers.MatchEqual, "op", "query_range")), + )) testutil.Ok(t, queryFrontend.WaitSumMetrics(e2e.Equals(3), "cortex_cache_fetched_keys_total")) testutil.Ok(t, queryFrontend.WaitSumMetrics(e2e.Equals(2), "cortex_cache_hits_total")) testutil.Ok(t, queryFrontend.WaitSumMetrics(e2e.Equals(1), "querier_cache_added_new_total")) @@ -227,14 +227,14 @@ func TestQueryFrontend(t *testing.T) { // Query is 25h so it will be split to 2 requests. testutil.Ok(t, queryFrontend.WaitSumMetricsWithOptions( e2e.Equals(4), []string{"thanos_frontend_split_queries_total"}, - e2e.WithLabelMatchers(matchers.MustNewMatcher(matchers.MatchEqual, "tripperware", "query_range"))), - ) + e2e.WithLabelMatchers(matchers.MustNewMatcher(matchers.MatchEqual, "tripperware", "query_range")), + )) testutil.Ok(t, q.WaitSumMetricsWithOptions( e2e.Equals(4), []string{"http_requests_total"}, - e2e.WithLabelMatchers(matchers.MustNewMatcher(matchers.MatchEqual, "handler", "query_range"))), - ) + e2e.WithLabelMatchers(matchers.MustNewMatcher(matchers.MatchEqual, "handler", "query_range")), + )) }) t.Run("query frontend splitting works for labels names API", func(t *testing.T) { diff --git a/test/e2e/query_test.go b/test/e2e/query_test.go index 6cf84381e4..01d8297888 100644 --- a/test/e2e/query_test.go +++ b/test/e2e/query_test.go @@ -20,6 +20,8 @@ import ( "testing" "time" + e2edb "github.com/efficientgo/e2e/db" + e2emonitoring "github.com/efficientgo/e2e/monitoring" "github.com/gogo/protobuf/proto" "github.com/golang/snappy" config_util "github.com/prometheus/common/config" @@ -186,6 +188,7 @@ func TestQuery(t *testing.T) { "prometheus": "prom-ha", }, }) + fmt.Println("foobar") } func TestQueryExternalPrefixWithoutReverseProxy(t *testing.T) { @@ -579,6 +582,123 @@ func newSample(s fakeMetricSample) model.Sample { } } +func TestQueryStoreMetrics(t *testing.T) { + t.Parallel() + + // Build up. + e, err := e2e.NewDockerEnvironment("e2e-query-store-metrics") + testutil.Ok(t, err) + t.Cleanup(e2ethanos.CleanScenario(t, e)) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) + t.Cleanup(cancel) + + bucket := "store-gw-test" + minio := e2ethanos.NewMinio(e, "thanos-minio", bucket) + testutil.Ok(t, e2e.StartAndWaitReady(minio)) + + l := log.NewLogfmtLogger(os.Stdout) + bkt, err := s3.NewBucketWithConfig(l, e2ethanos.NewS3Config(bucket, minio.Endpoint("https"), minio.Dir()), "test") + testutil.Ok(t, err) + + blockSizes := []struct { + samples int + series int + name string + }{ + {samples: 10, series: 1, name: "one_series"}, + {samples: 10, series: 1001, name: "thousand_one_series"}, + } + now := time.Now() + externalLabels := labels.FromStrings("prometheus", "p1", "replica", "0") + dir := filepath.Join(e.SharedDir(), "tmp") + testutil.Ok(t, os.MkdirAll(filepath.Join(e.SharedDir(), dir), os.ModePerm)) + for _, blockSize := range blockSizes { + series := make([]labels.Labels, blockSize.series, blockSize.series) + for i := 0; i < blockSize.series; i++ { + bigSeriesLabels := labels.FromStrings("__name__", blockSize.name, "instance", fmt.Sprintf("foo_%d", i)) + series[i] = bigSeriesLabels + } + blockID, err := e2eutil.CreateBlockWithBlockDelay(ctx, + dir, + series, + blockSize.samples, + timestamp.FromTime(now), + timestamp.FromTime(now.Add(2*time.Hour)), + 30*time.Minute, + externalLabels, + 0, + metadata.NoneFunc, + ) + testutil.Ok(t, err) + testutil.Ok(t, objstore.UploadDir(ctx, l, bkt, path.Join(dir, blockID.String()), blockID.String())) + } + + s1 := e2ethanos.NewStoreGW( + e, + "s1", + client.BucketConfig{ + Type: client.S3, + Config: e2ethanos.NewS3Config(bucket, minio.InternalEndpoint("https"), minio.InternalDir()), + }, + "", + nil, + ) + testutil.Ok(t, e2e.StartAndWaitReady(s1)) + + q := e2ethanos.NewQuerierBuilder(e, "1", s1.InternalEndpoint("grpc")).Init() + testutil.Ok(t, e2e.StartAndWaitReady(q)) + testutil.Ok(t, s1.WaitSumMetrics(e2e.Equals(2), "thanos_blocks_meta_synced")) + + instantQuery(t, ctx, q.Endpoint("http"), func() string { + return "max_over_time(one_series{instance='foo_0'}[2h])" + }, time.Now, promclient.QueryOptions{ + Deduplicate: true, + }, 1) + instantQuery(t, ctx, q.Endpoint("http"), func() string { + return "max_over_time(thousand_one_series[2h])" + }, time.Now, promclient.QueryOptions{ + Deduplicate: true, + }, 1001) + + mon, err := e2emonitoring.Start(e) + testutil.Ok(t, err) + + queryWaitAndAssert(t, ctx, mon.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string { + return "thanos_store_api_query_duration_seconds_count{samples_le='100000',series_le='10000'}" + }, time.Now, promclient.QueryOptions{ + Deduplicate: true, + }, model.Vector{ + &model.Sample{ + Metric: model.Metric{ + "__name__": "thanos_store_api_query_duration_seconds_count", + "instance": "e2e-query-store-metrics-querier-1:8080", + "job": "querier-1", + "samples_le": "100000", + "series_le": "10000", + }, + Value: model.SampleValue(1), + }, + }) + + queryWaitAndAssert(t, ctx, mon.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string { + return "thanos_store_api_query_duration_seconds_count{samples_le='100',series_le='10'}" + }, time.Now, promclient.QueryOptions{ + Deduplicate: true, + }, model.Vector{ + &model.Sample{ + Metric: model.Metric{ + "__name__": "thanos_store_api_query_duration_seconds_count", + "instance": "e2e-query-store-metrics-querier-1:8080", + "job": "querier-1", + "samples_le": "100", + "series_le": "10", + }, + Value: model.SampleValue(1), + }, + }) +} + // Regression test for https://github.com/thanos-io/thanos/issues/5033. // Tests whether queries work with mixed sources, and with functions // that we are pushing down: min, max, min_over_time, max_over_time, From f6e25119f85787aa54be1a105138fb78db05da28 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Thu, 29 Sep 2022 10:59:04 +0200 Subject: [PATCH 02/31] Fix some linter warnings Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- pkg/api/query/v1.go | 4 ++-- test/e2e/query_test.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/api/query/v1.go b/pkg/api/query/v1.go index 1bdbbac4b8..de6097782b 100644 --- a/pkg/api/query/v1.go +++ b/pkg/api/query/v1.go @@ -450,7 +450,7 @@ func (qapi *QueryAPI) query(r *http.Request) (interface{}, []error, *api.ApiErro level.Info(qapi.logger).Log("series", seriesStats[i].Series, "samples", seriesStats[i].Samples) qapi.seriesStatsAggregator.Aggregate(seriesStats[i]) } - qapi.seriesStatsAggregator.Observe(time.Now().Sub(beforeRange).Seconds()) + qapi.seriesStatsAggregator.Observe(time.Since(beforeRange).Seconds()) // Optional stats field in response if parameter "stats" is not empty. var qs stats.QueryStats @@ -602,7 +602,7 @@ func (qapi *QueryAPI) queryRange(r *http.Request) (interface{}, []error, *api.Ap qapi.seriesStatsAggregator.Aggregate(seriesStats[i]) level.Info(qapi.logger).Log("series", seriesStats[i].Series, "samples", seriesStats[i].Samples) } - qapi.seriesStatsAggregator.Observe(time.Now().Sub(beforeRange).Seconds()) + qapi.seriesStatsAggregator.Observe(time.Since(beforeRange).Seconds()) // Optional stats field in response if parameter "stats" is not empty. var qs stats.QueryStats diff --git a/test/e2e/query_test.go b/test/e2e/query_test.go index 82c9dd1733..4bc7aaf168 100644 --- a/test/e2e/query_test.go +++ b/test/e2e/query_test.go @@ -612,7 +612,7 @@ func TestQueryStoreMetrics(t *testing.T) { dir := filepath.Join(e.SharedDir(), "tmp") testutil.Ok(t, os.MkdirAll(filepath.Join(e.SharedDir(), dir), os.ModePerm)) for _, blockSize := range blockSizes { - series := make([]labels.Labels, blockSize.series, blockSize.series) + series := make([]labels.Labels, blockSize.series) for i := 0; i < blockSize.series; i++ { bigSeriesLabels := labels.FromStrings("__name__", blockSize.name, "instance", fmt.Sprintf("foo_%d", i)) series[i] = bigSeriesLabels From 193d76f56cb4622a98f197f8cf83fcd9a96ca7a9 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Thu, 29 Sep 2022 11:53:44 +0200 Subject: [PATCH 03/31] Remove useless logs Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- pkg/api/query/v1.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/pkg/api/query/v1.go b/pkg/api/query/v1.go index de6097782b..193c21d41a 100644 --- a/pkg/api/query/v1.go +++ b/pkg/api/query/v1.go @@ -445,9 +445,7 @@ func (qapi *QueryAPI) query(r *http.Request) (interface{}, []error, *api.ApiErro } return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: res.Err}, qry.Close } - level.Info(qapi.logger).Log("totalStats", len(seriesStats)) for i := range seriesStats { - level.Info(qapi.logger).Log("series", seriesStats[i].Series, "samples", seriesStats[i].Samples) qapi.seriesStatsAggregator.Aggregate(seriesStats[i]) } qapi.seriesStatsAggregator.Observe(time.Since(beforeRange).Seconds()) From 86f56e0d00cea344841f9f7cf2cff1bd2de35586 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Thu, 29 Sep 2022 11:53:59 +0200 Subject: [PATCH 04/31] Refactor query tests Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- test/e2e/query_test.go | 90 ++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 42 deletions(-) diff --git a/test/e2e/query_test.go b/test/e2e/query_test.go index 4bc7aaf168..2837a520ee 100644 --- a/test/e2e/query_test.go +++ b/test/e2e/query_test.go @@ -186,7 +186,6 @@ func TestQuery(t *testing.T) { "prometheus": "prom-ha", }, }) - fmt.Println("foobar") } func TestQueryExternalPrefixWithoutReverseProxy(t *testing.T) { @@ -584,7 +583,7 @@ func TestQueryStoreMetrics(t *testing.T) { t.Parallel() // Build up. - e, err := e2e.NewDockerEnvironment("e2e-query-store-metrics") + e, err := e2e.New(e2e.WithName("e2e-query-store-metrics")) testutil.Ok(t, err) t.Cleanup(e2ethanos.CleanScenario(t, e)) @@ -599,37 +598,40 @@ func TestQueryStoreMetrics(t *testing.T) { bkt, err := s3.NewBucketWithConfig(l, e2ethanos.NewS3Config(bucket, minio.Endpoint("https"), minio.Dir()), "test") testutil.Ok(t, err) - blockSizes := []struct { - samples int - series int - name string - }{ - {samples: 10, series: 1, name: "one_series"}, - {samples: 10, series: 1001, name: "thousand_one_series"}, - } - now := time.Now() - externalLabels := labels.FromStrings("prometheus", "p1", "replica", "0") - dir := filepath.Join(e.SharedDir(), "tmp") - testutil.Ok(t, os.MkdirAll(filepath.Join(e.SharedDir(), dir), os.ModePerm)) - for _, blockSize := range blockSizes { - series := make([]labels.Labels, blockSize.series) - for i := 0; i < blockSize.series; i++ { - bigSeriesLabels := labels.FromStrings("__name__", blockSize.name, "instance", fmt.Sprintf("foo_%d", i)) - series[i] = bigSeriesLabels + // Preparing 2 different blocks for the tests. + { + blockSizes := []struct { + samples int + series int + name string + }{ + {samples: 10, series: 1, name: "one_series"}, + {samples: 10, series: 1001, name: "thousand_one_series"}, + } + now := time.Now() + externalLabels := labels.FromStrings("prometheus", "p1", "replica", "0") + dir := filepath.Join(e.SharedDir(), "tmp") + testutil.Ok(t, os.MkdirAll(filepath.Join(e.SharedDir(), dir), os.ModePerm)) + for _, blockSize := range blockSizes { + series := make([]labels.Labels, blockSize.series) + for i := 0; i < blockSize.series; i++ { + bigSeriesLabels := labels.FromStrings("__name__", blockSize.name, "instance", fmt.Sprintf("foo_%d", i)) + series[i] = bigSeriesLabels + } + blockID, err := e2eutil.CreateBlockWithBlockDelay(ctx, + dir, + series, + blockSize.samples, + timestamp.FromTime(now), + timestamp.FromTime(now.Add(2*time.Hour)), + 30*time.Minute, + externalLabels, + 0, + metadata.NoneFunc, + ) + testutil.Ok(t, err) + testutil.Ok(t, objstore.UploadDir(ctx, l, bkt, path.Join(dir, blockID.String()), blockID.String())) } - blockID, err := e2eutil.CreateBlockWithBlockDelay(ctx, - dir, - series, - blockSize.samples, - timestamp.FromTime(now), - timestamp.FromTime(now.Add(2*time.Hour)), - 30*time.Minute, - externalLabels, - 0, - metadata.NoneFunc, - ) - testutil.Ok(t, err) - testutil.Ok(t, objstore.UploadDir(ctx, l, bkt, path.Join(dir, blockID.String()), blockID.String())) } s1 := e2ethanos.NewStoreGW( @@ -648,16 +650,20 @@ func TestQueryStoreMetrics(t *testing.T) { testutil.Ok(t, e2e.StartAndWaitReady(q)) testutil.Ok(t, s1.WaitSumMetrics(e2emon.Equals(2), "thanos_blocks_meta_synced")) - instantQuery(t, ctx, q.Endpoint("http"), func() string { - return "max_over_time(one_series{instance='foo_0'}[2h])" - }, time.Now, promclient.QueryOptions{ - Deduplicate: true, - }, 1) - instantQuery(t, ctx, q.Endpoint("http"), func() string { - return "max_over_time(thousand_one_series[2h])" - }, time.Now, promclient.QueryOptions{ - Deduplicate: true, - }, 1001) + // Querying the series in the previously created blocks to ensure we produce Store API query metrics. + { + instantQuery(t, ctx, q.Endpoint("http"), func() string { + return "max_over_time(one_series{instance='foo_0'}[2h])" + }, time.Now, promclient.QueryOptions{ + Deduplicate: true, + }, 1) + + instantQuery(t, ctx, q.Endpoint("http"), func() string { + return "max_over_time(thousand_one_series[2h])" + }, time.Now, promclient.QueryOptions{ + Deduplicate: true, + }, 1001) + } mon, err := e2emon.Start(e) testutil.Ok(t, err) From f0c1a2276f926572f7e993ee079f6a3fd105454b Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Thu, 29 Sep 2022 11:54:25 +0200 Subject: [PATCH 05/31] Fix long function definition (newQuerier) Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- pkg/query/querier.go | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pkg/query/querier.go b/pkg/query/querier.go index d971e29002..07c97c986e 100644 --- a/pkg/query/querier.go +++ b/pkg/query/querier.go @@ -147,7 +147,24 @@ type querier struct { // newQuerier creates implementation of storage.Querier that fetches data from the proxy // store API endpoints. -func newQuerier(ctx context.Context, logger log.Logger, mint, maxt int64, replicaLabels []string, storeDebugMatchers [][]*labels.Matcher, proxy storepb.StoreServer, deduplicate bool, maxResolutionMillis int64, partialResponse, enableQueryPushdown, skipChunks bool, selectGate gate.Gate, selectTimeout time.Duration, shardInfo *storepb.ShardInfo, seriesStatsReporter SeriesStatsReporter) *querier { +func newQuerier( + ctx context.Context, + logger log.Logger, + mint, + maxt int64, + replicaLabels []string, + storeDebugMatchers [][]*labels.Matcher, + proxy storepb.StoreServer, + deduplicate bool, + maxResolutionMillis int64, + partialResponse, + enableQueryPushdown, + skipChunks bool, + selectGate gate.Gate, + selectTimeout time.Duration, + shardInfo *storepb.ShardInfo, + seriesStatsReporter SeriesStatsReporter, +) *querier { if logger == nil { logger = log.NewNopLogger() } From 6803e4a648c00a84a8f6096f0b221ec860d8a367 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Thu, 29 Sep 2022 11:54:38 +0200 Subject: [PATCH 06/31] Remove TODO comment Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- pkg/api/query/grpc.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/api/query/grpc.go b/pkg/api/query/grpc.go index 032cca9fc9..8848cd2ffe 100644 --- a/pkg/api/query/grpc.go +++ b/pkg/api/query/grpc.go @@ -169,7 +169,6 @@ func (g *GRPCAPI) QueryRange(request *querypb.QueryRangeRequest, srv querypb.Que request.EnableQueryPushdown, false, request.ShardInfo, - // TODO(douglascamata): do we need to do something here? Is there GRPC and HTTP? query.NoopSeriesStatsReporter, ) From 3b3bc0c0873c911d4b5e8ea832e615f4dda7ea88 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Thu, 29 Sep 2022 15:28:43 +0200 Subject: [PATCH 07/31] Fix query tests Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- pkg/api/query/v1.go | 18 +++++++++++------- pkg/api/query/v1_test.go | 4 ++++ pkg/store/metrics/metrics.go | 10 ++++++++++ 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/pkg/api/query/v1.go b/pkg/api/query/v1.go index 193c21d41a..2c20aa2cc7 100644 --- a/pkg/api/query/v1.go +++ b/pkg/api/query/v1.go @@ -30,7 +30,6 @@ import ( "time" "github.com/go-kit/log" - "github.com/go-kit/log/level" "github.com/opentracing/opentracing-go" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" @@ -42,11 +41,10 @@ import ( "github.com/prometheus/prometheus/promql" "github.com/prometheus/prometheus/promql/parser" "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/util/stats" v1 "github.com/prometheus/prometheus/web/api/v1" "github.com/thanos-io/thanos/pkg/store/metrics" - "github.com/prometheus/prometheus/util/stats" - "github.com/thanos-io/thanos/pkg/api" "github.com/thanos-io/thanos/pkg/exemplars" "github.com/thanos-io/thanos/pkg/exemplars/exemplarspb" @@ -110,7 +108,12 @@ type QueryAPI struct { queryRangeHist prometheus.Histogram - seriesStatsAggregator *metrics.SeriesQueryPerformanceMetricsAggregator + seriesStatsAggregator seriesQueryPerformanceMetricsAggregator +} + +type seriesQueryPerformanceMetricsAggregator interface { + Aggregate(seriesStats storepb.SeriesStatsCounter) + Observe(duration float64) } // NewQueryAPI returns an initialized QueryAPI type. @@ -138,9 +141,12 @@ func NewQueryAPI( defaultMetadataTimeRange time.Duration, disableCORS bool, gate gate.Gate, - statsAggregator *metrics.SeriesQueryPerformanceMetricsAggregator, + statsAggregator seriesQueryPerformanceMetricsAggregator, reg *prometheus.Registry, ) *QueryAPI { + if statsAggregator == nil { + statsAggregator = &metrics.NopSeriesQueryPerformanceMetricsAggregator{} + } return &QueryAPI{ baseAPI: api.NewBaseAPI(logger, disableCORS, flagsMap), logger: logger, @@ -595,10 +601,8 @@ func (qapi *QueryAPI) queryRange(r *http.Request) (interface{}, []error, *api.Ap } return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: res.Err}, qry.Close } - level.Info(qapi.logger).Log("totalStats", len(seriesStats)) for i := range seriesStats { qapi.seriesStatsAggregator.Aggregate(seriesStats[i]) - level.Info(qapi.logger).Log("series", seriesStats[i].Series, "samples", seriesStats[i].Samples) } qapi.seriesStatsAggregator.Observe(time.Since(beforeRange).Seconds()) diff --git a/pkg/api/query/v1_test.go b/pkg/api/query/v1_test.go index 000410ddbd..0f213ff6ed 100644 --- a/pkg/api/query/v1_test.go +++ b/pkg/api/query/v1_test.go @@ -45,6 +45,7 @@ import ( promgate "github.com/prometheus/prometheus/util/gate" "github.com/prometheus/prometheus/util/stats" "github.com/thanos-io/thanos/pkg/compact" + "github.com/thanos-io/thanos/pkg/store/metrics" baseAPI "github.com/thanos-io/thanos/pkg/api" "github.com/thanos-io/thanos/pkg/component" @@ -198,6 +199,7 @@ func TestQueryEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), + seriesStatsAggregator: &metrics.NopSeriesQueryPerformanceMetricsAggregator{}, } start := time.Unix(0, 0) @@ -737,6 +739,7 @@ func TestMetadataEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), + seriesStatsAggregator: &metrics.NopSeriesQueryPerformanceMetricsAggregator{}, } apiWithLabelLookback := &QueryAPI{ baseAPI: &baseAPI.BaseAPI{ @@ -750,6 +753,7 @@ func TestMetadataEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), + seriesStatsAggregator: &metrics.NopSeriesQueryPerformanceMetricsAggregator{}, } var tests = []endpointTestCase{ diff --git a/pkg/store/metrics/metrics.go b/pkg/store/metrics/metrics.go index 3738da25f4..d45b1923f9 100644 --- a/pkg/store/metrics/metrics.go +++ b/pkg/store/metrics/metrics.go @@ -73,3 +73,13 @@ func (s *SeriesQueryPerformanceMetricsAggregator) findBucket(value float64, quan } return foundBucket } + +type NopSeriesQueryPerformanceMetricsAggregator struct{} + +func (s *NopSeriesQueryPerformanceMetricsAggregator) Aggregate(_ storepb.SeriesStatsCounter) { + return +} + +func (s *NopSeriesQueryPerformanceMetricsAggregator) Observe(_ float64) { + return +} From 2e8778d2f53332432cf586c3dd7aea10322c832f Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Thu, 29 Sep 2022 15:29:00 +0200 Subject: [PATCH 08/31] Reformat query docs Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- docs/components/query.md | 129 +++++++++++++++++++++------------------ 1 file changed, 69 insertions(+), 60 deletions(-) diff --git a/docs/components/query.md b/docs/components/query.md index 72e3d6e1ab..40f418cb0d 100644 --- a/docs/components/query.md +++ b/docs/components/query.md @@ -262,26 +262,26 @@ Flags: --alert.query-url=ALERT.QUERY-URL The external Thanos Query URL that would be set in all alerts 'Source' field. - --enable-feature= ... Comma separated experimental feature names to - enable.The current list of features is + --enable-feature= ... Comma separated experimental feature names + to enable.The current list of features is query-pushdown. - --endpoint= ... Addresses of statically configured Thanos API - servers (repeatable). The scheme may be + --endpoint= ... Addresses of statically configured Thanos + API servers (repeatable). The scheme may be prefixed with 'dns+' or 'dnssrv+' to detect Thanos API servers through respective DNS lookups. --endpoint-strict= ... Addresses of only statically configured Thanos - API servers that are always used, even if the - health check fails. Useful if you have a + API servers that are always used, even if + the health check fails. Useful if you have a caching layer on top. --grpc-address="0.0.0.0:10901" Listen ip:port address for gRPC endpoints (StoreAPI). Make sure this address is routable from other components. --grpc-client-server-name="" - Server name to verify the hostname on the - returned gRPC certificates. See + Server name to verify the hostname on + the returned gRPC certificates. See https://tools.ietf.org/html/rfc4366#section-3.1 --grpc-client-tls-ca="" TLS CA Certificates to use to verify gRPC servers @@ -297,14 +297,14 @@ Flags: --grpc-grace-period=2m Time to wait after an interrupt received for GRPC Server. --grpc-server-max-connection-age=60m - The grpc server max connection age. This - controls how often to re-read the tls + The grpc server max connection age. + This controls how often to re-read the tls certificates and redo the TLS handshake --grpc-server-tls-cert="" TLS Certificate for gRPC server, leave blank to disable TLS --grpc-server-tls-client-ca="" - TLS CA to verify clients against. If no client - CA is specified, there is no client + TLS CA to verify clients against. If no + client CA is specified, there is no client verification on server side. (tls.NoClientCert) --grpc-server-tls-key="" TLS Key for the gRPC server, leave blank to disable TLS @@ -320,10 +320,10 @@ Flags: --log.format=logfmt Log format to use. Possible options: logfmt or json. --log.level=info Log filtering level. - --log.request.decision= Deprecation Warning - This flag would be soon - deprecated, and replaced with - `request.logging-config`. Request Logging for - logging the start and end of requests. By + --log.request.decision= Deprecation Warning - This flag would + be soon deprecated, and replaced with + `request.logging-config`. Request Logging + for logging the start and end of requests. By default this flag is disabled. LogFinishCall: Logs the finish call of the requests. LogStartAndFinishCall: Logs the start and @@ -340,24 +340,24 @@ Flags: queries. --query.default-step=1s Set default step for range queries. Default step is only used when step is not set in UI. - In such cases, Thanos UI will use default step - to calculate resolution (resolution = - max(rangeSeconds / 250, defaultStep)). This - will not work from Grafana, but Grafana has - __step variable which can be used. + In such cases, Thanos UI will use default + step to calculate resolution (resolution + = max(rangeSeconds / 250, defaultStep)). + This will not work from Grafana, but Grafana + has __step variable which can be used. --query.lookback-delta=QUERY.LOOKBACK-DELTA The maximum lookback duration for retrieving - metrics during expression evaluations. PromQL - always evaluates the query for the certain - timestamp (query range timestamps are deduced - by step). Since scrape intervals might be - different, PromQL looks back for given amount - of time to get latest sample. If it exceeds the - maximum lookback delta it assumes series is - stale and returns none (a gap). This is why - lookback delta should be set to at least 2 - times of the slowest scrape interval. If unset - it will use the promql default of 5m. + metrics during expression evaluations. + PromQL always evaluates the query for the + certain timestamp (query range timestamps are + deduced by step). Since scrape intervals might + be different, PromQL looks back for given + amount of time to get latest sample. If it + exceeds the maximum lookback delta it assumes + series is stale and returns none (a gap). + This is why lookback delta should be set to at + least 2 times of the slowest scrape interval. + If unset it will use the promql default of 5m. --query.max-concurrent=20 Maximum number of queries processed concurrently by query node. --query.max-concurrent-select=4 @@ -369,21 +369,30 @@ Flags: when the range parameters are not specified. The zero value means range covers the time since the beginning. - --query.partial-response Enable partial response for queries if no - partial_response param is specified. + --query.partial-response Enable partial response for queries if + no partial_response param is specified. --no-query.partial-response for disabling. --query.replica-label=QUERY.REPLICA-LABEL ... Labels to treat as a replica indicator along - which data is deduplicated. Still you will be - able to query without deduplication using + which data is deduplicated. Still you will + be able to query without deduplication using 'dedup=false' parameter. Data includes time series, recording rules, and alerting rules. + --query.telemetry.request-duration-seconds-quantiles=0.1... ... + The quantiles for exporting metrics about the + request duration quantiles. + --query.telemetry.request-samples-quantiles=100... ... + The quantiles for exporting metrics about the + samples count quantiles. + --query.telemetry.request-series-seconds-quantiles=10... ... + The quantiles for exporting metrics about the + series count quantiles. --query.timeout=2m Maximum time to process query by query node. --request.logging-config= Alternative to 'request.logging-config-file' - flag (mutually exclusive). Content of YAML file - with request logging configuration. See format - details: + flag (mutually exclusive). Content + of YAML file with request logging + configuration. See format details: https://thanos.io/tip/thanos/logging.md/#configuration --request.logging-config-file= Path to YAML file with request logging @@ -416,47 +425,47 @@ Flags: Path to files that contain addresses of store API servers. The path can be a glob pattern (repeatable). - --store.sd-interval=5m Refresh interval to re-read file SD files. It - is used as a resync fallback. + --store.sd-interval=5m Refresh interval to re-read file SD files. + It is used as a resync fallback. --store.unhealthy-timeout=5m Timeout before an unhealthy store is cleaned from the store UI page. --tracing.config= Alternative to 'tracing.config-file' flag - (mutually exclusive). Content of YAML file with - tracing configuration. See format details: + (mutually exclusive). Content of YAML file + with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration --tracing.config-file= - Path to YAML file with tracing configuration. - See format details: + Path to YAML file with tracing + configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration --version Show application version. --web.disable-cors Whether to disable CORS headers to be set by Thanos. By default Thanos sets CORS headers to be allowed by all. - --web.external-prefix="" Static prefix for all HTML links and redirect - URLs in the UI query web interface. Actual - endpoints are still served on / or the + --web.external-prefix="" Static prefix for all HTML links and + redirect URLs in the UI query web interface. + Actual endpoints are still served on / or the web.route-prefix. This allows thanos UI to be served behind a reverse proxy that strips a URL sub-path. --web.prefix-header="" Name of HTTP request header used for dynamic - prefixing of UI links and redirects. This - option is ignored if web.external-prefix - argument is set. Security risk: enable this - option only if a reverse proxy in front of - thanos is resetting the header. The - --web.prefix-header=X-Forwarded-Prefix option - can be useful, for example, if Thanos UI is - served via Traefik reverse proxy with + prefixing of UI links and redirects. + This option is ignored if web.external-prefix + argument is set. Security risk: enable + this option only if a reverse proxy in + front of thanos is resetting the header. + The --web.prefix-header=X-Forwarded-Prefix + option can be useful, for example, if Thanos + UI is served via Traefik reverse proxy with PathPrefixStrip option enabled, which sends the stripped prefix value in X-Forwarded-Prefix header. This allows thanos UI to be served on a sub-path. --web.route-prefix="" Prefix for API and UI endpoints. This allows - thanos UI to be served on a sub-path. Defaults - to the value of --web.external-prefix. This - option is analogous to --web.route-prefix of - Prometheus. + thanos UI to be served on a sub-path. + Defaults to the value of --web.external-prefix. + This option is analogous to --web.route-prefix + of Prometheus. ``` From a73840705bdf9bffaf7a2a49663b732b6219ffc3 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Thu, 29 Sep 2022 15:56:41 +0200 Subject: [PATCH 09/31] Remove useless return Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- pkg/store/metrics/metrics.go | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pkg/store/metrics/metrics.go b/pkg/store/metrics/metrics.go index d45b1923f9..14cdab4fea 100644 --- a/pkg/store/metrics/metrics.go +++ b/pkg/store/metrics/metrics.go @@ -74,12 +74,9 @@ func (s *SeriesQueryPerformanceMetricsAggregator) findBucket(value float64, quan return foundBucket } +// NopSeriesQueryPerformanceMetricsAggregator is a query performance series aggregator that does nothing. type NopSeriesQueryPerformanceMetricsAggregator struct{} -func (s *NopSeriesQueryPerformanceMetricsAggregator) Aggregate(_ storepb.SeriesStatsCounter) { - return -} +func (s *NopSeriesQueryPerformanceMetricsAggregator) Aggregate(_ storepb.SeriesStatsCounter) {} -func (s *NopSeriesQueryPerformanceMetricsAggregator) Observe(_ float64) { - return -} +func (s *NopSeriesQueryPerformanceMetricsAggregator) Observe(_ float64) {} From 90572634bc49b9a07a2d7dda20339d314b997b2f Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Thu, 29 Sep 2022 16:47:46 +0200 Subject: [PATCH 10/31] Put back old query docs Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- docs/components/query.md | 129 ++++++++++++++++++--------------------- 1 file changed, 60 insertions(+), 69 deletions(-) diff --git a/docs/components/query.md b/docs/components/query.md index 40f418cb0d..72e3d6e1ab 100644 --- a/docs/components/query.md +++ b/docs/components/query.md @@ -262,26 +262,26 @@ Flags: --alert.query-url=ALERT.QUERY-URL The external Thanos Query URL that would be set in all alerts 'Source' field. - --enable-feature= ... Comma separated experimental feature names - to enable.The current list of features is + --enable-feature= ... Comma separated experimental feature names to + enable.The current list of features is query-pushdown. - --endpoint= ... Addresses of statically configured Thanos - API servers (repeatable). The scheme may be + --endpoint= ... Addresses of statically configured Thanos API + servers (repeatable). The scheme may be prefixed with 'dns+' or 'dnssrv+' to detect Thanos API servers through respective DNS lookups. --endpoint-strict= ... Addresses of only statically configured Thanos - API servers that are always used, even if - the health check fails. Useful if you have a + API servers that are always used, even if the + health check fails. Useful if you have a caching layer on top. --grpc-address="0.0.0.0:10901" Listen ip:port address for gRPC endpoints (StoreAPI). Make sure this address is routable from other components. --grpc-client-server-name="" - Server name to verify the hostname on - the returned gRPC certificates. See + Server name to verify the hostname on the + returned gRPC certificates. See https://tools.ietf.org/html/rfc4366#section-3.1 --grpc-client-tls-ca="" TLS CA Certificates to use to verify gRPC servers @@ -297,14 +297,14 @@ Flags: --grpc-grace-period=2m Time to wait after an interrupt received for GRPC Server. --grpc-server-max-connection-age=60m - The grpc server max connection age. - This controls how often to re-read the tls + The grpc server max connection age. This + controls how often to re-read the tls certificates and redo the TLS handshake --grpc-server-tls-cert="" TLS Certificate for gRPC server, leave blank to disable TLS --grpc-server-tls-client-ca="" - TLS CA to verify clients against. If no - client CA is specified, there is no client + TLS CA to verify clients against. If no client + CA is specified, there is no client verification on server side. (tls.NoClientCert) --grpc-server-tls-key="" TLS Key for the gRPC server, leave blank to disable TLS @@ -320,10 +320,10 @@ Flags: --log.format=logfmt Log format to use. Possible options: logfmt or json. --log.level=info Log filtering level. - --log.request.decision= Deprecation Warning - This flag would - be soon deprecated, and replaced with - `request.logging-config`. Request Logging - for logging the start and end of requests. By + --log.request.decision= Deprecation Warning - This flag would be soon + deprecated, and replaced with + `request.logging-config`. Request Logging for + logging the start and end of requests. By default this flag is disabled. LogFinishCall: Logs the finish call of the requests. LogStartAndFinishCall: Logs the start and @@ -340,24 +340,24 @@ Flags: queries. --query.default-step=1s Set default step for range queries. Default step is only used when step is not set in UI. - In such cases, Thanos UI will use default - step to calculate resolution (resolution - = max(rangeSeconds / 250, defaultStep)). - This will not work from Grafana, but Grafana - has __step variable which can be used. + In such cases, Thanos UI will use default step + to calculate resolution (resolution = + max(rangeSeconds / 250, defaultStep)). This + will not work from Grafana, but Grafana has + __step variable which can be used. --query.lookback-delta=QUERY.LOOKBACK-DELTA The maximum lookback duration for retrieving - metrics during expression evaluations. - PromQL always evaluates the query for the - certain timestamp (query range timestamps are - deduced by step). Since scrape intervals might - be different, PromQL looks back for given - amount of time to get latest sample. If it - exceeds the maximum lookback delta it assumes - series is stale and returns none (a gap). - This is why lookback delta should be set to at - least 2 times of the slowest scrape interval. - If unset it will use the promql default of 5m. + metrics during expression evaluations. PromQL + always evaluates the query for the certain + timestamp (query range timestamps are deduced + by step). Since scrape intervals might be + different, PromQL looks back for given amount + of time to get latest sample. If it exceeds the + maximum lookback delta it assumes series is + stale and returns none (a gap). This is why + lookback delta should be set to at least 2 + times of the slowest scrape interval. If unset + it will use the promql default of 5m. --query.max-concurrent=20 Maximum number of queries processed concurrently by query node. --query.max-concurrent-select=4 @@ -369,30 +369,21 @@ Flags: when the range parameters are not specified. The zero value means range covers the time since the beginning. - --query.partial-response Enable partial response for queries if - no partial_response param is specified. + --query.partial-response Enable partial response for queries if no + partial_response param is specified. --no-query.partial-response for disabling. --query.replica-label=QUERY.REPLICA-LABEL ... Labels to treat as a replica indicator along - which data is deduplicated. Still you will - be able to query without deduplication using + which data is deduplicated. Still you will be + able to query without deduplication using 'dedup=false' parameter. Data includes time series, recording rules, and alerting rules. - --query.telemetry.request-duration-seconds-quantiles=0.1... ... - The quantiles for exporting metrics about the - request duration quantiles. - --query.telemetry.request-samples-quantiles=100... ... - The quantiles for exporting metrics about the - samples count quantiles. - --query.telemetry.request-series-seconds-quantiles=10... ... - The quantiles for exporting metrics about the - series count quantiles. --query.timeout=2m Maximum time to process query by query node. --request.logging-config= Alternative to 'request.logging-config-file' - flag (mutually exclusive). Content - of YAML file with request logging - configuration. See format details: + flag (mutually exclusive). Content of YAML file + with request logging configuration. See format + details: https://thanos.io/tip/thanos/logging.md/#configuration --request.logging-config-file= Path to YAML file with request logging @@ -425,47 +416,47 @@ Flags: Path to files that contain addresses of store API servers. The path can be a glob pattern (repeatable). - --store.sd-interval=5m Refresh interval to re-read file SD files. - It is used as a resync fallback. + --store.sd-interval=5m Refresh interval to re-read file SD files. It + is used as a resync fallback. --store.unhealthy-timeout=5m Timeout before an unhealthy store is cleaned from the store UI page. --tracing.config= Alternative to 'tracing.config-file' flag - (mutually exclusive). Content of YAML file - with tracing configuration. See format details: + (mutually exclusive). Content of YAML file with + tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration --tracing.config-file= - Path to YAML file with tracing - configuration. See format details: + Path to YAML file with tracing configuration. + See format details: https://thanos.io/tip/thanos/tracing.md/#configuration --version Show application version. --web.disable-cors Whether to disable CORS headers to be set by Thanos. By default Thanos sets CORS headers to be allowed by all. - --web.external-prefix="" Static prefix for all HTML links and - redirect URLs in the UI query web interface. - Actual endpoints are still served on / or the + --web.external-prefix="" Static prefix for all HTML links and redirect + URLs in the UI query web interface. Actual + endpoints are still served on / or the web.route-prefix. This allows thanos UI to be served behind a reverse proxy that strips a URL sub-path. --web.prefix-header="" Name of HTTP request header used for dynamic - prefixing of UI links and redirects. - This option is ignored if web.external-prefix - argument is set. Security risk: enable - this option only if a reverse proxy in - front of thanos is resetting the header. - The --web.prefix-header=X-Forwarded-Prefix - option can be useful, for example, if Thanos - UI is served via Traefik reverse proxy with + prefixing of UI links and redirects. This + option is ignored if web.external-prefix + argument is set. Security risk: enable this + option only if a reverse proxy in front of + thanos is resetting the header. The + --web.prefix-header=X-Forwarded-Prefix option + can be useful, for example, if Thanos UI is + served via Traefik reverse proxy with PathPrefixStrip option enabled, which sends the stripped prefix value in X-Forwarded-Prefix header. This allows thanos UI to be served on a sub-path. --web.route-prefix="" Prefix for API and UI endpoints. This allows - thanos UI to be served on a sub-path. - Defaults to the value of --web.external-prefix. - This option is analogous to --web.route-prefix - of Prometheus. + thanos UI to be served on a sub-path. Defaults + to the value of --web.external-prefix. This + option is analogous to --web.route-prefix of + Prometheus. ``` From f0567a9fe9605fb2d70a773c26a836c23a956aa5 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Thu, 29 Sep 2022 16:48:52 +0200 Subject: [PATCH 11/31] Update query docs again Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- docs/components/query.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/components/query.md b/docs/components/query.md index 72e3d6e1ab..37c540c48a 100644 --- a/docs/components/query.md +++ b/docs/components/query.md @@ -378,6 +378,15 @@ Flags: able to query without deduplication using 'dedup=false' parameter. Data includes time series, recording rules, and alerting rules. + --query.telemetry.request-duration-seconds-quantiles=0.1... ... + The quantiles for exporting metrics about the + request duration quantiles. + --query.telemetry.request-samples-quantiles=100... ... + The quantiles for exporting metrics about the + samples count quantiles. + --query.telemetry.request-series-seconds-quantiles=10... ... + The quantiles for exporting metrics about the + series count quantiles. --query.timeout=2m Maximum time to process query by query node. --request.logging-config= Alternative to 'request.logging-config-file' From 792487f9fbab9088880d10a7cfd0ff13433b1001 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Thu, 29 Sep 2022 17:19:02 +0200 Subject: [PATCH 12/31] Fix e2e env name Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- test/e2e/query_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/e2e/query_test.go b/test/e2e/query_test.go index 2837a520ee..13eea45715 100644 --- a/test/e2e/query_test.go +++ b/test/e2e/query_test.go @@ -583,7 +583,7 @@ func TestQueryStoreMetrics(t *testing.T) { t.Parallel() // Build up. - e, err := e2e.New(e2e.WithName("e2e-query-store-metrics")) + e, err := e2e.New(e2e.WithName("storemetrics01")) testutil.Ok(t, err) t.Cleanup(e2ethanos.CleanScenario(t, e)) @@ -676,7 +676,7 @@ func TestQueryStoreMetrics(t *testing.T) { &model.Sample{ Metric: model.Metric{ "__name__": "thanos_store_api_query_duration_seconds_count", - "instance": "e2e-query-store-metrics-querier-1:8080", + "instance": "storemetrics01-querier-1:8080", "job": "querier-1", "samples_le": "100000", "series_le": "10000", @@ -693,7 +693,7 @@ func TestQueryStoreMetrics(t *testing.T) { &model.Sample{ Metric: model.Metric{ "__name__": "thanos_store_api_query_duration_seconds_count", - "instance": "e2e-query-store-metrics-querier-1:8080", + "instance": "storemetrics01-querier-1:8080", "job": "querier-1", "samples_le": "100", "series_le": "10", From d373d2886d1d82b5255dea89e40f1d1ee20ae75f Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Thu, 29 Sep 2022 17:41:30 +0200 Subject: [PATCH 13/31] Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> From 3470ded4cc5a22a7759960be0087e3f8de1963a9 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Fri, 30 Sep 2022 12:18:59 +0200 Subject: [PATCH 14/31] Add missing copyright notice. Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- pkg/store/metrics/metrics.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/store/metrics/metrics.go b/pkg/store/metrics/metrics.go index 14cdab4fea..0a5b8f92ff 100644 --- a/pkg/store/metrics/metrics.go +++ b/pkg/store/metrics/metrics.go @@ -1,3 +1,6 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + package metrics import ( From 63d45e8f5bbb7f0ed1ff39b5279876785ae00857 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Fri, 30 Sep 2022 13:18:15 +0200 Subject: [PATCH 15/31] Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> From 2a4241dd6b03220e5a9526937b4029c30d335651 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Fri, 30 Sep 2022 15:30:48 +0200 Subject: [PATCH 16/31] Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> From e04acb2b2a79a7a283b2b2b763559aecb8bfd744 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Fri, 30 Sep 2022 16:32:30 +0200 Subject: [PATCH 17/31] Bump wait time to twice scrape interval Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- test/e2e/query_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/query_test.go b/test/e2e/query_test.go index 13eea45715..c7ba14838b 100644 --- a/test/e2e/query_test.go +++ b/test/e2e/query_test.go @@ -1038,7 +1038,7 @@ func queryWaitAndAssert(t *testing.T, ctx context.Context, addr string, q func() "caller", "queryWaitAndAssert", "msg", fmt.Sprintf("Waiting for %d results for query %s", len(expected), q()), ) - testutil.Ok(t, runutil.RetryWithLog(logger, 5*time.Second, ctx.Done(), func() error { + testutil.Ok(t, runutil.RetryWithLog(logger, 10*time.Second, ctx.Done(), func() error { res, warnings, err := promclient.NewDefaultClient().QueryInstant(ctx, urlParse(t, "http://"+addr), q(), ts(), opts) if err != nil { return err From 8e999bae8256ed77f75f29549b0c243768a874f1 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Fri, 30 Sep 2022 16:46:08 +0200 Subject: [PATCH 18/31] Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> From 6a097f0f9e77edd3a2531bfd346dd0fdbe6f0727 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Mon, 3 Oct 2022 14:28:05 +0200 Subject: [PATCH 19/31] Attempt to fix randomly failing test Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- test/e2e/query_test.go | 44 +++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/test/e2e/query_test.go b/test/e2e/query_test.go index c7ba14838b..524b82145b 100644 --- a/test/e2e/query_test.go +++ b/test/e2e/query_test.go @@ -634,7 +634,7 @@ func TestQueryStoreMetrics(t *testing.T) { } } - s1 := e2ethanos.NewStoreGW( + storeGW := e2ethanos.NewStoreGW( e, "s1", client.BucketConfig{ @@ -644,25 +644,27 @@ func TestQueryStoreMetrics(t *testing.T) { "", nil, ) - testutil.Ok(t, e2e.StartAndWaitReady(s1)) + testutil.Ok(t, e2e.StartAndWaitReady(storeGW)) - q := e2ethanos.NewQuerierBuilder(e, "1", s1.InternalEndpoint("grpc")).Init() - testutil.Ok(t, e2e.StartAndWaitReady(q)) - testutil.Ok(t, s1.WaitSumMetrics(e2emon.Equals(2), "thanos_blocks_meta_synced")) + querier := e2ethanos.NewQuerierBuilder(e, "1", storeGW.InternalEndpoint("grpc")).Init() + testutil.Ok(t, e2e.StartAndWaitReady(querier)) + testutil.Ok(t, storeGW.WaitSumMetrics(e2emon.Equals(2), "thanos_blocks_meta_synced")) // Querying the series in the previously created blocks to ensure we produce Store API query metrics. { - instantQuery(t, ctx, q.Endpoint("http"), func() string { + _, err := simpleInstantQuery(t, ctx, querier.Endpoint("http"), func() string { return "max_over_time(one_series{instance='foo_0'}[2h])" }, time.Now, promclient.QueryOptions{ Deduplicate: true, }, 1) + testutil.Ok(t, err) - instantQuery(t, ctx, q.Endpoint("http"), func() string { + _, err = simpleInstantQuery(t, ctx, querier.Endpoint("http"), func() string { return "max_over_time(thousand_one_series[2h])" }, time.Now, promclient.QueryOptions{ Deduplicate: true, }, 1001) + testutil.Ok(t, err) } mon, err := e2emon.Start(e) @@ -1008,18 +1010,10 @@ func instantQuery(t testing.TB, ctx context.Context, addr string, q func() strin "msg", fmt.Sprintf("Waiting for %d results for query %s", expectedSeriesLen, q()), ) testutil.Ok(t, runutil.RetryWithLog(logger, 5*time.Second, ctx.Done(), func() error { - res, warnings, err := promclient.NewDefaultClient().QueryInstant(ctx, urlParse(t, "http://"+addr), q(), ts(), opts) + res, err := simpleInstantQuery(t, ctx, addr, q, ts, opts, expectedSeriesLen) if err != nil { return err } - - if len(warnings) > 0 { - return errors.Errorf("unexpected warnings %s", warnings) - } - - if len(res) != expectedSeriesLen { - return errors.Errorf("unexpected result size, expected %d; result %d: %v", expectedSeriesLen, len(res), res) - } result = res return nil })) @@ -1027,6 +1021,24 @@ func instantQuery(t testing.TB, ctx context.Context, addr string, q func() strin return result } +func simpleInstantQuery(t testing.TB, ctx context.Context, addr string, q func() string, ts func() time.Time, opts promclient.QueryOptions, expectedSeriesLen int) (model.Vector, error) { + res, warnings, err := promclient.NewDefaultClient().QueryInstant(ctx, urlParse(t, "http://"+addr), q(), ts(), opts) + if err != nil { + return nil, err + } + + if len(warnings) > 0 { + return nil, errors.Errorf("unexpected warnings %s", warnings) + } + + if len(res) != expectedSeriesLen { + return nil, errors.Errorf("unexpected result size, expected %d; result %d: %v", expectedSeriesLen, len(res), res) + } + + sortResults(res) + return res, nil +} + func queryWaitAndAssert(t *testing.T, ctx context.Context, addr string, q func() string, ts func() time.Time, opts promclient.QueryOptions, expected model.Vector) { t.Helper() From 61315101c0ee5a1ce1bcafb51b82bee9a6f8e06c Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Mon, 3 Oct 2022 15:02:31 +0200 Subject: [PATCH 20/31] Checking more metrics to ensure the store is ready Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- test/e2e/query_test.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/e2e/query_test.go b/test/e2e/query_test.go index 524b82145b..255edaff5e 100644 --- a/test/e2e/query_test.go +++ b/test/e2e/query_test.go @@ -644,11 +644,12 @@ func TestQueryStoreMetrics(t *testing.T) { "", nil, ) - testutil.Ok(t, e2e.StartAndWaitReady(storeGW)) - querier := e2ethanos.NewQuerierBuilder(e, "1", storeGW.InternalEndpoint("grpc")).Init() - testutil.Ok(t, e2e.StartAndWaitReady(querier)) + testutil.Ok(t, e2e.StartAndWaitReady(storeGW, querier)) testutil.Ok(t, storeGW.WaitSumMetrics(e2emon.Equals(2), "thanos_blocks_meta_synced")) + testutil.Ok(t, storeGW.WaitSumMetrics(e2emon.Equals(2), "thanos_blocks_meta_syncs_total")) + testutil.Ok(t, storeGW.WaitSumMetrics(e2emon.Equals(0), "thanos_blocks_meta_sync_failures_total")) + testutil.Ok(t, storeGW.WaitSumMetrics(e2emon.Equals(0), "thanos_blocks_meta_modified")) // Querying the series in the previously created blocks to ensure we produce Store API query metrics. { From 8bc38bbd6b38b81e9d94b28b88fdf486f3ba4da9 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Mon, 3 Oct 2022 15:21:17 +0200 Subject: [PATCH 21/31] Clean up test Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- test/e2e/query_test.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/e2e/query_test.go b/test/e2e/query_test.go index 255edaff5e..629aef044d 100644 --- a/test/e2e/query_test.go +++ b/test/e2e/query_test.go @@ -647,9 +647,6 @@ func TestQueryStoreMetrics(t *testing.T) { querier := e2ethanos.NewQuerierBuilder(e, "1", storeGW.InternalEndpoint("grpc")).Init() testutil.Ok(t, e2e.StartAndWaitReady(storeGW, querier)) testutil.Ok(t, storeGW.WaitSumMetrics(e2emon.Equals(2), "thanos_blocks_meta_synced")) - testutil.Ok(t, storeGW.WaitSumMetrics(e2emon.Equals(2), "thanos_blocks_meta_syncs_total")) - testutil.Ok(t, storeGW.WaitSumMetrics(e2emon.Equals(0), "thanos_blocks_meta_sync_failures_total")) - testutil.Ok(t, storeGW.WaitSumMetrics(e2emon.Equals(0), "thanos_blocks_meta_modified")) // Querying the series in the previously created blocks to ensure we produce Store API query metrics. { From ffcc2d417d968b1fdf16bdc248540eeba9f1d6cc Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Mon, 3 Oct 2022 15:22:14 +0200 Subject: [PATCH 22/31] Do not record store api metrics when didn't touch series or samples Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- pkg/store/metrics/metrics.go | 3 +++ test/e2e/query_test.go | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pkg/store/metrics/metrics.go b/pkg/store/metrics/metrics.go index 0a5b8f92ff..41df3e3f2b 100644 --- a/pkg/store/metrics/metrics.go +++ b/pkg/store/metrics/metrics.go @@ -49,6 +49,9 @@ func (s *SeriesQueryPerformanceMetricsAggregator) Aggregate(stats storepb.Series // Observe commits the aggregated SeriesStatsCounter as an observation. func (s *SeriesQueryPerformanceMetricsAggregator) Observe(duration float64) { + if s.SeriesStats.Series == 0 || s.SeriesStats.Samples == 0 { + return + } // Bucket matching for series/labels matchSeriesBucket/matchSamplesBucket => float64, float64 seriesLeBucket := s.findBucket(float64(s.SeriesStats.Series), s.SeriesLeBuckets) samplesLeBucket := s.findBucket(float64(s.SeriesStats.Samples), s.SamplesLeBuckets) diff --git a/test/e2e/query_test.go b/test/e2e/query_test.go index 629aef044d..2434f97ab7 100644 --- a/test/e2e/query_test.go +++ b/test/e2e/query_test.go @@ -650,14 +650,14 @@ func TestQueryStoreMetrics(t *testing.T) { // Querying the series in the previously created blocks to ensure we produce Store API query metrics. { - _, err := simpleInstantQuery(t, ctx, querier.Endpoint("http"), func() string { + instantQuery(t, ctx, querier.Endpoint("http"), func() string { return "max_over_time(one_series{instance='foo_0'}[2h])" }, time.Now, promclient.QueryOptions{ Deduplicate: true, }, 1) testutil.Ok(t, err) - _, err = simpleInstantQuery(t, ctx, querier.Endpoint("http"), func() string { + instantQuery(t, ctx, querier.Endpoint("http"), func() string { return "max_over_time(thousand_one_series[2h])" }, time.Now, promclient.QueryOptions{ Deduplicate: true, From 6a13a7fc2e5122f86ca3c0e333cc97da2c3c75ad Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Mon, 3 Oct 2022 15:49:27 +0200 Subject: [PATCH 23/31] Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> From 31a9db819af1b0da8f96498e014b29b7e4780e88 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Mon, 3 Oct 2022 16:45:37 +0200 Subject: [PATCH 24/31] Also skip store api metrics on zero chunks touched Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- pkg/store/metrics/metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/store/metrics/metrics.go b/pkg/store/metrics/metrics.go index 41df3e3f2b..d16db91ddf 100644 --- a/pkg/store/metrics/metrics.go +++ b/pkg/store/metrics/metrics.go @@ -49,7 +49,7 @@ func (s *SeriesQueryPerformanceMetricsAggregator) Aggregate(stats storepb.Series // Observe commits the aggregated SeriesStatsCounter as an observation. func (s *SeriesQueryPerformanceMetricsAggregator) Observe(duration float64) { - if s.SeriesStats.Series == 0 || s.SeriesStats.Samples == 0 { + if s.SeriesStats.Series == 0 || s.SeriesStats.Samples == 0 || s.SeriesStats.Chunks == 0 { return } // Bucket matching for series/labels matchSeriesBucket/matchSamplesBucket => float64, float64 From 5568a14d414314af9e1620d07f52cfcedf89c5c0 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Mon, 3 Oct 2022 16:53:42 +0200 Subject: [PATCH 25/31] Update changelog Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 502c4e18cf..72dbd5405b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5723](https://github.com/thanos-io/thanos/pull/5723) Compactor: Support disable block viewer UI. - [#5674](https://github.com/thanos-io/thanos/pull/5674) Query Frontend/Store: Add support connecting to redis using TLS. - [#5734](https://github.com/thanos-io/thanos/pull/5734) Store: Support disable block viewer UI. +- [#5741](https://github.com/thanos-io/thanos/pull/5741) Query: add metrics on how much data is being selected by downstream Store APIs. ### Changed From dfc1cf95bbfbc37b3964cc494708833cad8601d4 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Tue, 4 Oct 2022 11:14:29 +0200 Subject: [PATCH 26/31] Fix broken changelog after merge Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 589fb78369..458939b45d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,8 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5641](https://github.com/thanos-io/thanos/pull/5641) Store: Remove hardcoded labels in shard matcher. - [#5641](https://github.com/thanos-io/thanos/pull/5641) Query: Inject unshardable le label in query analyzer. - [#5685](https://github.com/thanos-io/thanos/pull/5685) Receive: Make active/head series limiting configuration per tenant by adding it to new limiting config. +- [#5411](https://github.com/thanos-io/thanos/pull/5411) Tracing: Change Jaeger exporter from OpenTracing to OpenTelemetry. *Options `RPC Metrics`, `Gen128Bit` and `Disabled` are now deprecated and won't have any effect when set :warning:.* + ### Removed From dd1c104eb1b0a1305677cb0ada5adac3bb11bad5 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Tue, 4 Oct 2022 11:15:26 +0200 Subject: [PATCH 27/31] Remove extra empty line Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 458939b45d..aade824c6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,7 +37,6 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5685](https://github.com/thanos-io/thanos/pull/5685) Receive: Make active/head series limiting configuration per tenant by adding it to new limiting config. - [#5411](https://github.com/thanos-io/thanos/pull/5411) Tracing: Change Jaeger exporter from OpenTracing to OpenTelemetry. *Options `RPC Metrics`, `Gen128Bit` and `Disabled` are now deprecated and won't have any effect when set :warning:.* - ### Removed ## [v0.28.0](https://github.com/thanos-io/thanos/tree/release-0.28) - 2022.08.26 From fed8bf2e5c438153c77baa786809b37ad7be3f10 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Tue, 4 Oct 2022 14:16:18 +0200 Subject: [PATCH 28/31] Refactor names and (un)exported types and fields Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- cmd/thanos/query.go | 2 +- pkg/api/query/v1.go | 2 +- pkg/api/query/v1_test.go | 6 ++-- pkg/query/querier.go | 16 +++++----- pkg/store/metrics/metrics.go | 60 ++++++++++++++++++------------------ 5 files changed, 43 insertions(+), 43 deletions(-) diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index 9eb0307f09..db63cc0629 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -692,7 +692,7 @@ func runQuery( extprom.WrapRegistererWithPrefix("thanos_query_concurrent_", reg), maxConcurrentQueries, ), - metrics.NewSeriesQueryPerformanceMetricsAggregator( + metrics.NewSeriesStatsAggregator( reg, queryTelemetryDurationQuantiles, queryTelemetrySamplesQuantiles, diff --git a/pkg/api/query/v1.go b/pkg/api/query/v1.go index 2c20aa2cc7..b686bfaa6d 100644 --- a/pkg/api/query/v1.go +++ b/pkg/api/query/v1.go @@ -145,7 +145,7 @@ func NewQueryAPI( reg *prometheus.Registry, ) *QueryAPI { if statsAggregator == nil { - statsAggregator = &metrics.NopSeriesQueryPerformanceMetricsAggregator{} + statsAggregator = &metrics.NoopSeriesStatsAggregator{} } return &QueryAPI{ baseAPI: api.NewBaseAPI(logger, disableCORS, flagsMap), diff --git a/pkg/api/query/v1_test.go b/pkg/api/query/v1_test.go index 0f213ff6ed..d307fee8c1 100644 --- a/pkg/api/query/v1_test.go +++ b/pkg/api/query/v1_test.go @@ -199,7 +199,7 @@ func TestQueryEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), - seriesStatsAggregator: &metrics.NopSeriesQueryPerformanceMetricsAggregator{}, + seriesStatsAggregator: &metrics.NoopSeriesStatsAggregator{}, } start := time.Unix(0, 0) @@ -739,7 +739,7 @@ func TestMetadataEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), - seriesStatsAggregator: &metrics.NopSeriesQueryPerformanceMetricsAggregator{}, + seriesStatsAggregator: &metrics.NoopSeriesStatsAggregator{}, } apiWithLabelLookback := &QueryAPI{ baseAPI: &baseAPI.BaseAPI{ @@ -753,7 +753,7 @@ func TestMetadataEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), - seriesStatsAggregator: &metrics.NopSeriesQueryPerformanceMetricsAggregator{}, + seriesStatsAggregator: &metrics.NoopSeriesStatsAggregator{}, } var tests = []endpointTestCase{ diff --git a/pkg/query/querier.go b/pkg/query/querier.go index 07c97c986e..b094cbd45c 100644 --- a/pkg/query/querier.go +++ b/pkg/query/querier.go @@ -29,11 +29,11 @@ import ( "github.com/thanos-io/thanos/pkg/tracing" ) -type SeriesStatsReporter func(seriesStats storepb.SeriesStatsCounter) +type seriesStatsReporter func(seriesStats storepb.SeriesStatsCounter) -var NoopSeriesStatsReporter SeriesStatsReporter = func(_ storepb.SeriesStatsCounter) {} +var NoopSeriesStatsReporter seriesStatsReporter = func(_ storepb.SeriesStatsCounter) {} -func NewAggregateStatsReporter(stats *[]storepb.SeriesStatsCounter) SeriesStatsReporter { +func NewAggregateStatsReporter(stats *[]storepb.SeriesStatsCounter) seriesStatsReporter { var mutex sync.Mutex return func(s storepb.SeriesStatsCounter) { mutex.Lock() @@ -57,7 +57,7 @@ type QueryableCreator func( enableQueryPushdown, skipChunks bool, shardInfo *storepb.ShardInfo, - seriesStatsReporter SeriesStatsReporter, + seriesStatsReporter seriesStatsReporter, ) storage.Queryable // NewQueryableCreator creates QueryableCreator. @@ -81,7 +81,7 @@ func NewQueryableCreator( enableQueryPushdown, skipChunks bool, shardInfo *storepb.ShardInfo, - seriesStatsReporter SeriesStatsReporter, + seriesStatsReporter seriesStatsReporter, ) storage.Queryable { return &queryable{ logger: logger, @@ -118,7 +118,7 @@ type queryable struct { selectTimeout time.Duration enableQueryPushdown bool shardInfo *storepb.ShardInfo - seriesStatsReporter SeriesStatsReporter + seriesStatsReporter seriesStatsReporter } // Querier returns a new storage querier against the underlying proxy store API. @@ -142,7 +142,7 @@ type querier struct { selectGate gate.Gate selectTimeout time.Duration shardInfo *storepb.ShardInfo - seriesStatsReporter SeriesStatsReporter + seriesStatsReporter seriesStatsReporter } // newQuerier creates implementation of storage.Querier that fetches data from the proxy @@ -163,7 +163,7 @@ func newQuerier( selectGate gate.Gate, selectTimeout time.Duration, shardInfo *storepb.ShardInfo, - seriesStatsReporter SeriesStatsReporter, + seriesStatsReporter seriesStatsReporter, ) *querier { if logger == nil { logger = log.NewNopLogger() diff --git a/pkg/store/metrics/metrics.go b/pkg/store/metrics/metrics.go index d16db91ddf..9c74e5703b 100644 --- a/pkg/store/metrics/metrics.go +++ b/pkg/store/metrics/metrics.go @@ -11,62 +11,62 @@ import ( "github.com/thanos-io/thanos/pkg/store/storepb" ) -// SeriesQueryPerformanceMetricsAggregator aggregates results from fanned-out queries into a histogram given their +// seriesStatsAggregator aggregates results from fanned-out queries into a histogram given their // response's shape. -type SeriesQueryPerformanceMetricsAggregator struct { - QueryDuration *prometheus.HistogramVec +type seriesStatsAggregator struct { + queryDuration *prometheus.HistogramVec - SeriesLeBuckets []int64 - SamplesLeBuckets []int64 - SeriesStats storepb.SeriesStatsCounter + seriesLeBuckets []int64 + samplesLeBuckets []int64 + seriesStats storepb.SeriesStatsCounter } -// NewSeriesQueryPerformanceMetricsAggregator is a constructor for SeriesQueryPerformanceMetricsAggregator. -func NewSeriesQueryPerformanceMetricsAggregator( +// NewSeriesStatsAggregator is a constructor for seriesStatsAggregator. +func NewSeriesStatsAggregator( reg prometheus.Registerer, durationQuantiles []float64, sampleQuantiles []int64, seriesQuantiles []int64, -) *SeriesQueryPerformanceMetricsAggregator { - return &SeriesQueryPerformanceMetricsAggregator{ - QueryDuration: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ +) *seriesStatsAggregator { + return &seriesStatsAggregator{ + queryDuration: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ Name: "thanos_store_api_query_duration_seconds", Help: "Duration of the Thanos Store API select phase for a query.", Buckets: durationQuantiles, }, []string{"series_le", "samples_le"}), - SeriesLeBuckets: seriesQuantiles, - SamplesLeBuckets: sampleQuantiles, - SeriesStats: storepb.SeriesStatsCounter{}, + seriesLeBuckets: seriesQuantiles, + samplesLeBuckets: sampleQuantiles, + seriesStats: storepb.SeriesStatsCounter{}, } } // Aggregate is an aggregator for merging `storepb.SeriesStatsCounter` for each incoming fanned out query. -func (s *SeriesQueryPerformanceMetricsAggregator) Aggregate(stats storepb.SeriesStatsCounter) { - s.SeriesStats.Series += stats.Series - s.SeriesStats.Samples += stats.Samples - s.SeriesStats.Chunks += stats.Chunks +func (s *seriesStatsAggregator) Aggregate(stats storepb.SeriesStatsCounter) { + s.seriesStats.Series += stats.Series + s.seriesStats.Samples += stats.Samples + s.seriesStats.Chunks += stats.Chunks } // Observe commits the aggregated SeriesStatsCounter as an observation. -func (s *SeriesQueryPerformanceMetricsAggregator) Observe(duration float64) { - if s.SeriesStats.Series == 0 || s.SeriesStats.Samples == 0 || s.SeriesStats.Chunks == 0 { +func (s *seriesStatsAggregator) Observe(duration float64) { + if s.seriesStats.Series == 0 || s.seriesStats.Samples == 0 || s.seriesStats.Chunks == 0 { return } // Bucket matching for series/labels matchSeriesBucket/matchSamplesBucket => float64, float64 - seriesLeBucket := s.findBucket(float64(s.SeriesStats.Series), s.SeriesLeBuckets) - samplesLeBucket := s.findBucket(float64(s.SeriesStats.Samples), s.SamplesLeBuckets) - s.QueryDuration.With(prometheus.Labels{ + seriesLeBucket := s.findBucket(float64(s.seriesStats.Series), s.seriesLeBuckets) + samplesLeBucket := s.findBucket(float64(s.seriesStats.Samples), s.samplesLeBuckets) + s.queryDuration.With(prometheus.Labels{ "series_le": strconv.Itoa(int(seriesLeBucket)), "samples_le": strconv.Itoa(int(samplesLeBucket)), }).Observe(duration) s.reset() } -func (s *SeriesQueryPerformanceMetricsAggregator) reset() { - s.SeriesStats = storepb.SeriesStatsCounter{} +func (s *seriesStatsAggregator) reset() { + s.seriesStats = storepb.SeriesStatsCounter{} } -func (s *SeriesQueryPerformanceMetricsAggregator) findBucket(value float64, quantiles []int64) int64 { +func (s *seriesStatsAggregator) findBucket(value float64, quantiles []int64) int64 { if len(quantiles) == 0 { return 0 } @@ -80,9 +80,9 @@ func (s *SeriesQueryPerformanceMetricsAggregator) findBucket(value float64, quan return foundBucket } -// NopSeriesQueryPerformanceMetricsAggregator is a query performance series aggregator that does nothing. -type NopSeriesQueryPerformanceMetricsAggregator struct{} +// NoopSeriesStatsAggregator is a query performance series aggregator that does nothing. +type NoopSeriesStatsAggregator struct{} -func (s *NopSeriesQueryPerformanceMetricsAggregator) Aggregate(_ storepb.SeriesStatsCounter) {} +func (s *NoopSeriesStatsAggregator) Aggregate(_ storepb.SeriesStatsCounter) {} -func (s *NopSeriesQueryPerformanceMetricsAggregator) Observe(_ float64) {} +func (s *NoopSeriesStatsAggregator) Observe(_ float64) {} From 8f6a2456c252b62b8f98db7884a35b68ed896dc8 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Tue, 4 Oct 2022 15:13:27 +0200 Subject: [PATCH 29/31] Start listing metrics exported by Thanos Query Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- docs/components/query.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/components/query.md b/docs/components/query.md index 37c540c48a..22d60cb73a 100644 --- a/docs/components/query.md +++ b/docs/components/query.md @@ -469,3 +469,13 @@ Flags: Prometheus. ``` + +## Exported metrics + +Thanos Query also exports metrics about its own performance. You can find a list with these metrics below. + +**Disclaimer**: this list is incomplete. The remaining metrics will be added over time. + +| Name | Type | Labels | Description | +|-----------------------------------------|-----------|-----------------------|-------------------------------------------------------------------------------------------------------------------| +| thanos_store_api_query_duration_seconds | Histogram | samples_le, series_le | Duration of the Thanos Store API select phase for a query according to the amount of samples and series selected. | From 65e94eef93641c06961d49dab9d05460436e229c Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Tue, 4 Oct 2022 18:18:50 +0200 Subject: [PATCH 30/31] Rename pkg/store/metrics -> pkg/store/telemetry Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- cmd/thanos/query.go | 4 ++-- pkg/api/query/v1.go | 4 ++-- pkg/api/query/v1_test.go | 8 ++++---- pkg/store/{metrics/metrics.go => telemetry/telemetry.go} | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) rename pkg/store/{metrics/metrics.go => telemetry/telemetry.go} (99%) diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index db63cc0629..a30ee5eeec 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -25,7 +25,7 @@ import ( "github.com/prometheus/prometheus/discovery/targetgroup" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/promql" - "github.com/thanos-io/thanos/pkg/store/metrics" + "github.com/thanos-io/thanos/pkg/store/telemetry" "google.golang.org/grpc" v1 "github.com/prometheus/prometheus/web/api/v1" @@ -692,7 +692,7 @@ func runQuery( extprom.WrapRegistererWithPrefix("thanos_query_concurrent_", reg), maxConcurrentQueries, ), - metrics.NewSeriesStatsAggregator( + telemetry.NewSeriesStatsAggregator( reg, queryTelemetryDurationQuantiles, queryTelemetrySamplesQuantiles, diff --git a/pkg/api/query/v1.go b/pkg/api/query/v1.go index b686bfaa6d..760c38bca6 100644 --- a/pkg/api/query/v1.go +++ b/pkg/api/query/v1.go @@ -43,7 +43,7 @@ import ( "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/util/stats" v1 "github.com/prometheus/prometheus/web/api/v1" - "github.com/thanos-io/thanos/pkg/store/metrics" + "github.com/thanos-io/thanos/pkg/store/telemetry" "github.com/thanos-io/thanos/pkg/api" "github.com/thanos-io/thanos/pkg/exemplars" @@ -145,7 +145,7 @@ func NewQueryAPI( reg *prometheus.Registry, ) *QueryAPI { if statsAggregator == nil { - statsAggregator = &metrics.NoopSeriesStatsAggregator{} + statsAggregator = &telemetry.NoopSeriesStatsAggregator{} } return &QueryAPI{ baseAPI: api.NewBaseAPI(logger, disableCORS, flagsMap), diff --git a/pkg/api/query/v1_test.go b/pkg/api/query/v1_test.go index d307fee8c1..b5b948088a 100644 --- a/pkg/api/query/v1_test.go +++ b/pkg/api/query/v1_test.go @@ -45,7 +45,7 @@ import ( promgate "github.com/prometheus/prometheus/util/gate" "github.com/prometheus/prometheus/util/stats" "github.com/thanos-io/thanos/pkg/compact" - "github.com/thanos-io/thanos/pkg/store/metrics" + "github.com/thanos-io/thanos/pkg/store/telemetry" baseAPI "github.com/thanos-io/thanos/pkg/api" "github.com/thanos-io/thanos/pkg/component" @@ -199,7 +199,7 @@ func TestQueryEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), - seriesStatsAggregator: &metrics.NoopSeriesStatsAggregator{}, + seriesStatsAggregator: &telemetry.NoopSeriesStatsAggregator{}, } start := time.Unix(0, 0) @@ -739,7 +739,7 @@ func TestMetadataEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), - seriesStatsAggregator: &metrics.NoopSeriesStatsAggregator{}, + seriesStatsAggregator: &telemetry.NoopSeriesStatsAggregator{}, } apiWithLabelLookback := &QueryAPI{ baseAPI: &baseAPI.BaseAPI{ @@ -753,7 +753,7 @@ func TestMetadataEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), - seriesStatsAggregator: &metrics.NoopSeriesStatsAggregator{}, + seriesStatsAggregator: &telemetry.NoopSeriesStatsAggregator{}, } var tests = []endpointTestCase{ diff --git a/pkg/store/metrics/metrics.go b/pkg/store/telemetry/telemetry.go similarity index 99% rename from pkg/store/metrics/metrics.go rename to pkg/store/telemetry/telemetry.go index 9c74e5703b..a75b65219f 100644 --- a/pkg/store/metrics/metrics.go +++ b/pkg/store/telemetry/telemetry.go @@ -1,7 +1,7 @@ // Copyright (c) The Thanos Authors. // Licensed under the Apache License 2.0. -package metrics +package telemetry import ( "strconv" From 0951370b2d6b6eea5e35896958cbef34f27072a3 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Mon, 17 Oct 2022 14:39:11 +0200 Subject: [PATCH 31/31] Get rid of the pkg/store/telemetry package Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- cmd/thanos/query.go | 3 +-- pkg/api/query/v1.go | 5 ++--- pkg/api/query/v1_test.go | 10 ++++------ pkg/store/{telemetry => }/telemetry.go | 2 +- 4 files changed, 8 insertions(+), 12 deletions(-) rename pkg/store/{telemetry => }/telemetry.go (99%) diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index a30ee5eeec..38df8d8a4f 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -25,7 +25,6 @@ import ( "github.com/prometheus/prometheus/discovery/targetgroup" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/promql" - "github.com/thanos-io/thanos/pkg/store/telemetry" "google.golang.org/grpc" v1 "github.com/prometheus/prometheus/web/api/v1" @@ -692,7 +691,7 @@ func runQuery( extprom.WrapRegistererWithPrefix("thanos_query_concurrent_", reg), maxConcurrentQueries, ), - telemetry.NewSeriesStatsAggregator( + store.NewSeriesStatsAggregator( reg, queryTelemetryDurationQuantiles, queryTelemetrySamplesQuantiles, diff --git a/pkg/api/query/v1.go b/pkg/api/query/v1.go index 760c38bca6..918bcbf5fd 100644 --- a/pkg/api/query/v1.go +++ b/pkg/api/query/v1.go @@ -43,8 +43,6 @@ import ( "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/util/stats" v1 "github.com/prometheus/prometheus/web/api/v1" - "github.com/thanos-io/thanos/pkg/store/telemetry" - "github.com/thanos-io/thanos/pkg/api" "github.com/thanos-io/thanos/pkg/exemplars" "github.com/thanos-io/thanos/pkg/exemplars/exemplarspb" @@ -57,6 +55,7 @@ import ( "github.com/thanos-io/thanos/pkg/rules" "github.com/thanos-io/thanos/pkg/rules/rulespb" "github.com/thanos-io/thanos/pkg/runutil" + "github.com/thanos-io/thanos/pkg/store" "github.com/thanos-io/thanos/pkg/store/storepb" "github.com/thanos-io/thanos/pkg/targets" "github.com/thanos-io/thanos/pkg/targets/targetspb" @@ -145,7 +144,7 @@ func NewQueryAPI( reg *prometheus.Registry, ) *QueryAPI { if statsAggregator == nil { - statsAggregator = &telemetry.NoopSeriesStatsAggregator{} + statsAggregator = &store.NoopSeriesStatsAggregator{} } return &QueryAPI{ baseAPI: api.NewBaseAPI(logger, disableCORS, flagsMap), diff --git a/pkg/api/query/v1_test.go b/pkg/api/query/v1_test.go index b5b948088a..07c562af9c 100644 --- a/pkg/api/query/v1_test.go +++ b/pkg/api/query/v1_test.go @@ -44,10 +44,8 @@ import ( "github.com/prometheus/prometheus/tsdb/tsdbutil" promgate "github.com/prometheus/prometheus/util/gate" "github.com/prometheus/prometheus/util/stats" - "github.com/thanos-io/thanos/pkg/compact" - "github.com/thanos-io/thanos/pkg/store/telemetry" - baseAPI "github.com/thanos-io/thanos/pkg/api" + "github.com/thanos-io/thanos/pkg/compact" "github.com/thanos-io/thanos/pkg/component" "github.com/thanos-io/thanos/pkg/gate" "github.com/thanos-io/thanos/pkg/query" @@ -199,7 +197,7 @@ func TestQueryEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), - seriesStatsAggregator: &telemetry.NoopSeriesStatsAggregator{}, + seriesStatsAggregator: &store.NoopSeriesStatsAggregator{}, } start := time.Unix(0, 0) @@ -739,7 +737,7 @@ func TestMetadataEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), - seriesStatsAggregator: &telemetry.NoopSeriesStatsAggregator{}, + seriesStatsAggregator: &store.NoopSeriesStatsAggregator{}, } apiWithLabelLookback := &QueryAPI{ baseAPI: &baseAPI.BaseAPI{ @@ -753,7 +751,7 @@ func TestMetadataEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), - seriesStatsAggregator: &telemetry.NoopSeriesStatsAggregator{}, + seriesStatsAggregator: &store.NoopSeriesStatsAggregator{}, } var tests = []endpointTestCase{ diff --git a/pkg/store/telemetry/telemetry.go b/pkg/store/telemetry.go similarity index 99% rename from pkg/store/telemetry/telemetry.go rename to pkg/store/telemetry.go index a75b65219f..a854daaf0c 100644 --- a/pkg/store/telemetry/telemetry.go +++ b/pkg/store/telemetry.go @@ -1,7 +1,7 @@ // Copyright (c) The Thanos Authors. // Licensed under the Apache License 2.0. -package telemetry +package store import ( "strconv"