Skip to content

Commit

Permalink
Expand unit tests and fix bug for query sharding native histograms (#…
Browse files Browse the repository at this point in the history
…4666)

* Expand unit tests for query sharding native histograms
* Fix bug with summing native histograms with query sharding through query frontend
* Account for staleness
* Update change log
* Update pkg/frontend/querymiddleware/querysharding_test.go

Co-authored-by: George Krajcsovits <krajorama@users.noreply.github.com>
  • Loading branch information
zenador and krajorama committed Apr 14, 2023
1 parent 02e0e06 commit eb5ef3a
Show file tree
Hide file tree
Showing 5 changed files with 249 additions and 89 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
* [BUGFIX] Query-frontend: don't retry queries which error inside PromQL. #4643
* [BUGFIX] Store-gateway & query-frontend: report more consistent statistics for fetched index bytes. #4671
* [BUGFIX] Native histograms: fix how IsFloatHistogram determines if mimirpb.Histogram is a float histogram. #4706
* [BUGFIX] Query-frontend: fix query sharding for native histograms. #4666

### Mixin

Expand Down
241 changes: 187 additions & 54 deletions pkg/frontend/querymiddleware/querysharding_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/prometheus/prometheus/model/histogram"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/model/value"
"github.com/prometheus/prometheus/promql"
Expand All @@ -40,6 +41,7 @@ import (
"github.com/grafana/mimir/pkg/mimirpb"
"github.com/grafana/mimir/pkg/storage/sharding"
"github.com/grafana/mimir/pkg/util"
util_test "github.com/grafana/mimir/pkg/util/test"
"github.com/grafana/mimir/pkg/util/validation"
)

Expand Down Expand Up @@ -86,36 +88,50 @@ func approximatelyEquals(t *testing.T, a, b *PrometheusResponse) {
b := bs[i]
require.Equal(t, a.Labels, b.Labels)
require.Equal(t, len(a.Samples), len(b.Samples), "expected same number of samples for series %s", a.Labels)
require.Equal(t, len(a.Histograms), len(b.Histograms), "expected same number of histograms for series %s", a.Labels)
require.NotEqual(t, len(a.Samples) > 0, len(a.Histograms) > 0, "expected either samples or histogram but not both for series %s, got %d samples and %d histograms", a.Labels, len(a.Samples), len(a.Histograms))

for j := 0; j < len(a.Samples); j++ {
expected := a.Samples[j]
actual := b.Samples[j]
require.Equalf(t, expected.TimestampMs, actual.TimestampMs, "sample timestamp at position %d for series %s", j, a.Labels)
compareExpectedAndActual(t, expected.TimestampMs, actual.TimestampMs, expected.Value, actual.Value, j, a.Labels, "sample")
}

if value.IsStaleNaN(expected.Value) {
require.Truef(t, value.IsStaleNaN(actual.Value), "sample value at position %d is expected to be stale marker for series %s", j, a.Labels)
} else if math.IsNaN(expected.Value) {
require.Truef(t, math.IsNaN(actual.Value), "sample value at position %d is expected to be NaN for series %s", j, a.Labels)
} else {
if expected.Value == 0 {
require.Zero(t, actual.Value, "sample value at position %d with timestamp %d for series %s", j, expected.TimestampMs, a.Labels)
continue
}
// InEpsilon means the relative error (see https://en.wikipedia.org/wiki/Relative_error#Example) must be less than epsilon (here 1e-12).
// The relative error is calculated using: abs(actual-expected) / abs(expected)
require.InEpsilonf(t, expected.Value, actual.Value, 1e-12, "sample value at position %d with timestamp %d for series %s", j, expected.TimestampMs, a.Labels)
}
for j := 0; j < len(a.Histograms); j++ {
expected := a.Histograms[j]
actual := b.Histograms[j]
compareExpectedAndActual(t, expected.TimestampMs, actual.TimestampMs, expected.Histogram.Sum, actual.Histogram.Sum, j, a.Labels, "histogram")
}
}
}

func compareExpectedAndActual(t *testing.T, expectedTs, actualTs int64, expectedVal, actualVal float64, j int, labels []mimirpb.LabelAdapter, sampleType string) {
require.Equalf(t, expectedTs, actualTs, "%s timestamp at position %d for series %s", sampleType, j, labels)

if value.IsStaleNaN(expectedVal) {
require.Truef(t, value.IsStaleNaN(actualVal), "%s value at position %d is expected to be stale marker for series %s", sampleType, j, labels)
} else if math.IsNaN(expectedVal) {
require.Truef(t, math.IsNaN(actualVal), "%s value at position %d is expected to be NaN for series %s", sampleType, j, labels)
} else {
if expectedVal == 0 {
require.Zero(t, actualVal, "%s value at position %d with timestamp %d for series %s", sampleType, j, expectedTs, labels)
return
}
// InEpsilon means the relative error (see https://en.wikipedia.org/wiki/Relative_error#Example) must be less than epsilon (here 1e-12).
// The relative error is calculated using: abs(actual-expected) / abs(expected)
require.InEpsilonf(t, expectedVal, actualVal, 1e-12, "%s value at position %d with timestamp %d for series %s", sampleType, j, expectedTs, labels)
}
}

func TestQueryShardingCorrectness(t *testing.T) {
var (
numSeries = 1000
numStaleSeries = 100
numHistograms = 1000
numStaleHistograms = 100
histogramBuckets = []float64{1.0, 2.0, 4.0, 10.0, 100.0, math.Inf(1)}
numSeries = 1000
numStaleSeries = 100
numConvHistograms = 1000
numStaleConvHistograms = 100
histogramBuckets = []float64{1.0, 2.0, 4.0, 10.0, 100.0, math.Inf(1)}
numNativeHistograms = 1000
numStaleNativeHistograms = 100
)

tests := map[string]struct {
Expand Down Expand Up @@ -511,21 +527,64 @@ func TestQueryShardingCorrectness(t *testing.T) {
query: `scalar(sum(metric_counter)) < bool 1`,
expectedShardedQueries: 1,
},
`sum({__name__!=""})`: {
query: `sum({__name__!=""})`,
// Summing floats and native histograms together makes no sense, see
// https://prometheus.io/docs/prometheus/latest/querying/operators/#operators-for-native-histograms
// so we exclude native histograms here and in some subsequent tests
`sum({__name__!=""}) excluding native histograms`: {
query: `sum({__name__!="",__name__!="metric_native_histogram"})`,
expectedShardedQueries: 1,
},
`sum by (group_1) ({__name__!=""}) excluding native histograms`: {
query: `sum by (group_1) ({__name__!="",__name__!="metric_native_histogram"})`,
expectedShardedQueries: 1,
},
`sum by (group_1) (count_over_time({__name__!=""}[1m])) excluding native histograms`: {
query: `sum by (group_1) (count_over_time({__name__!="",__name__!="metric_native_histogram"}[1m]))`,
expectedShardedQueries: 1,
},
`sum(metric_native_histogram)`: {
query: `sum(metric_native_histogram)`,
expectedShardedQueries: 1,
},
`sum by (group_1) (metric_native_histogram)`: {
query: `sum by (group_1) (metric_native_histogram)`,
expectedShardedQueries: 1,
},
`sum by (group_1) (count_over_time(metric_native_histogram[1m]))`: {
query: `sum by (group_1) (count_over_time(metric_native_histogram[1m]))`,
expectedShardedQueries: 1,
},
`sum by (group_1) ({__name__!=""})`: {
query: `sum by (group_1) ({__name__!=""})`,
`count(metric_native_histogram)`: {
query: `count(metric_native_histogram)`,
expectedShardedQueries: 1,
},
`sum by (group_1) (count_over_time({__name__!=""}[1m]))`: {
query: `sum by (group_1) (count_over_time({__name__!=""}[1m]))`,
`count by (group_1) (metric_native_histogram)`: {
query: `count by (group_1) (metric_native_histogram)`,
expectedShardedQueries: 1,
},
`count by (group_1) (count_over_time(metric_native_histogram[1m]))`: {
query: `count by (group_1) (count_over_time(metric_native_histogram[1m]))`,
expectedShardedQueries: 1,
},
`histogram_sum(sum(metric_native_histogram))`: {
query: `histogram_sum(sum(metric_native_histogram))`,
expectedShardedQueries: 1,
},
`histogram_count(sum(metric_native_histogram))`: {
query: `histogram_count(sum(metric_native_histogram))`,
expectedShardedQueries: 1,
},
`histogram_quantile(0.5, sum(metric_native_histogram))`: {
query: `histogram_quantile(0.5, sum(metric_native_histogram))`,
expectedShardedQueries: 1,
},
`histogram_fraction(0, 0.5, sum(metric_native_histogram))`: {
query: `histogram_fraction(0, 0.5, sum(metric_native_histogram))`,
expectedShardedQueries: 1,
},
}

series := make([]*promql.StorageSeries, 0, numSeries+(numHistograms*len(histogramBuckets)))
series := make([]*promql.StorageSeries, 0, numSeries+(numConvHistograms*len(histogramBuckets))+numNativeHistograms)
seriesID := 0

// Add counter series.
Expand Down Expand Up @@ -557,24 +616,36 @@ func TestQueryShardingCorrectness(t *testing.T) {
start.Add(5*time.Minute), end, step, factor(2)))
seriesID++

// Add histogram series.
for i := 0; i < numHistograms; i++ {
// Add conventional histogram series.
for i := 0; i < numConvHistograms; i++ {
for bucketIdx, bucketLe := range histogramBuckets {
// We expect each bucket to have a value higher than the previous one.
gen := factor(float64(i) * float64(bucketIdx) * 0.1)
if i >= numHistograms-numStaleHistograms {
if i >= numConvHistograms-numStaleConvHistograms {
// Wrap the generator to inject the staleness marker between minute 10 and 20.
gen = stale(start.Add(10*time.Minute), start.Add(20*time.Minute), gen)
}

series = append(series, newSeries(newTestHistogramLabels(seriesID, bucketLe),
series = append(series, newSeries(newTestConventionalHistogramLabels(seriesID, bucketLe),
start.Add(-lookbackDelta), end, step, gen))
}

// Increase the series ID after all per-bucket series have been created.
seriesID++
}

// Add native histogram series.
for i := 0; i < numNativeHistograms; i++ {
gen := factor(float64(i) * 0.1)
if i >= numNativeHistograms-numStaleNativeHistograms {
// Wrap the generator to inject the staleness marker between minute 10 and 20.
gen = stale(start.Add(10*time.Minute), start.Add(20*time.Minute), gen)
}

series = append(series, newNativeHistogramSeries(newTestNativeHistogramLabels(seriesID), start.Add(-lookbackDelta), end, step, gen))
seriesID++
}

// Create a queryable on the fixtures.
queryable := storageSeriesQueryable(series)

Expand Down Expand Up @@ -681,6 +752,11 @@ func requireValidSamples(t *testing.T, result []SampleStream) {
return
}
}
for _, histogram := range stream.Histograms {
if !math.IsNaN(histogram.Histogram.Sum) {
return
}
}
}
t.Fatalf("Result should have some not-NaN samples")
}
Expand Down Expand Up @@ -773,10 +849,44 @@ func labelsForShardsGenerator(base labels.Labels, shards uint64) func(shard uint
}
}

type queryShardingFunctionCorrectnessTest struct {
fn string
args []string
rangeQuery bool
tpl string
}

// TestQuerySharding_FunctionCorrectness is the old test that probably at some point inspired the TestQuerySharding_Correctness,
// we keep it here since it adds more test cases.
func TestQuerySharding_FunctionCorrectness(t *testing.T) {
mkQueries, tests := func(tpl, fn string, testMatrix bool, fArgs []string) []string {
queryableFloats := storageSeriesQueryable([]*promql.StorageSeries{
newSeries(labels.FromStrings("__name__", "bar1", "baz", "blip", "bar", "blop", "foo", "barr"), start.Add(-lookbackDelta), end, step, factor(5)),
newSeries(labels.FromStrings("__name__", "bar1", "baz", "blip", "bar", "blop", "foo", "bazz"), start.Add(-lookbackDelta), end, step, factor(7)),
newSeries(labels.FromStrings("__name__", "bar1", "baz", "blip", "bar", "blap", "foo", "buzz"), start.Add(-lookbackDelta), end, step, factor(12)),
newSeries(labels.FromStrings("__name__", "bar1", "baz", "blip", "bar", "blap", "foo", "bozz"), start.Add(-lookbackDelta), end, step, factor(11)),
newSeries(labels.FromStrings("__name__", "bar1", "baz", "blip", "bar", "blop", "foo", "buzz"), start.Add(-lookbackDelta), end, step, factor(8)),
newSeries(labels.FromStrings("__name__", "bar1", "baz", "blip", "bar", "blap", "foo", "bazz"), start.Add(-lookbackDelta), end, step, arithmeticSequence(10)),
})
queryableNativeHistograms := storageSeriesQueryable([]*promql.StorageSeries{
newNativeHistogramSeries(labels.FromStrings("__name__", "bar1", "baz", "blip", "bar", "blop", "foo", "barr"), start.Add(-lookbackDelta), end, step, factor(5)),
newNativeHistogramSeries(labels.FromStrings("__name__", "bar1", "baz", "blip", "bar", "blop", "foo", "bazz"), start.Add(-lookbackDelta), end, step, factor(7)),
newNativeHistogramSeries(labels.FromStrings("__name__", "bar1", "baz", "blip", "bar", "blap", "foo", "buzz"), start.Add(-lookbackDelta), end, step, factor(12)),
newNativeHistogramSeries(labels.FromStrings("__name__", "bar1", "baz", "blip", "bar", "blap", "foo", "bozz"), start.Add(-lookbackDelta), end, step, factor(11)),
newNativeHistogramSeries(labels.FromStrings("__name__", "bar1", "baz", "blip", "bar", "blop", "foo", "buzz"), start.Add(-lookbackDelta), end, step, factor(8)),
newNativeHistogramSeries(labels.FromStrings("__name__", "bar1", "baz", "blip", "bar", "blap", "foo", "bazz"), start.Add(-lookbackDelta), end, step, arithmeticSequence(10)),
})

testQueryShardingFunctionCorrectness(t, queryableFloats, []queryShardingFunctionCorrectnessTest{})
testQueryShardingFunctionCorrectness(t, queryableNativeHistograms, []queryShardingFunctionCorrectnessTest{
{fn: "histogram_count"},
{fn: "histogram_sum"},
{fn: "histogram_fraction", tpl: `(<fn>(0,0.5,bar1{}))`},
{fn: "histogram_quantile", tpl: `(<fn>(0.5,bar1{}))`},
})
}

func testQueryShardingFunctionCorrectness(t *testing.T, queryable storage.Queryable, extraTests []queryShardingFunctionCorrectnessTest) {
mkQueries := func(tpl, fn string, testMatrix bool, fArgs []string) []string {
if tpl == "" {
tpl = `(<fn>(bar1{}<args>))`
}
Expand All @@ -801,12 +911,8 @@ func TestQuerySharding_FunctionCorrectness(t *testing.T) {
"count" + result,
"count by (bar)" + result,
}
}, []struct {
fn string
args []string
rangeQuery bool
tpl string
}{
}
tests := append([]queryShardingFunctionCorrectnessTest{
{fn: "abs"},
{fn: "avg_over_time", rangeQuery: true},
{fn: "ceil"},
Expand Down Expand Up @@ -868,21 +974,12 @@ func TestQuerySharding_FunctionCorrectness(t *testing.T) {
{fn: "holt_winters", args: []string{"0.5", "0.7"}, rangeQuery: true},
{fn: "label_replace", args: []string{`"fuzz"`, `"$1"`, `"foo"`, `"b(.*)"`}},
{fn: "label_join", args: []string{`"fuzz"`, `","`, `"foo"`, `"bar"`}},
}
}, extraTests...)

for _, tc := range tests {
const numShards = 4
for _, query := range mkQueries(tc.tpl, tc.fn, tc.rangeQuery, tc.args) {
t.Run(query, func(t *testing.T) {
queryable := storageSeriesQueryable([]*promql.StorageSeries{
newSeries(labels.FromStrings("__name__", "bar1", "baz", "blip", "bar", "blop", "foo", "barr"), start.Add(-lookbackDelta), end, step, factor(5)),
newSeries(labels.FromStrings("__name__", "bar1", "baz", "blip", "bar", "blop", "foo", "bazz"), start.Add(-lookbackDelta), end, step, factor(7)),
newSeries(labels.FromStrings("__name__", "bar1", "baz", "blip", "bar", "blap", "foo", "buzz"), start.Add(-lookbackDelta), end, step, factor(12)),
newSeries(labels.FromStrings("__name__", "bar1", "baz", "blip", "bar", "blap", "foo", "bozz"), start.Add(-lookbackDelta), end, step, factor(11)),
newSeries(labels.FromStrings("__name__", "bar1", "baz", "blip", "bar", "blop", "foo", "buzz"), start.Add(-lookbackDelta), end, step, factor(8)),
newSeries(labels.FromStrings("__name__", "bar1", "baz", "blip", "bar", "blap", "foo", "bazz"), start.Add(-lookbackDelta), end, step, arithmeticSequence(10)),
})

req := &PrometheusRangeQueryRequest{
Path: "/query_range",
Start: util.TimeToMillis(start),
Expand Down Expand Up @@ -934,7 +1031,7 @@ func TestQuerySharding_FunctionCorrectness(t *testing.T) {
"scalar": {},
"vector": {},
"pi": {},
// Until support is added for querying histograms, these can be ignored:
// These are tested only for native histograms but not for floats:
"histogram_count": {},
"histogram_sum": {},
"histogram_fraction": {},
Expand Down Expand Up @@ -1931,6 +2028,14 @@ func filterSeriesByShard(series []*promql.StorageSeries, shard *sharding.ShardSe
}

func newSeries(metric labels.Labels, from, to time.Time, step time.Duration, gen generator) *promql.StorageSeries {
return newSeriesInner(metric, from, to, step, gen, false)
}

func newNativeHistogramSeries(metric labels.Labels, from, to time.Time, step time.Duration, gen generator) *promql.StorageSeries {
return newSeriesInner(metric, from, to, step, gen, true)
}

func newSeriesInner(metric labels.Labels, from, to time.Time, step time.Duration, gen generator, histogram bool) *promql.StorageSeries {
var (
points []promql.Point
prevValue *float64
Expand All @@ -1948,10 +2053,19 @@ func newSeries(metric labels.Labels, from, to time.Time, step time.Duration, gen
continue
}

points = append(points, promql.Point{
T: t,
V: v,
})
var point promql.Point
if histogram {
point = promql.Point{
T: t,
H: generateTestHistogram(v),
}
} else {
point = promql.Point{
T: t,
V: v,
}
}
points = append(points, point)
}

// Ensure series labels are sorted.
Expand All @@ -1963,6 +2077,14 @@ func newSeries(metric labels.Labels, from, to time.Time, step time.Duration, gen
})
}

func generateTestHistogram(v float64) *histogram.FloatHistogram {
h := util_test.GenerateTestFloatHistogram(int(v))
if value.IsStaleNaN(v) {
h.Sum = v
}
return h
}

// newTestCounterLabels generates series labels for a counter metric used in tests.
func newTestCounterLabels(id int) labels.Labels {
return labels.FromStrings(
Expand All @@ -1974,8 +2096,8 @@ func newTestCounterLabels(id int) labels.Labels {
)
}

// newTestCounterLabels generates series labels for an histogram metric used in tests.
func newTestHistogramLabels(id int, bucketLe float64) labels.Labels {
// newTestConventionalHistogramLabels generates series labels for a conventional histogram metric used in tests.
func newTestConventionalHistogramLabels(id int, bucketLe float64) labels.Labels {
return labels.FromStrings(
"__name__", "metric_histogram_bucket",
"le", fmt.Sprintf("%f", bucketLe),
Expand All @@ -1986,6 +2108,17 @@ func newTestHistogramLabels(id int, bucketLe float64) labels.Labels {
)
}

// newTestNativeHistogramLabels generates series labels for a native histogram metric used in tests.
func newTestNativeHistogramLabels(id int) labels.Labels {
return labels.FromStrings(
"__name__", "metric_native_histogram",
"const", "fixed", // A constant label.
"unique", strconv.Itoa(id), // A unique label.
"group_1", strconv.Itoa(id%10), // A first grouping label.
"group_2", strconv.Itoa(id%3), // A second grouping label.
)
}

// generator defined a function used to generate sample values in tests.
type generator func(ts int64) float64

Expand Down
Loading

0 comments on commit eb5ef3a

Please sign in to comment.