Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix for ruler not honoring user error with local querier #7567

Merged
merged 4 commits into from
Mar 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@
* [BUGFIX] promql: Fix Range selectors with an @ modifier are wrongly scoped in range queries. #7475
* [BUGFIX] Fix metadata API using wrong JSON field names. #7475
* [BUGFIX] Ruler: fix native histogram recording rule result corruption. #7552
* [BUGFIX] Rules: improve error handling when querier is local to the ruler. #7567

### Mixin

Expand Down
6 changes: 5 additions & 1 deletion integration/ruler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -644,10 +644,14 @@ func TestRulerMetricsForInvalidQueriesAndNoFetchedSeries(t *testing.T) {
// Verify that user-failures don't increase cortex_ruler_queries_failed_total
for groupName, expression := range map[string]string{
// Syntactically correct expression (passes check in ruler), but failing because of invalid regex. This fails in PromQL engine.
"invalid_group": `label_replace(metric, "foo", "$1", "service", "[")`,
// This selects the label "nolabel" which does not exist, thus too many chunks doesn't apply.
"invalid_group": `label_replace(metric{nolabel="none"}, "foo", "$1", "service", "[")`,

// This one fails in querier code, because of limits.
"too_many_chunks_group": `sum(metric)`,

// Combine the errors above to have a compound error.
"invalid_and_too_many_chunks_group": `label_replace(metric, "foo", "$1", "service", "[")`,
} {
t.Run(groupName, func(t *testing.T) {
addNewRuleAndWait(groupName, expression, true)
Expand Down
11 changes: 7 additions & 4 deletions pkg/ruler/compat.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,15 +152,18 @@ type RulesLimits interface {
RulerSyncRulesOnChangesEnabled(userID string) bool
}

func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter) rules.QueryFunc {
func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter, remoteQuerier bool) rules.QueryFunc {
return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) {
queries.Inc()
result, err := qf(ctx, qs, t)
if err == nil {
return result, nil
}

// We only care about errors returned by underlying Queryable. Errors returned by PromQL engine are "user-errors",
// and not interesting here.
qerr := QueryableError{}
if err != nil && errors.As(err, &qerr) {
if errors.As(err, &qerr) {
origErr := qerr.Unwrap()

// Not all errors returned by Queryable are interesting, only those that would result in 500 status code.
Expand All @@ -178,7 +181,7 @@ func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Coun

// Return unwrapped error.
return result, origErr
} else if err != nil {
} else if remoteQuerier {
// When remote querier enabled, consider anything an error except those with 4xx status code.
st, ok := grpcutil.ErrorToStatus(err)
if !(ok && st.Code()/100 == 4) {
Expand Down Expand Up @@ -307,7 +310,7 @@ func DefaultTenantManagerFactory(

// Wrap the query function with our custom logic.
wrappedQueryFunc := WrapQueryFuncWithReadConsistency(queryFunc, logger)
wrappedQueryFunc = MetricsQueryFunc(wrappedQueryFunc, totalQueries, failedQueries)
wrappedQueryFunc = MetricsQueryFunc(wrappedQueryFunc, totalQueries, failedQueries, cfg.QueryFrontend.Address != "")
wrappedQueryFunc = RecordAndReportRuleQueryMetrics(wrappedQueryFunc, queryTime, zeroFetchedSeriesCount, logger)

// Wrap the queryable with our custom logic.
Expand Down
83 changes: 56 additions & 27 deletions pkg/ruler/compat_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,8 @@ func TestPusherErrors(t *testing.T) {
}

func TestMetricsQueryFuncErrors(t *testing.T) {
for name, tc := range map[string]struct {
// Cases handled the same on remote and local
commonCases := map[string]struct {
returnedError error
expectedError error
expectedQueries int
Expand All @@ -284,35 +285,12 @@ func TestMetricsQueryFuncErrors(t *testing.T) {
expectedQueries: 1,
expectedFailedQueries: 0,
},

"httpgrpc 400 error": {
returnedError: httpgrpc.Errorf(http.StatusBadRequest, "test error"),
expectedError: httpgrpc.Errorf(http.StatusBadRequest, "test error"),
expectedQueries: 1,
expectedFailedQueries: 0, // 400 errors not reported as failures.
},

"httpgrpc 500 error": {
returnedError: httpgrpc.Errorf(http.StatusInternalServerError, "test error"),
expectedError: httpgrpc.Errorf(http.StatusInternalServerError, "test error"),
expectedQueries: 1,
expectedFailedQueries: 1, // 500 errors are failures
},

"unknown but non-queryable error": {
returnedError: errors.New("test error"),
expectedError: errors.New("test error"),
expectedQueries: 1,
expectedFailedQueries: 1, // Any other error should always be reported.
},

"promql.ErrStorage": {
returnedError: WrapQueryableErrors(promql.ErrStorage{Err: errors.New("test error")}),
expectedError: promql.ErrStorage{Err: errors.New("test error")},
expectedQueries: 1,
expectedFailedQueries: 1,
},

"promql.ErrQueryCanceled": {
returnedError: WrapQueryableErrors(promql.ErrQueryCanceled("test error")),
expectedError: promql.ErrQueryCanceled("test error"),
Expand All @@ -333,22 +311,73 @@ func TestMetricsQueryFuncErrors(t *testing.T) {
expectedQueries: 1,
expectedFailedQueries: 0, // Not interesting.
},

"unknown error": {
returnedError: WrapQueryableErrors(errors.New("test error")),
expectedError: errors.New("test error"),
expectedQueries: 1,
expectedFailedQueries: 1, // unknown errors are not 400, so they are reported.
},
} {
}
type testCase struct {
returnedError error
expectedError error
expectedQueries int
expectedFailedQueries int
remoteQuerier bool
}
// Add special cases to test first.
allCases := map[string]testCase{
"httpgrpc 400 error": {
returnedError: httpgrpc.Errorf(http.StatusBadRequest, "test error"),
expectedError: httpgrpc.Errorf(http.StatusBadRequest, "test error"),
expectedQueries: 1,
expectedFailedQueries: 0, // 400 errors not reported as failures.
remoteQuerier: true,
},

"httpgrpc 500 error": {
returnedError: httpgrpc.Errorf(http.StatusInternalServerError, "test error"),
expectedError: httpgrpc.Errorf(http.StatusInternalServerError, "test error"),
expectedQueries: 1,
expectedFailedQueries: 1, // 500 errors are failures
remoteQuerier: true,
},

"unknown but non-queryable error from remote": {
returnedError: errors.New("test error"),
expectedError: errors.New("test error"),
expectedQueries: 1,
expectedFailedQueries: 1, // Any other error should always be reported.
remoteQuerier: true,
},
"unknown but non-queryable error from local": {
returnedError: errors.New("test error"),
expectedError: errors.New("test error"),
expectedQueries: 1,
expectedFailedQueries: 0, // Assume that simple local errors are coming from user errors.
},
}
// Add the common cases for both remote and local.
for _, rc := range []bool{true, false} {
for name, tc := range commonCases {
allCases[fmt.Sprintf("%s remote=%v", name, rc)] = testCase{
returnedError: tc.returnedError,
expectedError: tc.expectedError,
expectedQueries: tc.expectedQueries,
expectedFailedQueries: tc.expectedFailedQueries,
remoteQuerier: rc,
}
}
}
for name, tc := range allCases {
t.Run(name, func(t *testing.T) {
queries := promauto.With(nil).NewCounter(prometheus.CounterOpts{})
failures := promauto.With(nil).NewCounter(prometheus.CounterOpts{})

mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) {
return promql.Vector{}, tc.returnedError
}
qf := MetricsQueryFunc(mockFunc, queries, failures)
qf := MetricsQueryFunc(mockFunc, queries, failures, tc.remoteQuerier)

_, err := qf(context.Background(), "test", time.Now())
require.Equal(t, tc.expectedError, err)
Expand Down
Loading