From 41fd54e5a033f0c9e87aec5dd88f66ba3e7f0bc7 Mon Sep 17 00:00:00 2001 From: Nick Pillitteri <56quarters@users.noreply.github.com> Date: Tue, 2 Jan 2024 10:19:02 -0500 Subject: [PATCH] Always validate tenant IDs and introduce max tenants setting (#6959) This change updates dskit to a version that does _not_ rely on global state for tenant ID parsing. Specifically it pulls in grafana/dskit#445. As part of this there are a few things changing: * We use multi-tenant parsing logic everywhere which actually enforces limits on the length of tenant IDs and the legal characters in them. * Instead of relying on single tenant parsing logic when tenant federation is disabled to reject multi-tenant queries, we add a query middleware that validates the number of expected tenants based on configuration. * We introduce a new setting to limit the max number of tenant IDs that may be included in a multi-tenant query. This change will result in different behavior in a few cases. However, it brings the actual behavior of Mimir in line with the documented behavior. Specifically, the following behavior changes (copied from dskit PR): * SingleResolver did not previously enforce a limit on the length of a tenant ID. A limit of 150 characters is now enforced. This has always been the documented behavior as far back as Cortex, where this code originated. Not enforcing it was an oversight. * SingleResolver previously allowed tenant IDs to contain the | character. This is no longer allowed as part of a tenant ID and instead will be treated as a divider between multiple tenant IDs. This has always been the documented behavior as far back as Cortex, where this code originated. Not enforcing it was an oversight. See grafana/dskit#445 --- CHANGELOG.md | 2 + cmd/mimir/config-descriptor.json | 11 ++ cmd/mimir/help-all.txt.tmpl | 2 + .../mimir/configure/about-versioning.md | 1 + .../configuration-parameters/index.md | 5 + go.mod | 2 +- go.sum | 4 +- pkg/api/api.go | 10 +- pkg/api/api_test.go | 13 +- pkg/api/tenant.go | 46 ++++++ pkg/api/tenant_test.go | 113 +++++++++++++++ .../cardinality_query_cache_test.go | 6 - pkg/frontend/querymiddleware/limits_test.go | 2 - pkg/frontend/querymiddleware/roundtrip.go | 2 +- .../querymiddleware/roundtrip_test.go | 6 +- pkg/frontend/querymiddleware/step_align.go | 22 ++- .../querymiddleware/step_align_test.go | 7 +- pkg/mimir/mimir.go | 17 +-- pkg/mimir/modules.go | 2 +- .../merge_exemplar_queryable.go | 4 +- .../tenantfederation/merge_metadata.go | 4 +- .../tenantfederation/merge_queryable.go | 13 +- .../tenantfederation/tenant_federation.go | 3 + pkg/util/noauth/no_auth.go | 12 +- .../grafana/dskit/tenant/resolver.go | 132 ++++-------------- .../github.com/grafana/dskit/tenant/tenant.go | 64 +++++++-- vendor/modules.txt | 2 +- 27 files changed, 322 insertions(+), 185 deletions(-) create mode 100644 pkg/api/tenant.go create mode 100644 pkg/api/tenant_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index c404aaa8ff6..5744e566262 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### Grafana Mimir * [CHANGE] Ingester: Increase default value of `-blocks-storage.tsdb.head-postings-for-matchers-cache-max-bytes` and `-blocks-storage.tsdb.block-postings-for-matchers-cache-max-bytes` to 100 MiB (previous default value was 10 MiB). #6764 +* [CHANGE] Validate tenant IDs according to [documented behavior](https://grafana.com/docs/mimir/latest/configure/about-tenant-ids/) even when tenant federation is not enabled. Note that this will cause some previously accepted tenant IDs to be rejected such as those longer than 150 bytes or containing `|` characters. #6959 * [ENHANCEMENT] Store-gateway: add no-compact details column on store-gateway tenants admin UI. #6848 * [ENHANCEMENT] PromQL: ignore small errors for bucketQuantile #6766 * [ENHANCEMENT] Distributor: improve efficiency of some errors #6785 @@ -13,6 +14,7 @@ * [ENHANCEMENT] Store-gateway: include more information about lazy index-header loading in traces. #6922 * [ENHANCEMENT] Distributor: support disabling metric relabel rules per-tenant via the flag `-distributor.metric-relabeling-enabled` or associated YAML. #6970 * [ENHANCEMENT] Distributor: `-distributor.remote-timeout` is now accounted from the first ingester push request being sent. #6972 +* [FEATURE] Introduce `-tenant-federation.max-tenants` option to limit the max number of tenants allowed for requests when federation is enabled. #6959 * [BUGFIX] Ingester: don't ignore errors encountered while iterating through chunks or samples in response to a query request. #6451 * [BUGFIX] Fix issue where queries can fail or omit OOO samples if OOO head compaction occurs between creating a querier and reading chunks #6766 * [BUGFIX] Fix issue where concatenatingChunkIterator can obscure errors #6766 diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index c7a5751e32e..879719cac9d 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -9953,6 +9953,17 @@ "fieldFlag": "tenant-federation.max-concurrent", "fieldType": "int", "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "max_tenants", + "required": false, + "desc": "The max number of tenant IDs that may be supplied for a federated query if enabled. 0 to disable the limit.", + "fieldValue": null, + "fieldDefaultValue": 0, + "fieldFlag": "tenant-federation.max-tenants", + "fieldType": "int", + "fieldCategory": "experimental" } ], "fieldValue": null, diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index b6afbf37584..ef1f7c7cce2 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -2733,6 +2733,8 @@ Usage of ./cmd/mimir/mimir: If enabled on all services, queries can be federated across multiple tenants. The tenant IDs involved need to be specified separated by a '|' character in the 'X-Scope-OrgID' header. -tenant-federation.max-concurrent int [experimental] The number of workers used for each tenant federated query. This setting limits the maximum number of per-tenant queries executed at a time for a tenant federated query. (default 16) + -tenant-federation.max-tenants int + [experimental] The max number of tenant IDs that may be supplied for a federated query if enabled. 0 to disable the limit. -timeseries-unmarshal-caching-optimization-enabled [experimental] Enables optimized marshaling of timeseries. (default true) -usage-stats.enabled diff --git a/docs/sources/mimir/configure/about-versioning.md b/docs/sources/mimir/configure/about-versioning.md index 7e426f6e813..3fb5ac32afc 100644 --- a/docs/sources/mimir/configure/about-versioning.md +++ b/docs/sources/mimir/configure/about-versioning.md @@ -132,6 +132,7 @@ The following features are currently experimental: - Use of Redis cache backend (`-query-frontend.results-cache.backend=redis`) - Query blocking on a per-tenant basis (configured with the limit `blocked_queries`) - Wait for the query-frontend to complete startup if a query request is received while it is starting up (`-query-frontend.not-running-timeout`) + - Max number of tenants that may be queried at once (`-tenant-federation.max-tenants`) - Query-scheduler - `-query-scheduler.querier-forget-delay` - Store-gateway diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 18b3c026348..eb311dd8cb8 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -210,6 +210,11 @@ tenant_federation: # CLI flag: -tenant-federation.max-concurrent [max_concurrent: | default = 16] + # (experimental) The max number of tenant IDs that may be supplied for a + # federated query if enabled. 0 to disable the limit. + # CLI flag: -tenant-federation.max-tenants + [max_tenants: | default = 0] + activity_tracker: # File where ongoing activities are stored. If empty, activity tracking is # disabled. diff --git a/go.mod b/go.mod index 923ebb7e1aa..ceef242559d 100644 --- a/go.mod +++ b/go.mod @@ -20,7 +20,7 @@ require ( github.com/golang/snappy v0.0.4 github.com/google/gopacket v1.1.19 github.com/gorilla/mux v1.8.1 - github.com/grafana/dskit v0.0.0-20231213223053-84f5540a28dd + github.com/grafana/dskit v0.0.0-20231219164408-2bfd67958535 github.com/grafana/e2e v0.1.1 github.com/hashicorp/golang-lru v1.0.2 // indirect github.com/json-iterator/go v1.1.12 diff --git a/go.sum b/go.sum index 237506288ea..a7ce076d8b5 100644 --- a/go.sum +++ b/go.sum @@ -542,8 +542,8 @@ github.com/gosimple/slug v1.1.1 h1:fRu/digW+NMwBIP+RmviTK97Ho/bEj/C9swrCspN3D4= github.com/gosimple/slug v1.1.1/go.mod h1:ER78kgg1Mv0NQGlXiDe57DpCyfbNywXXZ9mIorhxAf0= github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc h1:PXZQA2WCxe85Tnn+WEvr8fDpfwibmEPgfgFEaC87G24= github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc/go.mod h1:AHHlOEv1+GGQ3ktHMlhuTUwo3zljV3QJbC0+8o2kn+4= -github.com/grafana/dskit v0.0.0-20231213223053-84f5540a28dd h1:znSOnhAwMuyWzb1Vx550Q07o5p9EmqLyiDIYuKipiMc= -github.com/grafana/dskit v0.0.0-20231213223053-84f5540a28dd/go.mod h1:kkWM4WUV230bNG3urVRWPBnSJHs64y/0RmWjftnnn0c= +github.com/grafana/dskit v0.0.0-20231219164408-2bfd67958535 h1:qUdSymzUZ9bpNVcE79kTIW6oEB2J3oDLY7q82sHurjU= +github.com/grafana/dskit v0.0.0-20231219164408-2bfd67958535/go.mod h1:kkWM4WUV230bNG3urVRWPBnSJHs64y/0RmWjftnnn0c= github.com/grafana/e2e v0.1.1 h1:/b6xcv5BtoBnx8cZnCiey9DbjEc8z7gXHO5edoeRYxc= github.com/grafana/e2e v0.1.1/go.mod h1:RpNLgae5VT+BUHvPE+/zSypmOXKwEu4t+tnEMS1ATaE= github.com/grafana/goautoneg v0.0.0-20231010094147-47ce5e72a9ae h1:Yxbw9jKGJVC6qAK5Ubzzb/qZwM6rRMMqaDc/d4Vp3pM= diff --git a/pkg/api/api.go b/pkg/api/api.go index 133ec546281..24d1773e790 100644 --- a/pkg/api/api.go +++ b/pkg/api/api.go @@ -34,6 +34,7 @@ import ( "github.com/grafana/mimir/pkg/ingester/client" "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/querier" + "github.com/grafana/mimir/pkg/querier/tenantfederation" "github.com/grafana/mimir/pkg/ruler" "github.com/grafana/mimir/pkg/scheduler" "github.com/grafana/mimir/pkg/scheduler/schedulerpb" @@ -89,7 +90,7 @@ type API struct { indexPage *IndexPageContent } -func New(cfg Config, serverCfg server.Config, s *server.Server, logger log.Logger) (*API, error) { +func New(cfg Config, federationCfg tenantfederation.Config, serverCfg server.Config, s *server.Server, logger log.Logger) (*API, error) { // Ensure the encoded path is used. Required for the rules API s.HTTP.UseEncodedPath() @@ -113,10 +114,15 @@ func New(cfg Config, serverCfg server.Config, s *server.Server, logger log.Logge } // If no authentication middleware is present in the config, use the default authentication middleware. - if cfg.HTTPAuthMiddleware == nil { + if api.AuthMiddleware == nil { api.AuthMiddleware = middleware.AuthenticateUser } + // Unconditionally add middleware that ensures we only accept requests with an expected number of tenants + // that is applied after any existing auth middleware has run. Only a single tenant is allowed when federation + // is disabled. If federation is enabled, there is optionally a max number of tenants that is supported. + api.AuthMiddleware = middleware.Merge(api.AuthMiddleware, newTenantValidationMiddleware(federationCfg.Enabled, federationCfg.MaxTenants)) + return api, nil } diff --git a/pkg/api/api_test.go b/pkg/api/api_test.go index c7e7481052e..919ba65d2ae 100644 --- a/pkg/api/api_test.go +++ b/pkg/api/api_test.go @@ -17,6 +17,7 @@ import ( "github.com/grafana/dskit/server" "github.com/stretchr/testify/require" + "github.com/grafana/mimir/pkg/querier/tenantfederation" "github.com/grafana/mimir/pkg/util/gziphandler" ) @@ -33,11 +34,12 @@ func TestNewApiWithoutSourceIPExtractor(t *testing.T) { GRPCListenAddress: "localhost", MetricsNamespace: "without_source_ip_extractor", } + federationCfg := tenantfederation.Config{} require.NoError(t, serverCfg.LogLevel.Set("info")) srv, err := server.New(serverCfg) require.NoError(t, err) - api, err := New(cfg, serverCfg, srv, &FakeLogger{}) + api, err := New(cfg, federationCfg, serverCfg, srv, &FakeLogger{}) require.NoError(t, err) require.Nil(t, api.sourceIPs) } @@ -50,11 +52,12 @@ func TestNewApiWithSourceIPExtractor(t *testing.T) { GRPCListenAddress: "localhost", MetricsNamespace: "with_source_ip_extractor", } + federationCfg := tenantfederation.Config{} require.NoError(t, serverCfg.LogLevel.Set("info")) srv, err := server.New(serverCfg) require.NoError(t, err) - api, err := New(cfg, serverCfg, srv, &FakeLogger{}) + api, err := New(cfg, federationCfg, serverCfg, srv, &FakeLogger{}) require.NoError(t, err) require.NotNil(t, api.sourceIPs) } @@ -70,8 +73,9 @@ func TestNewApiWithInvalidSourceIPExtractor(t *testing.T) { LogSourceIPsRegex: "[*", MetricsNamespace: "with_invalid_source_ip_extractor", } + federationCfg := tenantfederation.Config{} - api, err := New(cfg, serverCfg, &s, &FakeLogger{}) + api, err := New(cfg, federationCfg, serverCfg, &s, &FakeLogger{}) require.Error(t, err) require.Nil(t, api) } @@ -79,12 +83,13 @@ func TestNewApiWithInvalidSourceIPExtractor(t *testing.T) { func TestApiGzip(t *testing.T) { cfg := Config{} serverCfg := getServerConfig(t) + federationCfg := tenantfederation.Config{} srv, err := server.New(serverCfg) require.NoError(t, err) go func() { _ = srv.Run() }() t.Cleanup(srv.Stop) - api, err := New(cfg, serverCfg, srv, log.NewNopLogger()) + api, err := New(cfg, federationCfg, serverCfg, srv, log.NewNopLogger()) require.NoError(t, err) handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { diff --git a/pkg/api/tenant.go b/pkg/api/tenant.go new file mode 100644 index 00000000000..227c072241a --- /dev/null +++ b/pkg/api/tenant.go @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package api + +import ( + "fmt" + "net/http" + + "github.com/grafana/dskit/middleware" + "github.com/grafana/dskit/tenant" +) + +const ( + tooManyTenantsTemplate = "too many tenant IDs present in the request. max: %d actual: %d" +) + +// newTenantValidationMiddleware creates a new middleware that validates the number of tenants +// being accessed in a particular request is allowed given the current tenant federation configuration. +// Note that this middleware requires that tenant ID has been set on the request context by something +// like middleware.AuthenticateUser. +func newTenantValidationMiddleware(federation bool, maxTenants int) middleware.Interface { + return middleware.Func(func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + + ids, err := tenant.TenantIDs(ctx) + if err != nil { + http.Error(w, err.Error(), http.StatusUnauthorized) + return + } + + numIds := len(ids) + if !federation && numIds > 1 { + http.Error(w, fmt.Sprintf(tooManyTenantsTemplate, 1, numIds), http.StatusUnprocessableEntity) + return + } + + if federation && maxTenants > 0 && numIds > maxTenants { + http.Error(w, fmt.Sprintf(tooManyTenantsTemplate, maxTenants, numIds), http.StatusUnprocessableEntity) + return + } + + next.ServeHTTP(w, r.WithContext(ctx)) + }) + }) +} diff --git a/pkg/api/tenant_test.go b/pkg/api/tenant_test.go new file mode 100644 index 00000000000..034b4661f22 --- /dev/null +++ b/pkg/api/tenant_test.go @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package api + +import ( + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/grafana/dskit/middleware" + "github.com/grafana/dskit/tenant" + "github.com/grafana/dskit/user" + "github.com/stretchr/testify/require" +) + +func TestNewTenantValidationMiddleware(t *testing.T) { + for _, tc := range []struct { + name string + federation bool + maxTenants int + header string + expectedHTTPStatus int + expectedBodyText string + }{ + { + name: "federation disabled, invalid tenant header", + federation: false, + maxTenants: 0, + header: strings.Repeat("123", tenant.MaxTenantIDLength), + expectedHTTPStatus: 401, + expectedBodyText: "tenant ID is too long", + }, + { + name: "federation disabled, single tenant", + federation: false, + maxTenants: 0, + header: "tenant-a", + expectedHTTPStatus: 200, + expectedBodyText: "", + }, + { + name: "federation disabled, multiple tenants", + federation: false, + maxTenants: 0, + header: "tenant-a|tenant-b", + expectedHTTPStatus: 422, + expectedBodyText: "too many tenant IDs present", + }, + { + name: "federation enabled, invalid tenant header", + federation: true, + maxTenants: 0, + header: strings.Repeat("123", tenant.MaxTenantIDLength), + expectedHTTPStatus: 401, + expectedBodyText: "tenant ID is too long", + }, + { + name: "federation enabled, single tenant no limit", + federation: true, + maxTenants: 0, + header: "tenant-a", + expectedHTTPStatus: 200, + expectedBodyText: "", + }, + { + name: "federation enabled, multiple tenants no limit", + federation: true, + maxTenants: 0, + header: "tenant-a|tenant-b|tenant-c", + expectedHTTPStatus: 200, + expectedBodyText: "", + }, + { + name: "federation enabled, multiple tenants under limit", + federation: true, + maxTenants: 2, + header: "tenant-a|tenant-b", + expectedHTTPStatus: 200, + expectedBodyText: "", + }, + { + name: "federation enabled, multiple tenants over limit", + federation: true, + maxTenants: 2, + header: "tenant-a|tenant-b|tenant-c", + expectedHTTPStatus: 422, + expectedBodyText: "too many tenant IDs present", + }, + } { + t.Run(tc.name, func(t *testing.T) { + nop := http.HandlerFunc(func(writer http.ResponseWriter, request *http.Request) {}) + // Note that we add the authentication middleware since the tenant validation middleware relies + // on tenant ID being set in the context associated with the request. + handler := middleware.Merge(middleware.AuthenticateUser, newTenantValidationMiddleware(tc.federation, tc.maxTenants)).Wrap(nop) + + req := httptest.NewRequest("GET", "/", nil) + req.Header.Set(user.OrgIDHeaderName, tc.header) + resp := httptest.NewRecorder() + + handler.ServeHTTP(resp, req) + body, err := io.ReadAll(resp.Body) + + require.NoError(t, err) + require.Equal(t, tc.expectedHTTPStatus, resp.Code) + + if tc.expectedBodyText != "" { + require.Contains(t, string(body), tc.expectedBodyText) + } + }) + } +} diff --git a/pkg/frontend/querymiddleware/cardinality_query_cache_test.go b/pkg/frontend/querymiddleware/cardinality_query_cache_test.go index b494118e9df..4dea5249dff 100644 --- a/pkg/frontend/querymiddleware/cardinality_query_cache_test.go +++ b/pkg/frontend/querymiddleware/cardinality_query_cache_test.go @@ -20,12 +20,6 @@ import ( ) func TestCardinalityQueryCache_RoundTrip_WithTenantFederation(t *testing.T) { - // Enable tenant ID resolve used when tenant federation is enabled. - tenant.WithDefaultResolver(tenant.NewMultiResolver()) - t.Cleanup(func() { - tenant.WithDefaultResolver(tenant.NewSingleResolver()) - }) - tests := map[string]struct { tenantIDs []string limits map[string]mockLimits diff --git a/pkg/frontend/querymiddleware/limits_test.go b/pkg/frontend/querymiddleware/limits_test.go index 286b54387f8..3420d4c5363 100644 --- a/pkg/frontend/querymiddleware/limits_test.go +++ b/pkg/frontend/querymiddleware/limits_test.go @@ -15,7 +15,6 @@ import ( "time" "github.com/go-kit/log" - "github.com/grafana/dskit/tenant" "github.com/grafana/dskit/user" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" @@ -173,7 +172,6 @@ func TestLimitsMiddleware_MaxQueryExpressionSizeBytes(t *testing.T) { End: util.TimeToMillis(now.Add(-time.Hour)), } - tenant.WithDefaultResolver(tenant.NewMultiResolver()) limits := multiTenantMockLimits{ byTenant: map[string]mockLimits{ "test1": {maxQueryExpressionSizeBytes: testData.queryLimits["test1"]}, diff --git a/pkg/frontend/querymiddleware/roundtrip.go b/pkg/frontend/querymiddleware/roundtrip.go index 9b5a437a7af..4a98c828506 100644 --- a/pkg/frontend/querymiddleware/roundtrip.go +++ b/pkg/frontend/querymiddleware/roundtrip.go @@ -220,7 +220,7 @@ func newQueryTripperware( newLimitsMiddleware(limits, log), queryBlockerMiddleware, newInstrumentMiddleware("step_align", metrics), - newStepAlignMiddleware(limits, tenant.NewMultiResolver(), log, registerer), + newStepAlignMiddleware(limits, log, registerer), } var c cache.Cache diff --git a/pkg/frontend/querymiddleware/roundtrip_test.go b/pkg/frontend/querymiddleware/roundtrip_test.go index dff19f4e9e3..81ff2df77f1 100644 --- a/pkg/frontend/querymiddleware/roundtrip_test.go +++ b/pkg/frontend/querymiddleware/roundtrip_test.go @@ -65,7 +65,8 @@ func TestRangeTripperware(t *testing.T) { next: http.DefaultTransport, } - tw, err := NewTripperware(Config{}, + tw, err := NewTripperware( + Config{}, log.NewNopLogger(), mockLimits{}, newTestPrometheusCodec(), @@ -289,7 +290,8 @@ func TestTripperware_Metrics(t *testing.T) { for testName, testData := range tests { t.Run(testName, func(t *testing.T) { reg := prometheus.NewPedanticRegistry() - tw, err := NewTripperware(Config{DeprecatedAlignQueriesWithStep: testData.stepAlignEnabled}, + tw, err := NewTripperware( + Config{DeprecatedAlignQueriesWithStep: testData.stepAlignEnabled}, log.NewNopLogger(), mockLimits{}, newTestPrometheusCodec(), diff --git a/pkg/frontend/querymiddleware/step_align.go b/pkg/frontend/querymiddleware/step_align.go index 6e9edcd3616..f8f7bc5676a 100644 --- a/pkg/frontend/querymiddleware/step_align.go +++ b/pkg/frontend/querymiddleware/step_align.go @@ -18,16 +18,15 @@ import ( ) type stepAlignMiddleware struct { - next Handler - limits Limits - resolver tenant.Resolver - logger log.Logger - aligned *prometheus.CounterVec + next Handler + limits Limits + logger log.Logger + aligned *prometheus.CounterVec } // newStepAlignMiddleware creates a middleware that aligns the start and end of request to the step to // improve the cacheability of the query results based on per-tenant configuration. -func newStepAlignMiddleware(limits Limits, resolver tenant.Resolver, logger log.Logger, registerer prometheus.Registerer) Middleware { +func newStepAlignMiddleware(limits Limits, logger log.Logger, registerer prometheus.Registerer) Middleware { aligned := promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{ Name: "cortex_query_frontend_queries_step_aligned_total", Help: "Number of queries whose start or end times have been adjusted to be step-aligned.", @@ -35,17 +34,16 @@ func newStepAlignMiddleware(limits Limits, resolver tenant.Resolver, logger log. return MiddlewareFunc(func(next Handler) Handler { return &stepAlignMiddleware{ - next: next, - limits: limits, - resolver: resolver, - logger: logger, - aligned: aligned, + next: next, + limits: limits, + logger: logger, + aligned: aligned, } }) } func (s *stepAlignMiddleware) Do(ctx context.Context, r Request) (Response, error) { - tenants, err := s.resolver.TenantIDs(ctx) + tenants, err := tenant.TenantIDs(ctx) if err != nil { return s.next.Do(ctx, r) } diff --git a/pkg/frontend/querymiddleware/step_align_test.go b/pkg/frontend/querymiddleware/step_align_test.go index 20ec51a2d7d..eb74be335b7 100644 --- a/pkg/frontend/querymiddleware/step_align_test.go +++ b/pkg/frontend/querymiddleware/step_align_test.go @@ -9,7 +9,6 @@ import ( "context" "testing" - "github.com/grafana/dskit/tenant" "github.com/grafana/dskit/user" "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/require" @@ -59,11 +58,10 @@ func TestStepAlignMiddleware_SingleUser(t *testing.T) { }) limits := mockLimits{alignQueriesWithStep: true} - resolver := tenant.NewMultiResolver() log := test.NewTestingLogger(t) ctx := user.InjectOrgID(context.Background(), "123") - s := newStepAlignMiddleware(limits, resolver, log, prometheus.NewPedanticRegistry()).Wrap(next) + s := newStepAlignMiddleware(limits, log, prometheus.NewPedanticRegistry()).Wrap(next) _, err := s.Do(ctx, tc.input) require.NoError(t, err) require.Equal(t, tc.expected, result) @@ -143,11 +141,10 @@ func TestStepAlignMiddleware_MultipleUsers(t *testing.T) { return nil, nil }) - resolver := tenant.NewMultiResolver() log := test.NewTestingLogger(t) ctx := user.InjectOrgID(context.Background(), "123|456") - s := newStepAlignMiddleware(tc.limits, resolver, log, prometheus.NewPedanticRegistry()).Wrap(next) + s := newStepAlignMiddleware(tc.limits, log, prometheus.NewPedanticRegistry()).Wrap(next) _, err := s.Do(ctx, tc.input) require.NoError(t, err) require.Equal(t, tc.expected, result) diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index 975f5bd3e4e..66612e58b4a 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -30,7 +30,6 @@ import ( "github.com/grafana/dskit/server" "github.com/grafana/dskit/services" "github.com/grafana/dskit/signals" - "github.com/grafana/dskit/tenant" "github.com/opentracing/opentracing-go" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" @@ -725,16 +724,14 @@ func New(cfg Config, reg prometheus.Registerer) (*Mimir, error) { setUpGoRuntimeMetrics(cfg, reg) - // Swap out the default resolver to support multiple tenant IDs separated by a '|' - if cfg.TenantFederation.Enabled { - tenant.WithDefaultResolver(tenant.NewMultiResolver()) - - if cfg.Ruler.TenantFederation.Enabled { - util_log.WarnExperimentalUse("ruler.tenant-federation") - } + if cfg.TenantFederation.Enabled && cfg.Ruler.TenantFederation.Enabled { + util_log.WarnExperimentalUse("ruler.tenant-federation") } - cfg.API.HTTPAuthMiddleware = noauth.SetupAuthMiddleware(&cfg.Server, cfg.MultitenancyEnabled, + cfg.API.HTTPAuthMiddleware = noauth.SetupAuthMiddleware( + &cfg.Server, + cfg.MultitenancyEnabled, + cfg.NoAuthTenant, // Also don't check auth for these gRPC methods, since single call is used for multiple users (or no user like health check). []string{ "/grpc.health.v1.Health/Check", @@ -744,7 +741,7 @@ func New(cfg Config, reg prometheus.Registerer) (*Mimir, error) { "/schedulerpb.SchedulerForFrontend/FrontendLoop", "/schedulerpb.SchedulerForQuerier/QuerierLoop", "/schedulerpb.SchedulerForQuerier/NotifyQuerierShutdown", - }, cfg.NoAuthTenant) + }) // Inject the registerer in the Server config too. cfg.Server.Registerer = reg diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index 7561cb45e93..c88c4dfcc99 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -118,7 +118,7 @@ func newDefaultConfig() *Config { func (t *Mimir) initAPI() (services.Service, error) { t.Cfg.API.ServerPrefix = t.Cfg.Server.PathPrefix - a, err := api.New(t.Cfg.API, t.Cfg.Server, t.Server, util_log.Logger) + a, err := api.New(t.Cfg.API, t.Cfg.TenantFederation, t.Cfg.Server, t.Server, util_log.Logger) if err != nil { return nil, err } diff --git a/pkg/querier/tenantfederation/merge_exemplar_queryable.go b/pkg/querier/tenantfederation/merge_exemplar_queryable.go index e45d4e07502..7436d62e816 100644 --- a/pkg/querier/tenantfederation/merge_exemplar_queryable.go +++ b/pkg/querier/tenantfederation/merge_exemplar_queryable.go @@ -49,7 +49,6 @@ func NewMergeExemplarQueryable(idLabelName string, upstream storage.ExemplarQuer idLabelName: idLabelName, bypassWithSingleQuerier: bypassWithSingleQuerier, upstream: upstream, - resolver: tenant.NewMultiResolver(), maxConcurrency: maxConcurrency, tenantsQueried: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ Name: "cortex_querier_federation_exemplar_tenants_queried", @@ -64,14 +63,13 @@ type mergeExemplarQueryable struct { idLabelName string bypassWithSingleQuerier bool upstream storage.ExemplarQueryable - resolver tenant.Resolver maxConcurrency int tenantsQueried prometheus.Histogram } // tenantsAndQueriers returns a list of tenant IDs and corresponding queriers based on the context func (m *mergeExemplarQueryable) tenantsAndQueriers(ctx context.Context) ([]string, []storage.ExemplarQuerier, error) { - tenantIDs, err := m.resolver.TenantIDs(ctx) + tenantIDs, err := tenant.TenantIDs(ctx) if err != nil { return nil, nil, err } diff --git a/pkg/querier/tenantfederation/merge_metadata.go b/pkg/querier/tenantfederation/merge_metadata.go index 0407e446af0..aa5c0ab76ce 100644 --- a/pkg/querier/tenantfederation/merge_metadata.go +++ b/pkg/querier/tenantfederation/merge_metadata.go @@ -25,14 +25,12 @@ func NewMetadataSupplier(next querier.MetadataSupplier, maxConcurrency int, logg return &mergeMetadataSupplier{ next: next, maxConcurrency: maxConcurrency, - resolver: tenant.NewMultiResolver(), logger: logger, } } type mergeMetadataSupplier struct { next querier.MetadataSupplier - resolver tenant.Resolver maxConcurrency int logger log.Logger } @@ -41,7 +39,7 @@ func (m *mergeMetadataSupplier) MetricsMetadata(ctx context.Context, req *client spanlog, ctx := spanlogger.NewWithLogger(ctx, m.logger, "mergeMetadataSupplier.MetricsMetadata") defer spanlog.Finish() - tenantIDs, err := m.resolver.TenantIDs(ctx) + tenantIDs, err := tenant.TenantIDs(ctx) if err != nil { return nil, err } diff --git a/pkg/querier/tenantfederation/merge_queryable.go b/pkg/querier/tenantfederation/merge_queryable.go index 53f9f758282..285b5f602d8 100644 --- a/pkg/querier/tenantfederation/merge_queryable.go +++ b/pkg/querier/tenantfederation/merge_queryable.go @@ -89,9 +89,9 @@ func (q *tenantQuerier) Close() error { return q.upstream.Close() } -// NewMergeQueryable returns a queryable that merges results for all involved federation IDs. -// The underlying querier is returned by a callback in MergeQueryableCallbacks. The IDs are returned -// by a tenant.Resolver implementation. +// NewMergeQueryable returns a queryable that merges results for all involved +// federation IDs. The underlying querier is returned by a callback in +// MergeQueryableCallbacks. // // By setting bypassWithSingleID to true the mergeQuerier gets bypassed, // and results for requests with a single ID will not contain the ID label. @@ -103,6 +103,9 @@ func (q *tenantQuerier) Close() error { // the previous value is exposed through a new label prefixed with "original_". // This behaviour is not implemented recursively. func NewMergeQueryable(idLabelName string, callbacks MergeQueryableCallbacks, resolver tenant.Resolver, bypassWithSingleID bool, maxConcurrency int, reg prometheus.Registerer, logger log.Logger) storage.Queryable { + // Note that we allow tenant.Resolver to be injected instead of using the + // tenant.TenantIDs() method because GEM needs to inject different behavior + // here for the cluster federation feature. return &mergeQueryable{ logger: logger, idLabelName: idLabelName, @@ -139,8 +142,8 @@ func (m *mergeQueryable) Querier(mint int64, maxt int64) (storage.Querier, error logger: m.logger, idLabelName: m.idLabelName, callbacks: m.callbacks, - upstream: upstream, resolver: m.resolver, + upstream: upstream, maxConcurrency: m.maxConcurrency, bypassWithSingleID: m.bypassWithSingleID, tenantsQueried: m.tenantsQueried, @@ -155,8 +158,8 @@ func (m *mergeQueryable) Querier(mint int64, maxt int64) (storage.Querier, error type mergeQuerier struct { logger log.Logger callbacks MergeQueryableCallbacks - upstream MergeQuerierUpstream resolver tenant.Resolver + upstream MergeQuerierUpstream idLabelName string maxConcurrency int bypassWithSingleID bool diff --git a/pkg/querier/tenantfederation/tenant_federation.go b/pkg/querier/tenantfederation/tenant_federation.go index e25106c5d20..cfb148c7171 100644 --- a/pkg/querier/tenantfederation/tenant_federation.go +++ b/pkg/querier/tenantfederation/tenant_federation.go @@ -15,17 +15,20 @@ const ( defaultTenantLabel = "__tenant_id__" retainExistingPrefix = "original_" defaultConcurrency = 16 + defaultMaxTenants = 0 ) type Config struct { // Enabled switches on support for multi tenant query federation Enabled bool `yaml:"enabled"` MaxConcurrent int `yaml:"max_concurrent" category:"experimental"` + MaxTenants int `yaml:"max_tenants" category:"experimental"` } func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.BoolVar(&cfg.Enabled, "tenant-federation.enabled", false, "If enabled on all services, queries can be federated across multiple tenants. The tenant IDs involved need to be specified separated by a '|' character in the 'X-Scope-OrgID' header.") f.IntVar(&cfg.MaxConcurrent, "tenant-federation.max-concurrent", defaultConcurrency, "The number of workers used for each tenant federated query. This setting limits the maximum number of per-tenant queries executed at a time for a tenant federated query.") + f.IntVar(&cfg.MaxTenants, "tenant-federation.max-tenants", defaultMaxTenants, "The max number of tenant IDs that may be supplied for a federated query if enabled. 0 to disable the limit.") } // filterValuesByMatchers applies matchers to inputed `idLabelName` and diff --git a/pkg/util/noauth/no_auth.go b/pkg/util/noauth/no_auth.go index 0390f309701..8abcaadd136 100644 --- a/pkg/util/noauth/no_auth.go +++ b/pkg/util/noauth/no_auth.go @@ -3,7 +3,7 @@ // Provenance-includes-license: Apache-2.0 // Provenance-includes-copyright: The Cortex Authors. -// Package noauth provides middlewares thats injects a tenant ID, so the rest of the code +// Package noauth provides middlewares that injects a tenant ID so the rest of the code // can continue to be multitenant. package noauth @@ -18,8 +18,8 @@ import ( ) // SetupAuthMiddleware for the given server config. -func SetupAuthMiddleware(config *server.Config, enabled bool, noGRPCAuthOn []string, noAuthTenant string) middleware.Interface { - if enabled { +func SetupAuthMiddleware(config *server.Config, multitenancyEnabled bool, noMultitenancyTenant string, noGRPCAuthOn []string) middleware.Interface { + if multitenancyEnabled { ignoredMethods := map[string]bool{} for _, m := range noGRPCAuthOn { ignoredMethods[m] = true @@ -46,13 +46,13 @@ func SetupAuthMiddleware(config *server.Config, enabled bool, noGRPCAuthOn []str config.GRPCMiddleware = append(config.GRPCMiddleware, func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) { - ctx = user.InjectOrgID(ctx, noAuthTenant) + ctx = user.InjectOrgID(ctx, noMultitenancyTenant) return handler(ctx, req) }, ) config.GRPCStreamMiddleware = append(config.GRPCStreamMiddleware, func(srv interface{}, ss grpc.ServerStream, _ *grpc.StreamServerInfo, handler grpc.StreamHandler) error { - ctx := user.InjectOrgID(ss.Context(), noAuthTenant) + ctx := user.InjectOrgID(ss.Context(), noMultitenancyTenant) return handler(srv, serverStream{ ctx: ctx, @@ -62,7 +62,7 @@ func SetupAuthMiddleware(config *server.Config, enabled bool, noGRPCAuthOn []str ) return middleware.Func(func(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - ctx := user.InjectOrgID(r.Context(), noAuthTenant) + ctx := user.InjectOrgID(r.Context(), noMultitenancyTenant) next.ServeHTTP(w, r.WithContext(ctx)) }) }) diff --git a/vendor/github.com/grafana/dskit/tenant/resolver.go b/vendor/github.com/grafana/dskit/tenant/resolver.go index aa19d75bb4a..35e95b1c831 100644 --- a/vendor/github.com/grafana/dskit/tenant/resolver.go +++ b/vendor/github.com/grafana/dskit/tenant/resolver.go @@ -2,20 +2,11 @@ package tenant import ( "context" - "errors" - "net/http" "strings" "github.com/grafana/dskit/user" ) -var defaultResolver Resolver = NewSingleResolver() - -// WithDefaultResolver updates the resolver used for the package methods. -func WithDefaultResolver(r Resolver) { - defaultResolver = r -} - // TenantID returns exactly a single tenant ID from the context. It should be // used when a certain endpoint should only support exactly a single // tenant ID. It returns an error user.ErrNoOrgID if there is no tenant ID @@ -25,7 +16,16 @@ func WithDefaultResolver(r Resolver) { // //nolint:revive func TenantID(ctx context.Context) (string, error) { - return defaultResolver.TenantID(ctx) + orgIDs, err := TenantIDs(ctx) + if err != nil { + return "", err + } + + if len(orgIDs) > 1 { + return "", user.ErrTooManyOrgIDs + } + + return orgIDs[0], nil } // TenantIDs returns all tenant IDs from the context. It should return @@ -36,7 +36,20 @@ func TenantID(ctx context.Context) (string, error) { // //nolint:revive func TenantIDs(ctx context.Context) ([]string, error) { - return defaultResolver.TenantIDs(ctx) + //lint:ignore faillint wrapper around upstream method + orgID, err := user.ExtractOrgID(ctx) + if err != nil { + return nil, err + } + + orgIDs := strings.Split(orgID, tenantIDsSeparator) + for _, id := range orgIDs { + if err := ValidTenantID(id); err != nil { + return nil, err + } + } + + return NormalizeTenantIDs(orgIDs), nil } type Resolver interface { @@ -52,109 +65,20 @@ type Resolver interface { TenantIDs(context.Context) ([]string, error) } -// NewSingleResolver creates a tenant resolver, which restricts all requests to -// be using a single tenant only. This allows a wider set of characters to be -// used within the tenant ID and should not impose a breaking change. -func NewSingleResolver() *SingleResolver { - return &SingleResolver{} -} - -type SingleResolver struct { -} - -// containsUnsafePathSegments will return true if the string is a directory -// reference like `.` and `..` or if any path separator character like `/` and -// `\` can be found. -func containsUnsafePathSegments(id string) bool { - // handle the relative reference to current and parent path. - if id == "." || id == ".." { - return true - } - - return strings.ContainsAny(id, "\\/") -} - -var errInvalidTenantID = errors.New("invalid tenant ID") - -func (t *SingleResolver) TenantID(ctx context.Context) (string, error) { - //lint:ignore faillint wrapper around upstream method - id, err := user.ExtractOrgID(ctx) - if err != nil { - return "", err - } - - if containsUnsafePathSegments(id) { - return "", errInvalidTenantID - } - - return id, nil -} - -func (t *SingleResolver) TenantIDs(ctx context.Context) ([]string, error) { - orgID, err := t.TenantID(ctx) - if err != nil { - return nil, err - } - return []string{orgID}, err -} - -type MultiResolver struct { -} +type MultiResolver struct{} // NewMultiResolver creates a tenant resolver, which allows request to have // multiple tenant ids submitted separated by a '|' character. This enforces // further limits on the character set allowed within tenants as detailed here: -// https://cortexmetrics.io/docs/guides/limitations/#tenant-id-naming) +// https://grafana.com/docs/mimir/latest/configure/about-tenant-ids/ func NewMultiResolver() *MultiResolver { return &MultiResolver{} } func (t *MultiResolver) TenantID(ctx context.Context) (string, error) { - orgIDs, err := t.TenantIDs(ctx) - if err != nil { - return "", err - } - - if len(orgIDs) > 1 { - return "", user.ErrTooManyOrgIDs - } - - return orgIDs[0], nil + return TenantID(ctx) } func (t *MultiResolver) TenantIDs(ctx context.Context) ([]string, error) { - //lint:ignore faillint wrapper around upstream method - orgID, err := user.ExtractOrgID(ctx) - if err != nil { - return nil, err - } - - orgIDs := strings.Split(orgID, tenantIDsLabelSeparator) - for _, orgID := range orgIDs { - if err := ValidTenantID(orgID); err != nil { - return nil, err - } - if containsUnsafePathSegments(orgID) { - return nil, errInvalidTenantID - } - } - - return NormalizeTenantIDs(orgIDs), nil -} - -// ExtractTenantIDFromHTTPRequest extracts a single TenantID through a given -// resolver directly from a HTTP request. -func ExtractTenantIDFromHTTPRequest(req *http.Request) (string, context.Context, error) { - //lint:ignore faillint wrapper around upstream method - _, ctx, err := user.ExtractOrgIDFromHTTPRequest(req) - if err != nil { - return "", nil, err - } - - tenantID, err := defaultResolver.TenantID(ctx) - if err != nil { - return "", nil, err - } - - return tenantID, ctx, nil + return TenantIDs(ctx) } diff --git a/vendor/github.com/grafana/dskit/tenant/tenant.go b/vendor/github.com/grafana/dskit/tenant/tenant.go index a5807500e52..4a89b57225a 100644 --- a/vendor/github.com/grafana/dskit/tenant/tenant.go +++ b/vendor/github.com/grafana/dskit/tenant/tenant.go @@ -4,14 +4,23 @@ import ( "context" "errors" "fmt" + "net/http" "sort" "strings" "github.com/grafana/dskit/user" ) +const ( + // MaxTenantIDLength is the max length of single tenant ID in bytes + MaxTenantIDLength = 150 + + tenantIDsSeparator = "|" +) + var ( - errTenantIDTooLong = errors.New("tenant ID is too long: max 150 characters") + errTenantIDTooLong = fmt.Errorf("tenant ID is too long: max %d characters", MaxTenantIDLength) + errUnsafeTenantID = errors.New("tenant ID is '.' or '..'") ) type errTenantIDUnsupportedCharacter struct { @@ -27,9 +36,7 @@ func (e *errTenantIDUnsupportedCharacter) Error() string { ) } -const tenantIDsLabelSeparator = "|" - -// NormalizeTenantIDs is creating a normalized form by sortiing and de-duplicating the list of tenantIDs +// NormalizeTenantIDs creates a normalized form by sorting and de-duplicating the list of tenantIDs func NormalizeTenantIDs(tenantIDs []string) []string { sort.Strings(tenantIDs) @@ -49,7 +56,7 @@ func NormalizeTenantIDs(tenantIDs []string) []string { return tenantIDs[0:posOut] } -// ValidTenantID +// ValidTenantID returns an error if the single tenant ID is invalid, nil otherwise func ValidTenantID(s string) error { // check if it contains invalid runes for pos, r := range s { @@ -61,19 +68,49 @@ func ValidTenantID(s string) error { } } - if len(s) > 150 { + if len(s) > MaxTenantIDLength { return errTenantIDTooLong } + if containsUnsafePathSegments(s) { + return errUnsafeTenantID + } + return nil } +// JoinTenantIDs returns all tenant IDs concatenated with the separator character `|` func JoinTenantIDs(tenantIDs []string) string { - return strings.Join(tenantIDs, tenantIDsLabelSeparator) + return strings.Join(tenantIDs, tenantIDsSeparator) +} + +// ExtractTenantIDFromHTTPRequest extracts a single tenant ID directly from a HTTP request. +func ExtractTenantIDFromHTTPRequest(req *http.Request) (string, context.Context, error) { + //lint:ignore faillint wrapper around upstream method + _, ctx, err := user.ExtractOrgIDFromHTTPRequest(req) + if err != nil { + return "", nil, err + } + + tenantID, err := TenantID(ctx) + if err != nil { + return "", nil, err + } + + return tenantID, ctx, nil +} + +// TenantIDsFromOrgID extracts different tenants from an orgID string value +// +// ignore stutter warning +// +//nolint:revive +func TenantIDsFromOrgID(orgID string) ([]string, error) { + return TenantIDs(user.InjectOrgID(context.TODO(), orgID)) } // this checks if a rune is supported in tenant IDs (according to -// https://cortexmetrics.io/docs/guides/limitations/#tenant-id-naming) +// https://grafana.com/docs/mimir/latest/configure/about-tenant-ids/ func isSupported(c rune) bool { // characters if ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') { @@ -96,11 +133,8 @@ func isSupported(c rune) bool { c == ')' } -// TenantIDsFromOrgID extracts different tenants from an orgID string value -// -// ignore stutter warning -// -//nolint:revive -func TenantIDsFromOrgID(orgID string) ([]string, error) { - return TenantIDs(user.InjectOrgID(context.TODO(), orgID)) +// containsUnsafePathSegments will return true if the string is a directory +// reference like `.` and `..` +func containsUnsafePathSegments(id string) bool { + return id == "." || id == ".." } diff --git a/vendor/modules.txt b/vendor/modules.txt index eddf20b226e..d22628726e8 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -535,7 +535,7 @@ github.com/gosimple/slug # github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc ## explicit; go 1.13 github.com/grafana-tools/sdk -# github.com/grafana/dskit v0.0.0-20231213223053-84f5540a28dd +# github.com/grafana/dskit v0.0.0-20231219164408-2bfd67958535 ## explicit; go 1.20 github.com/grafana/dskit/backoff github.com/grafana/dskit/ballast