Skip to content

Commit

Permalink
Receive: Make head series limiting config per tenant (#5685)
Browse files Browse the repository at this point in the history
* Receive: Make head series limiting configuration per-tenant

Signed-off-by: Saswata Mukherjee <saswataminsta@yahoo.com>

* Regen docs and make naming consistent

Signed-off-by: Saswata Mukherjee <saswataminsta@yahoo.com>

* Add changelog

Signed-off-by: Saswata Mukherjee <saswataminsta@yahoo.com>

* Align config with existing implementation

Signed-off-by: Saswata Mukherjee <saswataminsta@yahoo.com>

* Implement suggestions

Signed-off-by: Saswata Mukherjee <saswataminsta@yahoo.com>

* Implement suggestions; Initialize limiter in cmd/receive.go

Signed-off-by: Saswata Mukherjee <saswataminsta@yahoo.com>

* Make log 'component' consistent

Signed-off-by: Saswata Mukherjee <saswataminsta@yahoo.com>

* Update unit test

Signed-off-by: Saswata Mukherjee <saswataminsta@yahoo.com>

* Fix after rebase

Signed-off-by: Saswata Mukherjee <saswataminsta@yahoo.com>

Signed-off-by: Saswata Mukherjee <saswataminsta@yahoo.com>
  • Loading branch information
saswatamcode committed Sep 26, 2022
1 parent f4ffce6 commit a31e4c5
Show file tree
Hide file tree
Showing 15 changed files with 433 additions and 309 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re
- [#5255](https://github.com/thanos-io/thanos/pull/5296) Query: Use k-way merging for the proxying logic. The proxying sub-system now uses much less resources (~25-80% less CPU usage, ~30-50% less RAM usage according to our benchmarks). Reduces query duration by a few percent on queries with lots of series.
- [#5690](https://github.com/thanos-io/thanos/pull/5690) Compact: update `--debug.accept-malformed-index` flag to apply to downsampling. Previously the flag only applied to compaction, and fatal errors would still occur when downsampling was attempted.
- [#5707](https://github.com/thanos-io/thanos/pull/5707) Objstore: Update objstore to latest version which includes a refactored Azure Storage Account implementation with a new SDK.
- [#5685](https://github.com/thanos-io/thanos/pull/5685) Receive: Make active/head series limiting configuration per tenant by adding it to new limiting config.

### Removed

Expand Down
61 changes: 20 additions & 41 deletions cmd/thanos/receive.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ package main

import (
"context"
"net/url"
"os"
"path"
"strings"
Expand Down Expand Up @@ -205,9 +204,7 @@ func runReceive(
return errors.Wrap(err, "parse limit configuration")
}
}

// Impose active series limit only if Receiver is in Router or RouterIngestor mode, and config is provided.
seriesLimitSupported := (receiveMode == receive.RouterOnly || receiveMode == receive.RouterIngestor) && conf.maxPerTenantLimit != 0
limiter := receive.NewLimiter(limitsConfig, reg, receiveMode, log.With(logger, "component", "receive-limiter"))

dbs := receive.NewMultiTSDB(
conf.dataDir,
Expand All @@ -222,28 +219,23 @@ func runReceive(
)
writer := receive.NewWriter(log.With(logger, "component", "receive-writer"), dbs)
webHandler := receive.NewHandler(log.With(logger, "component", "receive-handler"), &receive.Options{
Writer: writer,
ListenAddress: conf.rwAddress,
Registry: reg,
Endpoint: conf.endpoint,
TenantHeader: conf.tenantHeader,
TenantField: conf.tenantField,
DefaultTenantID: conf.defaultTenantID,
ReplicaHeader: conf.replicaHeader,
ReplicationFactor: conf.replicationFactor,
RelabelConfigs: relabelConfig,
ReceiverMode: receiveMode,
Tracer: tracer,
TLSConfig: rwTLSConfig,
DialOpts: dialOpts,
ForwardTimeout: time.Duration(*conf.forwardTimeout),
TSDBStats: dbs,
LimitsConfig: limitsConfig,
SeriesLimitSupported: seriesLimitSupported,
MaxPerTenantLimit: conf.maxPerTenantLimit,
MetaMonitoringUrl: conf.metaMonitoringUrl,
MetaMonitoringHttpClient: conf.metaMonitoringHttpClient,
MetaMonitoringLimitQuery: conf.metaMonitoringLimitQuery,
Writer: writer,
ListenAddress: conf.rwAddress,
Registry: reg,
Endpoint: conf.endpoint,
TenantHeader: conf.tenantHeader,
TenantField: conf.tenantField,
DefaultTenantID: conf.defaultTenantID,
ReplicaHeader: conf.replicaHeader,
ReplicationFactor: conf.replicationFactor,
RelabelConfigs: relabelConfig,
ReceiverMode: receiveMode,
Tracer: tracer,
TLSConfig: rwTLSConfig,
DialOpts: dialOpts,
ForwardTimeout: time.Duration(*conf.forwardTimeout),
TSDBStats: dbs,
Limiter: limiter,
})

grpcProbe := prober.NewGRPC()
Expand Down Expand Up @@ -373,13 +365,13 @@ func runReceive(
)
}

if seriesLimitSupported {
if limitsConfig.AreHeadSeriesLimitsConfigured() {
level.Info(logger).Log("msg", "setting up periodic (every 15s) meta-monitoring query for limiting cache")
{
ctx, cancel := context.WithCancel(context.Background())
g.Add(func() error {
return runutil.Repeat(15*time.Second, ctx.Done(), func() error {
if err := webHandler.ActiveSeriesLimit.QueryMetaMonitoring(ctx, log.With(logger, "component", "receive-meta-monitoring")); err != nil {
if err := limiter.HeadSeriesLimiter.QueryMetaMonitoring(ctx); err != nil {
level.Error(logger).Log("msg", "failed to query meta-monitoring", "err", err.Error())
}
return nil
Expand Down Expand Up @@ -737,11 +729,6 @@ type receiveConfig struct {
rwClientServerCA string
rwClientServerName string

maxPerTenantLimit uint64
metaMonitoringLimitQuery string
metaMonitoringUrl *url.URL
metaMonitoringHttpClient *extflag.PathOrContent

dataDir string
labelStrs []string

Expand Down Expand Up @@ -842,14 +829,6 @@ func (rc *receiveConfig) registerFlag(cmd extkingpin.FlagClause) {

cmd.Flag("receive.replication-factor", "How many times to replicate incoming write requests.").Default("1").Uint64Var(&rc.replicationFactor)

cmd.Flag("receive.tenant-limits.max-head-series", "The total number of active (head) series that a tenant is allowed to have within a Receive topology. For more details refer: https://thanos.io/tip/components/receive.md/#limiting").Hidden().Uint64Var(&rc.maxPerTenantLimit)

cmd.Flag("receive.tenant-limits.meta-monitoring-url", "Meta-monitoring URL which is compatible with Prometheus Query API for active series limiting.").Hidden().URLVar(&rc.metaMonitoringUrl)

cmd.Flag("receive.tenant-limits.meta-monitoring-query", "PromQL Query to execute against meta-monitoring, to get the current number of active series for each tenant, across Receive replicas.").Default("sum(prometheus_tsdb_head_series) by (tenant)").Hidden().StringVar(&rc.metaMonitoringLimitQuery)

rc.metaMonitoringHttpClient = extflag.RegisterPathOrContent(cmd, "receive.tenant-limits.meta-monitoring-client", "YAML file or string with http client configs for meta-monitoring.", extflag.WithHidden())

rc.forwardTimeout = extkingpin.ModelDuration(cmd.Flag("receive-forward-timeout", "Timeout for each forward request.").Default("5s").Hidden())

rc.relabelConfigPath = extflag.RegisterPathOrContent(cmd, "receive.relabel-config", "YAML file that contains relabeling configuration.", extflag.WithEnvSubstitution())
Expand Down
31 changes: 20 additions & 11 deletions docs/components/receive.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,34 +101,39 @@ The configuration file follows a few standards:

All the configuration for the remote write endpoint of Receive is contained in the `write` key. Inside it there are 3 subsections:

- `global`: limits and/or gates that are applied considering all the requests.
- `global`: limits, gates and/or options that are applied considering all the requests.
- `default`: the default values for limits in case a given tenant doesn't have any specified.
- `tenants`: the limits for a given tenant.

From the example configuration below, it's understood that:
For a Receive instance with configuration like below, it's understood that:

1. This Receive instance has a max concurrency of 30.
2. This Receive instance has some default request limits that apply to all tenants, **unless** a given tenant has their own limits (i.e. the `acme` tenant and partially for the `ajax` tenant).
3. Tenant `acme` has no request limits.
4. Tenant `ajax` has a request series limit of 50000 and samples limit of 500. Their request size bytes limit is inherited from the default, 1024 bytes.
1. The Receive instance has a max concurrency of 30.
2. The Receive instance has head series limiting enabled as it has `meta_monitoring_.*` options in `global`.
3. The Receive instance has some default request limits as well as head series limits that apply to all tenants, **unless** a given tenant has their own limits (i.e. the `acme` tenant and partially for the `ajax` tenant).
4. Tenant `acme` has no request limits, but has a higher head_series limit.
5. Tenant `ajax` has a request series limit of 50000 and samples limit of 500. Their request size bytes limit is inherited from the default, 1024 bytes. Their head series limit is also inherited from the default, i.e., 1000.

The next sections explain what each configuration value means.

```yaml mdox-exec="cat pkg/receive/testdata/limits_config/good_limits.yaml"
write:
global:
max_concurrency: 30
meta_monitoring_url: "http://localhost:9090"
meta_monitoring_limit_query: "sum(prometheus_tsdb_head_series) by (tenant)"
default:
request:
size_bytes_limit: 1024
series_limit: 1000
samples_limit: 10
head_series_limit: 1000
tenants:
acme:
request:
size_bytes_limit: 0
series_limit: 0
samples_limit: 0
head_series_limit: 2000
ajax:
request:
series_limit: 50000
Expand Down Expand Up @@ -168,11 +173,15 @@ Thanos Receive, in Router or RouterIngestor mode, supports limiting tenant activ

Every Receive Router/RouterIngestor node queries meta-monitoring for the active series of all tenants every 15 seconds, and caches the results in a map. This cached result is used to limit all incoming remote write requests.

To use the feature, one should specify the following (hidden) flags:
- `--receive.tenant-limits.max-head-series`: Specifies the total number of active (head) series for any tenant, across all replicas (including data replication), allowed by Thanos Receive.
- `--receive.tenant-limits.meta-monitoring-url`: Specifies Prometheus Query API compatible meta-monitoring endpoint.
- `--receive.tenant-limits.meta-monitoring-query`: Optional flag to specify PromQL query to execute against meta-monitoring.
- `--receive.tenant-limits.meta-monitoring-client`: Optional YAML file/string specifying HTTP client config for meta-monitoring.
To use the feature, one should specify the following limiting config options:

Under `global`:
- `meta_monitoring_url`: Specifies Prometheus Query API compatible meta-monitoring endpoint.
- `meta_monitoring_limit_query`: Option to specify PromQL query to execute against meta-monitoring. If not specified it is set to `sum(prometheus_tsdb_head_series) by (tenant)` by default.
- `meta_monitoring_http_client`: Optional YAML field specifying HTTP client config for meta-monitoring.

Under `default` and per `tenant`:
- `head_series_limit`: Specifies the total number of active (head) series for any tenant, across all replicas (including data replication), allowed by Thanos Receive.

NOTE:
- It is possible that Receive ingests more active series than the specified limit, as it relies on meta-monitoring, which may not have the latest data for the current number of active series of a tenant at all times.
Expand Down
4 changes: 4 additions & 0 deletions pkg/httpconfig/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ var defaultTransportConfig TransportConfig = TransportConfig{
TLSHandshakeTimeout: int64(10 * time.Second),
}

// NewDefaultClientConfig returns a ClientConfig whose TransportConfig is set to
// the package-level defaultTransportConfig. All other fields are left at their
// zero values.
func NewDefaultClientConfig() ClientConfig {
	var cfg ClientConfig
	cfg.TransportConfig = defaultTransportConfig
	return cfg
}

func NewClientConfigFromYAML(cfg []byte) (*ClientConfig, error) {
conf := &ClientConfig{TransportConfig: defaultTransportConfig}
if err := yaml.Unmarshal(cfg, conf); err != nil {
Expand Down
Loading

0 comments on commit a31e4c5

Please sign in to comment.