From 57145061be15671d23493c60d8a500e70849b865 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Giedrius=20Statkevi=C4=8Dius?= Date: Wed, 18 Jan 2023 05:14:50 +0200 Subject: [PATCH 1/2] Merge release 0.30 into main (#6041) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * compact: remove cancel on SyncMetas errors (#5923) in a favour of 86b4039948b0918ca2ba121637d1d0d5b3f768c0 SyncMetas will retry if it's retriable. Also, the cleanPartialMarked calls are surrounded by runutil.Repeat() will be repeated, the ones not and are not retriable will throw an interrupt to run.Group() by returning err and Group will call cancel() as it's configured for its interrupt func. Signed-off-by: Seena Fallah Signed-off-by: Seena Fallah * Cut v0.30.0-rc.0 (#5992) * Cut v0.30.0-rc.0 Signed-off-by: bwplotka * mdox fix. Signed-off-by: bwplotka Signed-off-by: bwplotka Signed-off-by: Giedrius Statkevičius * Cut 0.30.0 (#6011) Signed-off-by: bwplotka Signed-off-by: bwplotka Signed-off-by: Giedrius Statkevičius * *: cut 0.30.1 (#6017) * fix duplicate metrics registration in redis client (#6009) * fix duplicate metrics registration in redis client Signed-off-by: Kama Huang * fixed test Signed-off-by: Kama Huang Signed-off-by: Kama Huang * *: cut 0.30.1 Add CHANGELOG entry. 
Signed-off-by: Giedrius Statkevičius Signed-off-by: Kama Huang Signed-off-by: Giedrius Statkevičius Co-authored-by: Kama Huang <121007071+kama910@users.noreply.github.com> Signed-off-by: Giedrius Statkevičius * Tracing: Fix sampler defaults (#5887) * Fix sampler defaults Signed-off-by: Matej Gera * Add CHANGELOG Signed-off-by: Matej Gera * Replace checkout with git-shallow-clone (#5829) Signed-off-by: Matej Gera Signed-off-by: Matej Gera Signed-off-by: Matej Gera Signed-off-by: Giedrius Statkevičius * CHANGELOG: fix Signed-off-by: Giedrius Statkevičius Signed-off-by: Seena Fallah Signed-off-by: bwplotka Signed-off-by: Giedrius Statkevičius Signed-off-by: Kama Huang Signed-off-by: Matej Gera Co-authored-by: Seena Fallah Co-authored-by: Kama Huang <121007071+kama910@users.noreply.github.com> --- CHANGELOG.md | 47 ++++++++++++++++++++++--------- cmd/thanos/compact.go | 1 - docs/release-process.md | 2 +- pkg/tracing/jaeger/config_yaml.go | 45 +++++++++++++++++++---------- pkg/tracing/jaeger/jaeger_test.go | 43 ++++++++++++++++++++++++++-- 5 files changed, 106 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b9aa6ad03a6..57a05310b43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,45 +20,66 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#6035](https://github.com/thanos-io/thanos/pull/6035) Replicate: Support all types of matchers to match blocks for replication. Change matcher parameter from string slice to a single string. ### Fixed -- [#5995] (https://github.com/thanos-io/thanos/pull/5993) Sidecar: Loads the TLS certificate during startup. + +- [#5995](https://github.com/thanos-io/thanos/pull/5995) Sidecar: Loads the TLS certificate during startup. 
- [#6044](https://github.com/thanos-io/thanos/pull/6044) Receive: mark ouf of window errors as conflict, if out-of-window samples ingestion is activated ### Changed - [#6010](https://github.com/thanos-io/thanos/pull/6010) *: Upgrade Prometheus to v0.41.0. +- [#5887](https://github.com/thanos-io/thanos/pull/5887) Tracing: Make sure rate limiting sampler is the default, as was the case in version pre-0.29.0. + +## [v0.30.1](https://github.com/thanos-io/thanos/tree/release-0.30) - 4.01.2023 + +### Fixed + +- [#6009](https://github.com/thanos-io/thanos/pull/6009) Query Frontend/Store: fix duplicate metrics registration in Redis client + +## [v0.30.0](https://github.com/thanos-io/thanos/tree/release-0.30) - 2.01.2023 -## [v0.30.0](https://github.com/thanos-io/thanos/tree/release-0.30) - in progress. +NOTE: Querier's `query.promql-engine` flag enabling new PromQL engine is now unhidden. We encourage users to use new experimental PromQL engine for efficiency reasons. ### Fixed + +- [#5716](https://github.com/thanos-io/thanos/pull/5716) DNS: Fix miekgdns resolver LookupSRV to work with CNAME records. - [#5844](https://github.com/thanos-io/thanos/pull/5844) Query Frontend: Fixes @ modifier time range when splitting queries by interval. -- [#5854](https://github.com/thanos-io/thanos/pull/5854) Query Frontend: Handles `lookback_delta` param in query frontend. +- [#5854](https://github.com/thanos-io/thanos/pull/5854) Query Frontend: `lookback_delta` param is now handled in query frontend. +- [#5860](https://github.com/thanos-io/thanos/pull/5860) Query: Fixed bug of not showing query warnings in Thanos UI. +- [#5856](https://github.com/thanos-io/thanos/pull/5856) Store: Fixed handling of debug logging flag. - [#5230](https://github.com/thanos-io/thanos/pull/5230) Rule: Stateless ruler support restoring `for` state from query API servers. The query API servers should be able to access the remote write storage. 
- [#5880](https://github.com/thanos-io/thanos/pull/5880) Query Frontend: Fixes some edge cases of query sharding analysis. - [#5893](https://github.com/thanos-io/thanos/pull/5893) Cache: Fixed redis client not respecting `SetMultiBatchSize` config value. - [#5966](https://github.com/thanos-io/thanos/pull/5966) Query: Fixed mint and maxt when selecting series for the `api/v1/series` HTTP endpoint. - [#5997](https://github.com/thanos-io/thanos/pull/5997) Rule: switch to miekgdns DNS resolver as the default one. +- [#5948](https://github.com/thanos-io/thanos/pull/5948) Store: Fixed wrong calculation of `chunks_fetched_duration`. +- [#5910](https://github.com/thanos-io/thanos/pull/5910) Receive: Fixed ketama quorum bug that could cause a success response for failed replication. This also heavily optimizes receiver CPU use. ### Added -- [#5945](https://github.com/thanos-io/thanos/pull/5945) Tools: Added new `no-downsample` marker to skip blocks when downsampling via `thanos tools bucket mark --marker=no-downsample-mark.json`. This will skip downsampling for blocks with the new marker. -- [#5814](https://github.com/thanos-io/thanos/pull/5814) Store: Add metric `thanos_bucket_store_postings_size_bytes` that shows the distribution of how many postings (in bytes) were needed for each Series() call in Thanos Store. Useful for determining limits. -- [#5801](https://github.com/thanos-io/thanos/pull/5801) Store: add a new limiter `--store.grpc.downloaded-bytes-limit` that limits the number of bytes downloaded in each Series/LabelNames/LabelValues call. Use `thanos_bucket_store_postings_size_bytes` for determining the limits. -- [#5839](https://github.com/thanos-io/thanos/pull/5839) Receive: Add parameter `--tsdb.out-of-order.time-window` to set time window for experimental out-of-order samples ingestion. Disabled by default (set to 0s). 
Please note if you enable this option and you use compactor, make sure you set the `--enable-vertical-compaction` flag, otherwise you might risk compactor halt. -- [#5836](https://github.com/thanos-io/thanos/pull/5836) Receive: Add hidden flag `tsdb.memory-snapshot-on-shutdown` to enable experimental TSDB feature to snapshot on shutdown. This is intended to speed up receiver restart. +- [#5814](https://github.com/thanos-io/thanos/pull/5814) Store: Added metric `thanos_bucket_store_postings_size_bytes` that shows the distribution of how many postings (in bytes) were needed for each Series() call in Thanos Store. Useful for determining limits. +- [#5703](https://github.com/thanos-io/thanos/pull/5703) StoreAPI: Added `hash` field to series' chunks. Store gateway and receive implements that field and proxy leverage that for quicker deduplication. +- [#5801](https://github.com/thanos-io/thanos/pull/5801) Store: Added a new flag `--store.grpc.downloaded-bytes-limit` that limits the number of bytes downloaded in each Series/LabelNames/LabelValues call. Use `thanos_bucket_store_postings_size_bytes` for determining the limits. +- [#5836](https://github.com/thanos-io/thanos/pull/5836) Receive: Added hidden flag `tsdb.memory-snapshot-on-shutdown` to enable experimental TSDB feature to snapshot on shutdown. This is intended to speed up receiver restart. +- [#5839](https://github.com/thanos-io/thanos/pull/5839) Receive: Added parameter `--tsdb.out-of-order.time-window` to set time window for experimental out-of-order samples ingestion. Disabled by default (set to 0s). Please note if you enable this option and you use compactor, make sure you set the `--enable-vertical-compaction` flag, otherwise you might risk compactor halt. +- [#5889](https://github.com/thanos-io/thanos/pull/5889) Query Frontend: Added support for vertical sharding `label_replace` and `label_join` functions. - [#5865](https://github.com/thanos-io/thanos/pull/5865) Compact: Retry on sync metas error. 
-- [#5889](https://github.com/thanos-io/thanos/pull/5889) Query Frontend: Support sharding vertical sharding `label_replace` and `label_join` functions. -- [#5819](https://github.com/thanos-io/thanos/pull/5819) Store: Add a few objectives for Store's data touched/fetched amount and sizes. They are: 50, 95, and 99 quantiles. +- [#5819](https://github.com/thanos-io/thanos/pull/5819) Store: Added a few objectives for Store's data summaries (touched/fetched amount and sizes). They are: 50, 95, and 99 quantiles. +- [#5837](https://github.com/thanos-io/thanos/pull/5837) Store: Added streaming retrieval of series from object storage. - [#5940](https://github.com/thanos-io/thanos/pull/5940) Objstore: Support for authenticating to Swift using application credentials. -- [#5977](https://github.com/thanos-io/thanos/pull/5977) Tools: Added remove flag on bucket mark command to remove deletion, no-downsample or no-compact markers on the block. +- [#5945](https://github.com/thanos-io/thanos/pull/5945) Tools: Added new `no-downsample` marker to skip blocks when downsampling via `thanos tools bucket mark --marker=no-downsample-mark.json`. This will skip downsampling for blocks with the new marker. +- [#5977](https://github.com/thanos-io/thanos/pull/5977) Tools: Added remove flag on bucket mark command to remove deletion, no-downsample or no-compact markers on the block. ### Changed -- [#5716](https://github.com/thanos-io/thanos/pull/5716) DNS: Fix miekgdns resolver LookupSRV to work with CNAME records. +- [#5785](https://github.com/thanos-io/thanos/pull/5785) Query: `thanos_store_nodes_grpc_connections` now trims `external_labels` label name longer than 1000 characters. It also allows customizations in what labels to preserve using `query.conn-metric.label` flag. +- [#5542](https://github.com/thanos-io/thanos/pull/5542) Mixin: Added query concurrency panel to Querier dashboard. 
- [#5846](https://github.com/thanos-io/thanos/pull/5846) Query Frontend: vertical query sharding supports subqueries. -- [#5909](https://github.com/thanos-io/thanos/pull/5909) Receive: compact tenant head after no appends have happened for 1.5 `tsdb.max-block-size`. - [#5593](https://github.com/thanos-io/thanos/pull/5593) Cache: switch Redis client to [Rueidis](https://github.com/rueian/rueidis). Rueidis is [faster](https://github.com/rueian/rueidis#benchmark-comparison-with-go-redis-v9) and provides [client-side caching](https://redis.io/docs/manual/client-side-caching/). It is highly recommended to use it so that repeated requests for the same key would not be needed. - [#5896](https://github.com/thanos-io/thanos/pull/5896) *: Upgrade Prometheus to v0.40.7 without implementing native histogram support. *Querying native histograms will fail with `Error executing query: invalid chunk encoding ""` and native histograms in write requests are ignored.* - [#5999](https://github.com/thanos-io/thanos/pull/5999) *: Upgrade Alertmanager dependency to v0.25.0. +- [#5909](https://github.com/thanos-io/thanos/pull/5909) Receive: Compact tenant head after no appends have happened for 1.5 `tsdb.max-block-size`. +- [#5838](https://github.com/thanos-io/thanos/pull/5838) Mixin: Added data touched type to Store dashboard. +- [#5922](https://github.com/thanos-io/thanos/pull/5922) Compact: Retry on clean, partial marked errors when possible. 
### Removed diff --git a/cmd/thanos/compact.go b/cmd/thanos/compact.go index 6cb4eae3e99..94384fa2da2 100644 --- a/cmd/thanos/compact.go +++ b/cmd/thanos/compact.go @@ -406,7 +406,6 @@ func runCompact( defer cleanMtx.Unlock() if err := sy.SyncMetas(ctx); err != nil { - cancel() return errors.Wrap(err, "syncing metas") } diff --git a/docs/release-process.md b/docs/release-process.md index 4773b6dc0e3..bee9247cca2 100644 --- a/docs/release-process.md +++ b/docs/release-process.md @@ -25,7 +25,7 @@ Release shepherd responsibilities: |---------|----------------------|-------------------------------| | v0.32.0 | (planned) 2023.03.09 | No one ATM | | v0.31.0 | (planned) 2023.01.26 | No one ATM | -| v0.30.0 | (planned) 2022.12.15 | `@bwplotka` | +| v0.30.0 | 2022.12.21 | `@bwplotka` | | v0.29.0 | 2022.10.21 | `@GiedriusS` | | v0.28.0 | 2022.08.22 | `@yeya24` | | v0.27.0 | 2022.06.21 | `@wiardvanrij` and `@matej-g` | diff --git a/pkg/tracing/jaeger/config_yaml.go b/pkg/tracing/jaeger/config_yaml.go index 71009070b3d..fae3f8c21cc 100644 --- a/pkg/tracing/jaeger/config_yaml.go +++ b/pkg/tracing/jaeger/config_yaml.go @@ -19,6 +19,13 @@ import ( tracesdk "go.opentelemetry.io/otel/sdk/trace" ) +const ( + SamplerTypeRemote = "remote" + SamplerTypeProbabilistic = "probabilistic" + SamplerTypeConstant = "const" + SamplerTypeRateLimiting = "ratelimiting" +) + type ParentBasedSamplerConfig struct { LocalParentSampled bool `yaml:"local_parent_sampled"` RemoteParentSampled bool `yaml:"remote_parent_sampled"` @@ -114,22 +121,27 @@ func getSamplingFraction(samplerType string, samplingFactor float64) float64 { func getSampler(config Config) tracesdk.Sampler { samplerType := config.SamplerType + if samplerType == "" { + samplerType = SamplerTypeRateLimiting + } samplingFraction := getSamplingFraction(samplerType, config.SamplerParam) var sampler tracesdk.Sampler switch samplerType { - case "probabilistic": - sampler = tracesdk.ParentBased(tracesdk.TraceIDRatioBased(samplingFraction)) - 
case "const": + case SamplerTypeProbabilistic: + sampler = tracesdk.TraceIDRatioBased(samplingFraction) + case SamplerTypeConstant: if samplingFraction == 1.0 { sampler = tracesdk.AlwaysSample() } else { sampler = tracesdk.NeverSample() } - case "remote": + case SamplerTypeRemote: remoteOptions := getRemoteOptions(config) sampler = jaegerremote.New(config.ServiceName, remoteOptions...) - case "ratelimiting": + case SamplerTypeRateLimiting: + fallthrough // Go cases do not fall through implicitly; share the default (rate limiting) body so sampler is never left nil. + default: // The same config options are applicable to both remote and rate-limiting samplers. remoteOptions := getRemoteOptions(config) sampler = jaegerremote.New(config.ServiceName, remoteOptions...) @@ -137,17 +149,20 @@ func getSampler(config Config) tracesdk.Sampler { if ok { sampler.Update(config.SamplerParam) } - default: - var root tracesdk.Sampler - var parentOptions []tracesdk.ParentBasedSamplerOption - if config.SamplerParentConfig.LocalParentSampled { - parentOptions = append(parentOptions, tracesdk.WithLocalParentSampled(root)) - } - if config.SamplerParentConfig.RemoteParentSampled { - parentOptions = append(parentOptions, tracesdk.WithRemoteParentSampled(root)) - } - sampler = tracesdk.ParentBased(root, parentOptions...) } + + // Use parent-based to make sure we respect the span parent, if + // it is sampled. Optionally, allow user to specify the + // parent-based options. + var parentOptions []tracesdk.ParentBasedSamplerOption + if config.SamplerParentConfig.LocalParentSampled { + parentOptions = append(parentOptions, tracesdk.WithLocalParentSampled(sampler)) + } + if config.SamplerParentConfig.RemoteParentSampled { + parentOptions = append(parentOptions, tracesdk.WithRemoteParentSampled(sampler)) + } + sampler = tracesdk.ParentBased(sampler, parentOptions...) 
+ return sampler } diff --git a/pkg/tracing/jaeger/jaeger_test.go b/pkg/tracing/jaeger/jaeger_test.go index 3c8b2f2e0bc..021b8156dc2 100644 --- a/pkg/tracing/jaeger/jaeger_test.go +++ b/pkg/tracing/jaeger/jaeger_test.go @@ -24,7 +24,7 @@ var parentConfig = ParentBasedSamplerConfig{LocalParentSampled: true} // This test shows that if sample factor will enable tracing on client process, even when it would be disabled on server // it will be still enabled for all spans within this span. -func TestContextTracing_ClientEnablesTracing(t *testing.T) { +func TestContextTracing_ClientEnablesProbabilisticTracing(t *testing.T) { exp := tracetest.NewInMemoryExporter() config := Config{ SamplerType: "probabilistic", @@ -65,7 +65,7 @@ func TestContextTracing_ClientEnablesTracing(t *testing.T) { // This test shows that if sample factor will disable tracing on client process, when it would be enabled on server // it will be still disabled for all spans within this span. -func TestContextTracing_ClientDisablesTracing(t *testing.T) { +func TestContextTracing_ClientDisablesProbabilisticTracing(t *testing.T) { exp := tracetest.NewInMemoryExporter() config := Config{ @@ -105,6 +105,45 @@ func TestContextTracing_ClientDisablesTracing(t *testing.T) { tracing.ContextTracing_ClientDisablesTracing(t, exp, clientRoot, srvRoot, srvChild) } +func TestContextTracing_ClientDisablesAlwaysOnSampling(t *testing.T) { + exp := tracetest.NewInMemoryExporter() + + config := Config{ + SamplerType: SamplerTypeConstant, + SamplerParam: 0, + } + sampler := getSampler(config) + tracerOtel := newTraceProvider( + context.Background(), + "tracerOtel", + log.NewNopLogger(), + tracesdk.NewSimpleSpanProcessor(exp), + sampler, // never sample + []attribute.KeyValue{}, + ) + tracer, _ := migration.Bridge(tracerOtel, log.NewNopLogger()) + + clientRoot, clientCtx := tracing.StartSpan(tracing.ContextWithTracer(context.Background(), tracer), "a") + + config.SamplerParam = 1 + sampler2 := getSampler(config) + // 
Simulate Server process with different tracer, but with client span in context. + srvTracerOtel := newTraceProvider( + context.Background(), + "srvTracerOtel", + log.NewNopLogger(), + tracesdk.NewSimpleSpanProcessor(exp), + sampler2, // always sample + []attribute.KeyValue{}, + ) + srvTracer, _ := migration.Bridge(srvTracerOtel, log.NewNopLogger()) + + srvRoot, srvCtx := tracing.StartSpan(tracing.ContextWithTracer(clientCtx, srvTracer), "b") + srvChild, _ := tracing.StartSpan(srvCtx, "bb") + + tracing.ContextTracing_ClientDisablesTracing(t, exp, clientRoot, srvRoot, srvChild) +} + // This test shows that if span will contain special baggage (for example from special HTTP header), even when sample // factor will disable client & server tracing, it will be still enabled for all spans within this span. func TestContextTracing_ForceTracing(t *testing.T) { From 80ad25646d7e05c15f44dc2ca1253198cb870229 Mon Sep 17 00:00:00 2001 From: Kartik-Garg Date: Tue, 17 Jan 2023 17:45:14 +0530 Subject: [PATCH 2/2] Store: Make initial sync more robust Added re-try mechanism for store initial sync, where if the initial sync fails, it tries to do the initial sync again every 5 seconds for 15 seconds duration (total 3 re-tries for initial sync of store). Signed-off-by: Kartik-Garg --- CHANGELOG.md | 1 + cmd/thanos/store.go | 20 ++++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b9aa6ad03a6..27e0f52cae5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re ### Fixed - [#5995] (https://github.com/thanos-io/thanos/pull/5993) Sidecar: Loads the TLS certificate during startup. - [#6044](https://github.com/thanos-io/thanos/pull/6044) Receive: mark ouf of window errors as conflict, if out-of-window samples ingestion is activated +- [#6050](https://github.com/thanos-io/thanos/pull/6050) Store: Re-try bucket store initial sync upon failure. 
### Changed diff --git a/cmd/thanos/store.go b/cmd/thanos/store.go index 30df09ba5e2..86dfb249c34 100644 --- a/cmd/thanos/store.go +++ b/cmd/thanos/store.go @@ -49,6 +49,11 @@ import ( "github.com/thanos-io/thanos/pkg/ui" ) +const ( + initialSyncRetryTimeout = 15 * time.Second + initialSyncRetryInterval = 5 * time.Second +) + type storeConfig struct { indexCacheConfigs extflag.PathOrContent objStoreConfig extflag.PathOrContent @@ -381,14 +386,25 @@ func runStore( level.Info(logger).Log("msg", "initializing bucket store") begin := time.Now() - if err := bs.InitialSync(ctx); err != nil { + + // Bound the retry window, deriving it from ctx so that cancelling ctx stops the retries immediately. + initialSyncCtx, cancelInitialSync := context.WithTimeout(ctx, initialSyncRetryTimeout) + defer cancelInitialSync() + + // Re-run a failed initial sync every initialSyncRetryInterval until it succeeds or the retry window closes. + err := runutil.Retry(initialSyncRetryInterval, initialSyncCtx.Done(), func() error { + return bs.InitialSync(ctx) + }) + + if err != nil { close(bucketStoreReady) return errors.Wrap(err, "bucket store initial sync") } + level.Info(logger).Log("msg", "bucket store ready", "init_duration", time.Since(begin).String()) close(bucketStoreReady) - err := runutil.Repeat(conf.syncInterval, ctx.Done(), func() error { + err = runutil.Repeat(conf.syncInterval, ctx.Done(), func() error { if err := bs.SyncBlocks(ctx); err != nil { level.Warn(logger).Log("msg", "syncing blocks failed", "err", err) }