From 94d249bb2daf715b978a94a70e8c55d05e211ad7 Mon Sep 17 00:00:00 2001 From: Kartik-Garg Date: Tue, 17 Jan 2023 17:45:14 +0530 Subject: [PATCH 1/4] Store: Make initial sync more robust Added a retry mechanism for the store initial sync: if the initial sync fails, it is attempted again every 10 seconds until a 30-second timeout expires (about 3 attempts in total for the initial sync of the store). Signed-off-by: Kartik-Garg --- CHANGELOG.md | 1 + cmd/thanos/store.go | 20 ++++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 57a05310b43..4e069f2fbe8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5995](https://github.com/thanos-io/thanos/pull/5995) Sidecar: Loads the TLS certificate during startup. - [#6044](https://github.com/thanos-io/thanos/pull/6044) Receive: mark ouf of window errors as conflict, if out-of-window samples ingestion is activated +- [#6050](https://github.com/thanos-io/thanos/pull/6050) Store: Re-try bucket store initial sync upon failure. ### Changed diff --git a/cmd/thanos/store.go b/cmd/thanos/store.go index 30df09ba5e2..f3bc40c9dc2 100644 --- a/cmd/thanos/store.go +++ b/cmd/thanos/store.go @@ -49,6 +49,11 @@ import ( "github.com/thanos-io/thanos/pkg/ui" ) +const ( + timeoutDuration = 30 + intervalDuration = 10 +) + type storeConfig struct { indexCacheConfigs extflag.PathOrContent objStoreConfig extflag.PathOrContent @@ -381,14 +386,25 @@ func runStore( level.Info(logger).Log("msg", "initializing bucket store") begin := time.Now() - if err := bs.InitialSync(ctx); err != nil { + + //This will stop retrying after set timeout duration. + initialSyncCtx, cancel := context.WithTimeout(context.Background(), timeoutDuration*time.Second) + defer cancel() + + //Retry in case of error. 
+ err := runutil.Retry(intervalDuration*time.Second, initialSyncCtx.Done(), func() error { + return bs.InitialSync(ctx) + }) + + if err != nil { close(bucketStoreReady) return errors.Wrap(err, "bucket store initial sync") } + level.Info(logger).Log("msg", "bucket store ready", "init_duration", time.Since(begin).String()) close(bucketStoreReady) - err := runutil.Repeat(conf.syncInterval, ctx.Done(), func() error { + err = runutil.Repeat(conf.syncInterval, ctx.Done(), func() error { if err := bs.SyncBlocks(ctx); err != nil { level.Warn(logger).Log("msg", "syncing blocks failed", "err", err) } From 5874623535f23055e954f2506c8ee4189b49cc3f Mon Sep 17 00:00:00 2001 From: Kartik-Garg Date: Tue, 17 Jan 2023 17:45:14 +0530 Subject: [PATCH 2/4] Store: Make initial sync more robust Added a retry mechanism for the store initial sync: if the initial sync fails, it is attempted again every 10 seconds until a 30-second timeout expires (about 3 attempts in total for the initial sync of the store). Signed-off-by: Kartik-Garg --- CHANGELOG.md | 1 + cmd/thanos/store.go | 20 ++++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9af04b5ae59..aae02ffb721 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5995](https://github.com/thanos-io/thanos/pull/5995) Sidecar: Loads the TLS certificate during startup. - [#6044](https://github.com/thanos-io/thanos/pull/6044) Receive: mark ouf of window errors as conflict, if out-of-window samples ingestion is activated - [#6066](https://github.com/thanos-io/thanos/pull/6066) Tracing: fixed panic because of nil sampler +- [#6050](https://github.com/thanos-io/thanos/pull/6050) Store: Re-try bucket store initial sync upon failure. 
### Changed diff --git a/cmd/thanos/store.go b/cmd/thanos/store.go index 30df09ba5e2..f3bc40c9dc2 100644 --- a/cmd/thanos/store.go +++ b/cmd/thanos/store.go @@ -49,6 +49,11 @@ import ( "github.com/thanos-io/thanos/pkg/ui" ) +const ( + timeoutDuration = 30 + intervalDuration = 10 +) + type storeConfig struct { indexCacheConfigs extflag.PathOrContent objStoreConfig extflag.PathOrContent @@ -381,14 +386,25 @@ func runStore( level.Info(logger).Log("msg", "initializing bucket store") begin := time.Now() - if err := bs.InitialSync(ctx); err != nil { + + //This will stop retrying after set timeout duration. + initialSyncCtx, cancel := context.WithTimeout(context.Background(), timeoutDuration*time.Second) + defer cancel() + + //Retry in case of error. + err := runutil.Retry(intervalDuration*time.Second, initialSyncCtx.Done(), func() error { + return bs.InitialSync(ctx) + }) + + if err != nil { close(bucketStoreReady) return errors.Wrap(err, "bucket store initial sync") } + level.Info(logger).Log("msg", "bucket store ready", "init_duration", time.Since(begin).String()) close(bucketStoreReady) - err := runutil.Repeat(conf.syncInterval, ctx.Done(), func() error { + err = runutil.Repeat(conf.syncInterval, ctx.Done(), func() error { if err := bs.SyncBlocks(ctx); err != nil { level.Warn(logger).Log("msg", "syncing blocks failed", "err", err) } From 7ccba0162140d894e89e5ab6ac1112f2f2fdbc1a Mon Sep 17 00:00:00 2001 From: Kartik-Garg Date: Tue, 17 Jan 2023 17:45:14 +0530 Subject: [PATCH 3/4] Store: Make initial sync more robust Added a retry mechanism for the store initial sync: if the initial sync fails, it is attempted again every 10 seconds until a 30-second timeout expires (about 3 attempts in total for the initial sync of the store). 
Signed-off-by: Kartik-Garg --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aae02ffb721..7247cfbe274 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,8 +23,8 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5995](https://github.com/thanos-io/thanos/pull/5995) Sidecar: Loads the TLS certificate during startup. - [#6044](https://github.com/thanos-io/thanos/pull/6044) Receive: mark ouf of window errors as conflict, if out-of-window samples ingestion is activated -- [#6066](https://github.com/thanos-io/thanos/pull/6066) Tracing: fixed panic because of nil sampler - [#6050](https://github.com/thanos-io/thanos/pull/6050) Store: Re-try bucket store initial sync upon failure. +- [#6066](https://github.com/thanos-io/thanos/pull/6066) Tracing: fixed panic because of nil sampler ### Changed From a75b8ad6df10f7c209182c34db697d5643041efe Mon Sep 17 00:00:00 2001 From: Kartik-Garg Date: Tue, 17 Jan 2023 17:45:14 +0530 Subject: [PATCH 4/4] Store: Make initial sync more robust Added a retry mechanism for the store initial sync: if the initial sync fails, it is attempted again every 10 seconds until a 30-second timeout expires (about 3 attempts in total for the initial sync of the store). Signed-off-by: Kartik-Garg Store: Make initial sync more robust Added a retry mechanism for the store initial sync: if the initial sync fails, it is attempted again every 10 seconds until a 30-second timeout expires (about 3 attempts in total for the initial sync of the store). 
Signed-off-by: Kartik-Garg --- CHANGELOG.md | 1 + cmd/thanos/store.go | 20 ++++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9af04b5ae59..7247cfbe274 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5995](https://github.com/thanos-io/thanos/pull/5995) Sidecar: Loads the TLS certificate during startup. - [#6044](https://github.com/thanos-io/thanos/pull/6044) Receive: mark ouf of window errors as conflict, if out-of-window samples ingestion is activated +- [#6050](https://github.com/thanos-io/thanos/pull/6050) Store: Re-try bucket store initial sync upon failure. - [#6066](https://github.com/thanos-io/thanos/pull/6066) Tracing: fixed panic because of nil sampler ### Changed diff --git a/cmd/thanos/store.go b/cmd/thanos/store.go index 30df09ba5e2..f3bc40c9dc2 100644 --- a/cmd/thanos/store.go +++ b/cmd/thanos/store.go @@ -49,6 +49,11 @@ import ( "github.com/thanos-io/thanos/pkg/ui" ) +const ( + timeoutDuration = 30 + intervalDuration = 10 +) + type storeConfig struct { indexCacheConfigs extflag.PathOrContent objStoreConfig extflag.PathOrContent @@ -381,14 +386,25 @@ func runStore( level.Info(logger).Log("msg", "initializing bucket store") begin := time.Now() - if err := bs.InitialSync(ctx); err != nil { + + //This will stop retrying after set timeout duration. + initialSyncCtx, cancel := context.WithTimeout(context.Background(), timeoutDuration*time.Second) + defer cancel() + + //Retry in case of error. 
+ err := runutil.Retry(intervalDuration*time.Second, initialSyncCtx.Done(), func() error { + return bs.InitialSync(ctx) + }) + + if err != nil { close(bucketStoreReady) return errors.Wrap(err, "bucket store initial sync") } + level.Info(logger).Log("msg", "bucket store ready", "init_duration", time.Since(begin).String()) close(bucketStoreReady) - err := runutil.Repeat(conf.syncInterval, ctx.Done(), func() error { + err = runutil.Repeat(conf.syncInterval, ctx.Done(), func() error { if err := bs.SyncBlocks(ctx); err != nil { level.Warn(logger).Log("msg", "syncing blocks failed", "err", err) }