From c9178dbd2c93086fab408047fdefceceef74bbd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Giedrius=20Statkevi=C4=8Dius?= Date: Wed, 2 Dec 2020 23:46:46 +0200 Subject: [PATCH] compact: do not cleanup blocks on boot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not cleanup blocks on boot because in some very bad cases there could be thousands of blocks ready-to-be deleted and doing that makes Thanos Compact exceed `initialDelaySeconds` on k8s. Signed-off-by: Giedrius Statkevičius --- CHANGELOG.md | 6 ++++++ cmd/thanos/compact.go | 30 ++++++++++-------------------- test/e2e/compact_test.go | 12 +++++++----- 3 files changed, 23 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e2d15ecc26..8098e77538 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,12 @@ NOTE: As semantic versioning states all 0.y.z releases can contain breaking chan We use *breaking :warning:* to mark changes that are not backward compatible (relates only to v0.y.z releases.) +## [v0.17.2](https://github.com/thanos-io/thanos/releases/tag/v0.17.2) - 2020.XX.XX + +### Fixed + +- [#3532](https://github.com/thanos-io/thanos/pull/3532) compact: do not cleanup blocks on boot. Reverts the behavior change introduced in [#3115](https://github.com/thanos-io/thanos/pull/3115) as in some very bad cases the boot of Thanos Compact took a very long time since there were a lot of blocks-to-be-cleaned. + ## [v0.17.1](https://github.com/thanos-io/thanos/releases/tag/v0.17.1) - 2020.11.24 ### Fixed diff --git a/cmd/thanos/compact.go b/cmd/thanos/compact.go index 711c90261b..d3b6ade588 100644 --- a/cmd/thanos/compact.go +++ b/cmd/thanos/compact.go @@ -113,6 +113,10 @@ func runCompact( Name: "thanos_compact_iterations_total", Help: "Total number of iterations that were executed successfully.", }) + cleanups := promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "thanos_compact_block_cleanup_loops_total", + Help: "Total number of concurrent cleanup loops of partially uploaded blocks and marked blocks that were executed successfully.", + }) partialUploadDeleteAttempts := promauto.With(reg).NewCounter(prometheus.CounterOpts{ Name: "thanos_compact_aborted_partial_uploads_deletion_attempts_total", Help: "Total number of started deletions of blocks that are assumed aborted and only partially uploaded.", @@ -336,29 +340,20 @@ func runCompact( cleanMtx.Lock() defer cleanMtx.Unlock() - // No need to resync before partial uploads and delete marked blocks. Last sync should be valid. + if err := sy.SyncMetas(ctx); err != nil { + cancel() + return errors.Wrap(err, "syncing metas") + } + compact.BestEffortCleanAbortedPartialUploads(ctx, logger, sy.Partial(), bkt, partialUploadDeleteAttempts, blocksCleaned, blockCleanupFailures) if err := blocksCleaner.DeleteMarkedBlocks(ctx); err != nil { return errors.Wrap(err, "cleaning marked blocks") } + cleanups.Inc() - if err := sy.SyncMetas(ctx); err != nil { - level.Error(logger).Log("msg", "failed to sync metas", "err", err) - } return nil } - // Do it once at the beginning to ensure that it runs at least once before - // the main loop. - if err := sy.SyncMetas(ctx); err != nil { - cancel() - return errors.Wrap(err, "syncing metas") - } - if err := cleanPartialMarked(); err != nil { - cancel() - return errors.Wrap(err, "cleaning partial and marked blocks") - } - compactMainFn := func() error { if err := compactor.Compact(ctx); err != nil { return errors.Wrap(err, "compaction") @@ -478,11 +473,6 @@ func runCompact( // since one iteration potentially could take a long time. if conf.cleanupBlocksInterval > 0 { g.Add(func() error { - // Wait the whole period at the beginning because we've executed this on boot. - select { - case <-time.After(conf.cleanupBlocksInterval): - case <-ctx.Done(): - } return runutil.Repeat(conf.cleanupBlocksInterval, ctx.Done(), cleanPartialMarked) }, func(error) { cancel() diff --git a/test/e2e/compact_test.go b/test/e2e/compact_test.go index b3deb8f19a..bfba33da5c 100644 --- a/test/e2e/compact_test.go +++ b/test/e2e/compact_test.go @@ -535,12 +535,14 @@ func TestCompactWithStoreGateway(t *testing.T) { c, err := e2ethanos.NewCompactor(s.SharedDir(), "expect-to-halt", svcConfig, nil) testutil.Ok(t, err) testutil.Ok(t, s.StartAndWaitReady(c)) - testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64(len(rawBlockIDs)+7)), "thanos_blocks_meta_synced")) - testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_sync_failures_total")) - testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_modified")) - // Expect compactor halted. + // Expect compactor halted and for one cleanup iteration to happen. testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(1), "thanos_compact_halted")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(1), "thanos_compact_block_cleanup_loops_total")) + + testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64(len(rawBlockIDs)+5)), "thanos_blocks_meta_synced")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_sync_failures_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_modified")) // The compact directory is still there. dataDir := filepath.Join(s.SharedDir(), "data", "compact", "expect-to-halt") @@ -559,8 +561,8 @@ func TestCompactWithStoreGateway(t *testing.T) { testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(2), "thanos_compact_group_compaction_runs_completed_total")) // However, the blocks have been cleaned because that happens concurrently. - testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(2), "thanos_compact_blocks_cleaned_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(2), "thanos_compact_aborted_partial_uploads_deletion_attempts_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(2), "thanos_compact_blocks_cleaned_total")) // Ensure bucket UI. ensureGETStatusCode(t, http.StatusOK, "http://"+path.Join(c.HTTPEndpoint(), "global"))