Skip to content

Commit

Permalink
Track OOO setting via anonymous usage statistics (#2940)
Browse files Browse the repository at this point in the history
Signed-off-by: Marco Pracucci <marco@pracucci.com>

Signed-off-by: Marco Pracucci <marco@pracucci.com>
  • Loading branch information
pracucci committed Sep 15, 2022
1 parent a8d85bb commit 8af7209
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 18 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
* [CHANGE] Query-frontend: CLI flag `-query-frontend.align-querier-with-step` has been deprecated. Please use `-query-frontend.align-queries-with-step` instead. #2840
* [CHANGE] Distributor: change the default value of `-distributor.remote-timeout` to `2s` from `20s` and `-distributor.forwarding.request-timeout` to `2s` from `10s` to improve distributor resource usage when ingesters crash. #2728
* [CHANGE] Ingester: changed default setting for `-ingester.ring.readiness-check-ring-health` from `true` to `false`. #2953
* [FEATURE] Introduced anonymous usage statistics tracking (enabled by default), to help Mimir maintainers make better decisions to support the open source community. The tracking system anonymously collects non-sensitive, non-personally identifiable information about the running Mimir cluster. #2643 #2662 #2685 #2732 #2733 #2735 #2939
* [FEATURE] Introduced anonymous usage statistics tracking (enabled by default), to help Mimir maintainers make better decisions to support the open source community. The tracking system anonymously collects non-sensitive, non-personally identifiable information about the running Mimir cluster. #2643 #2662 #2685 #2732 #2733 #2735 #2939 #2940
* [FEATURE] Introduced an experimental deployment mode called read-write and running a fully featured Mimir cluster with three components: write, read and backend. The read-write deployment mode is a trade-off between the monolithic mode (only one component, no isolation) and the microservices mode (many components, high isolation). #2754 #2838
* [ENHANCEMENT] Distributor: Add `cortex_distributor_query_ingester_chunks_deduped_total` and `cortex_distributor_query_ingester_chunks_total` metrics for determining how effective ingester chunk deduplication at query time is. #2713
* [ENHANCEMENT] Upgrade Docker base images to `alpine:3.16.2`. #2729
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,12 @@ When the usage statistics reporting is enabled, Grafana Mimir collects the follo
- The `-target` parameter value, such as `all` when running Mimir in monolithic mode.
- The `-blocks-storage.backend` value, such as `s3`.
- The `-ingester.ring.replication-factor` value, such as `3`.
- The minimum and maximum values of `-ingester.out-of-order-time-window`, which can be overridden on a per-tenant basis (the tenant ID is not shared).
- Information about the Mimir **cluster scale**:
- Ingester:
- The number of in-memory series.
- The number of tenants that have in-memory series.
- The number of tenants that have out-of-order ingestion enabled.
- The number of samples and exemplars ingested.
- Querier, _where no information is tracked about the actual request or query_:
- The number of requests to queriers that are split by API endpoint type:
Expand Down
64 changes: 47 additions & 17 deletions pkg/ingester/ingester.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,14 @@ const (
newValueForTimestamp = "new-value-for-timestamp"
sampleOutOfBounds = "sample-out-of-bounds"

replicationFactorStatsName = "ingester_replication_factor"
memorySeriesStatsName = "ingester_inmemory_series"
memoryTenantsStatsName = "ingester_inmemory_tenants"
appendedSamplesStatsName = "ingester_appended_samples"
appendedExemplarsStatsName = "ingester_appended_exemplars"
replicationFactorStatsName = "ingester_replication_factor"
memorySeriesStatsName = "ingester_inmemory_series"
memoryTenantsStatsName = "ingester_inmemory_tenants"
appendedSamplesStatsName = "ingester_appended_samples"
appendedExemplarsStatsName = "ingester_appended_exemplars"
tenantsWithOutOfOrderEnabledStatName = "ingester_ooo_enabled_tenants"
minOutOfOrderTimeWindowSecondsStatName = "ingester_ooo_min_window"
maxOutOfOrderTimeWindowSecondsStatName = "ingester_ooo_max_window"
)

// BlocksUploader interface is used to have an easy way to mock it in tests.
Expand Down Expand Up @@ -238,10 +241,13 @@ type Ingester struct {
inflightPushRequests atomic.Int64

// Anonymous usage statistics tracked by ingester.
memorySeriesStats *expvar.Int
memoryTenantsStats *expvar.Int
appendedSamplesStats *usagestats.Counter
appendedExemplarsStats *usagestats.Counter
memorySeriesStats *expvar.Int
memoryTenantsStats *expvar.Int
appendedSamplesStats *usagestats.Counter
appendedExemplarsStats *usagestats.Counter
tenantsWithOutOfOrderEnabledStat *expvar.Int
minOutOfOrderTimeWindowSecondsStat *expvar.Int
maxOutOfOrderTimeWindowSecondsStat *expvar.Int
}

func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) {
Expand Down Expand Up @@ -270,10 +276,13 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus
shipTrigger: make(chan requestWithUsersAndCallback),
seriesHashCache: hashcache.NewSeriesHashCache(cfg.BlocksStorageConfig.TSDB.SeriesHashCacheMaxBytes),

memorySeriesStats: usagestats.GetAndResetInt(memorySeriesStatsName),
memoryTenantsStats: usagestats.GetAndResetInt(memoryTenantsStatsName),
appendedSamplesStats: usagestats.GetAndResetCounter(appendedSamplesStatsName),
appendedExemplarsStats: usagestats.GetAndResetCounter(appendedExemplarsStatsName),
memorySeriesStats: usagestats.GetAndResetInt(memorySeriesStatsName),
memoryTenantsStats: usagestats.GetAndResetInt(memoryTenantsStatsName),
appendedSamplesStats: usagestats.GetAndResetCounter(appendedSamplesStatsName),
appendedExemplarsStats: usagestats.GetAndResetCounter(appendedExemplarsStatsName),
tenantsWithOutOfOrderEnabledStat: usagestats.GetAndResetInt(tenantsWithOutOfOrderEnabledStatName),
minOutOfOrderTimeWindowSecondsStat: usagestats.GetAndResetInt(minOutOfOrderTimeWindowSecondsStatName),
maxOutOfOrderTimeWindowSecondsStat: usagestats.GetAndResetInt(maxOutOfOrderTimeWindowSecondsStatName),
}, nil
}

Expand Down Expand Up @@ -531,23 +540,44 @@ func (i *Ingester) updateActiveSeries(now time.Time) {
// updateUsageStats recomputes the anonymous usage statistics tracked by the
// ingester. It walks every tenant returned by getTSDBUsers and tallies:
//   - the number of tenants with at least one in-memory series,
//   - the total number of in-memory series across those tenants,
//   - the number of those tenants whose out-of-order time window is > 0,
//   - the smallest and largest positive out-of-order time window seen.
//
// The totals are then stored into the corresponding stats fields, with the
// two windows converted to whole seconds.
//
// NOTE(review): this span is rendered from a diff view with the +/- markers
// stripped, so the removed "Count only tenants..." lines appear directly
// above the "Track only tenants..." lines that replace them.
func (i *Ingester) updateUsageStats() {
memoryUsersCount := int64(0)
memorySeriesCount := int64(0)
tenantsWithOutOfOrderEnabledCount := int64(0)
// A zero minimum/maximum means no tenant with a positive out-of-order window
// has been seen yet; zero is also what gets reported in that case.
minOutOfOrderTimeWindow := time.Duration(0)
maxOutOfOrderTimeWindow := time.Duration(0)

for _, userID := range i.getTSDBUsers() {
userDB := i.getTSDB(userID)
// Skip tenant IDs for which getTSDB returns nil.
if userDB == nil {
continue
}

// Count only tenants with at least 1 series.
if numSeries := userDB.Head().NumSeries(); numSeries > 0 {
memoryUsersCount++
memorySeriesCount += int64(numSeries)
// Track only tenants with at least 1 series.
numSeries := userDB.Head().NumSeries()
if numSeries == 0 {
continue
}

memoryUsersCount++
memorySeriesCount += int64(numSeries)

// A positive out-of-order time window is treated as "out-of-order enabled".
oooWindow := time.Duration(i.limits.OutOfOrderTimeWindow(userID))
if oooWindow > 0 {
tenantsWithOutOfOrderEnabledCount++

// The first positive window seeds the minimum, since 0 is the "unset" marker.
if minOutOfOrderTimeWindow == 0 || oooWindow < minOutOfOrderTimeWindow {
minOutOfOrderTimeWindow = oooWindow
}
if oooWindow > maxOutOfOrderTimeWindow {
maxOutOfOrderTimeWindow = oooWindow
}
}
}

// Track anonymous usage stats.
i.memorySeriesStats.Set(memorySeriesCount)
i.memoryTenantsStats.Set(memoryUsersCount)
i.tenantsWithOutOfOrderEnabledStat.Set(tenantsWithOutOfOrderEnabledCount)
// The int64 conversion truncates fractional seconds, so a window shorter than
// one second is reported as 0.
i.minOutOfOrderTimeWindowSecondsStat.Set(int64(minOutOfOrderTimeWindow.Seconds()))
i.maxOutOfOrderTimeWindowSecondsStat.Set(int64(maxOutOfOrderTimeWindow.Seconds()))
}

// applyTSDBSettings goes through all tenants and applies
Expand Down
24 changes: 24 additions & 0 deletions pkg/ingester/ingester_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -822,6 +822,9 @@ func TestIngester_Push(t *testing.T) {
assert.Equal(t, int64(expectedTenantsCount), usagestats.GetInt(memoryTenantsStatsName).Value())
assert.Equal(t, int64(expectedSamplesCount), usagestats.GetCounter(appendedSamplesStatsName).Total())
assert.Equal(t, int64(expectedExemplarsCount), usagestats.GetCounter(appendedExemplarsStatsName).Total())
assert.Equal(t, int64(0), usagestats.GetInt(tenantsWithOutOfOrderEnabledStatName).Value())
assert.Equal(t, int64(0), usagestats.GetInt(minOutOfOrderTimeWindowSecondsStatName).Value())
assert.Equal(t, int64(0), usagestats.GetInt(maxOutOfOrderTimeWindowSecondsStatName).Value())
})
}
}
Expand Down Expand Up @@ -5879,8 +5882,14 @@ func Test_Ingester_OutOfOrder(t *testing.T) {
pushSamples(90, 99, true)
verifySamples(100, 100)

i.updateUsageStats()
assert.Equal(t, int64(0), usagestats.GetInt(tenantsWithOutOfOrderEnabledStatName).Value())
assert.Equal(t, int64(0), usagestats.GetInt(minOutOfOrderTimeWindowSecondsStatName).Value())
assert.Equal(t, int64(0), usagestats.GetInt(maxOutOfOrderTimeWindowSecondsStatName).Value())

// Increasing the OOO time window.
setOOOTimeWindow(model.Duration(30 * time.Minute))

// Now it works.
pushSamples(90, 99, false)
verifySamples(90, 100)
Expand All @@ -5893,15 +5902,30 @@ func Test_Ingester_OutOfOrder(t *testing.T) {
pushSamples(50, 69, true)
verifySamples(70, 100)

i.updateUsageStats()
assert.Equal(t, int64(1), usagestats.GetInt(tenantsWithOutOfOrderEnabledStatName).Value())
assert.Equal(t, int64(30*60), usagestats.GetInt(minOutOfOrderTimeWindowSecondsStatName).Value())
assert.Equal(t, int64(30*60), usagestats.GetInt(maxOutOfOrderTimeWindowSecondsStatName).Value())

// Increase the time window again. It works.
setOOOTimeWindow(model.Duration(60 * time.Minute))
pushSamples(50, 69, false)
verifySamples(50, 100)

i.updateUsageStats()
assert.Equal(t, int64(1), usagestats.GetInt(tenantsWithOutOfOrderEnabledStatName).Value())
assert.Equal(t, int64(60*60), usagestats.GetInt(minOutOfOrderTimeWindowSecondsStatName).Value())
assert.Equal(t, int64(60*60), usagestats.GetInt(maxOutOfOrderTimeWindowSecondsStatName).Value())

// Decrease the time window again. Same push should fail.
setOOOTimeWindow(model.Duration(30 * time.Minute))
pushSamples(50, 69, true)
verifySamples(50, 100)

i.updateUsageStats()
assert.Equal(t, int64(1), usagestats.GetInt(tenantsWithOutOfOrderEnabledStatName).Value())
assert.Equal(t, int64(30*60), usagestats.GetInt(minOutOfOrderTimeWindowSecondsStatName).Value())
assert.Equal(t, int64(30*60), usagestats.GetInt(maxOutOfOrderTimeWindowSecondsStatName).Value())
}

func TestNewIngestErrMsgs(t *testing.T) {
Expand Down

0 comments on commit 8af7209

Please sign in to comment.