Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bugfix: native histogram: exemplars index out of range #1608

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 66 additions & 26 deletions prometheus/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -1658,7 +1658,10 @@ func addAndResetCounts(hot, cold *histogramCounts) {
type nativeExemplars struct {
sync.Mutex

ttl time.Duration
// Time-to-live for exemplars, it is set to -1 if exemplars are disabled, that is NativeHistogramMaxExemplars is below 0.
// The ttl is used on insertion to remove an exemplar that is older than ttl, if present.
ttl time.Duration

exemplars []*dto.Exemplar
}

Expand All @@ -1673,6 +1676,7 @@ func makeNativeExemplars(ttl time.Duration, maxCount int) nativeExemplars {

if maxCount < 0 {
maxCount = 0
ttl = -1
}

return nativeExemplars{
Expand All @@ -1682,20 +1686,18 @@ func makeNativeExemplars(ttl time.Duration, maxCount int) nativeExemplars {
}

func (n *nativeExemplars) addExemplar(e *dto.Exemplar) {
if cap(n.exemplars) == 0 {
if n.ttl == -1 {
bwplotka marked this conversation as resolved.
Show resolved Hide resolved
return
}

n.Lock()
defer n.Unlock()

// The index where to insert the new exemplar.
var nIdx int = -1

// When the number of exemplars has not yet exceeded or
// is equal to cap(n.exemplars), then
// insert the new exemplar directly.
if len(n.exemplars) < cap(n.exemplars) {
var nIdx int
for nIdx = 0; nIdx < len(n.exemplars); nIdx++ {
if *e.Value < *n.exemplars[nIdx].Value {
break
Expand All @@ -1705,17 +1707,46 @@ func (n *nativeExemplars) addExemplar(e *dto.Exemplar) {
return
}

if len(n.exemplars) == 1 {
// When the number of exemplars is 1, then
// replace the existing exemplar with the new exemplar.
n.exemplars[0] = e
return
}
// From this point on, the number of exemplars is greater than 1.

// When the number of exemplars exceeds the limit, remove one exemplar.
var (
rIdx int // The index where to remove the old exemplar.

ot = time.Now() // Oldest timestamp seen.
otIdx = -1 // Index of the exemplar with the oldest timestamp.

md = -1.0 // Logarithm of the delta of the closest pair of exemplars.
mdIdx = -1 // Index of the older exemplar within the closest pair.
cLog float64 // Logarithm of the current exemplar.
pLog float64 // Logarithm of the previous exemplar.
ot = time.Time{} // Oldest timestamp seen. Initial value doesn't matter as we replace it due to otIdx == -1 in the loop.
otIdx = -1 // Index of the exemplar with the oldest timestamp.

md = -1.0 // Logarithm of the delta of the closest pair of exemplars.

// The insertion point of the new exemplar in the exemplars slice after insertion.
// This is calculated purely based on the order of the exemplars by value.
// nIdx == len(n.exemplars) means the new exemplar is to be inserted after the end.
nIdx = -1

// rIdx is ultimately the index for the exemplar that we are replacing with the new exemplar.
// The aim is to keep a good spread of exemplars by value and not let them bunch up too much.
// It is calculated in 3 steps:
// 1. First we set rIdx to the index of the older exemplar within the closest pair by value.
// That is the following will be true (on log scale):
// either the exemplar pair on index (rIdx-1, rIdx) or (rIdx, rIdx+1) will have
// the closest values to each other from all pairs.
// For example, suppose the values are distributed like this:
// |-----------x-------------x----------------x----x-----|
// ^--rIdx as this is older.
// Or like this:
// |-----------x-------------x----------------x----x-----|
// ^--rIdx as this is older.
// 2. If there is an exemplar that expired, then we simple reset rIdx to that index.
// 3. We check if by inserting the new exemplar we would create a closer pair at
// (nIdx-1, nIdx) or (nIdx, nIdx+1) and set rIdx to nIdx-1 or nIdx accordingly to
// keep the spread of exemplars by value; otherwise we keep rIdx as it is.
rIdx = -1
cLog float64 // Logarithm of the current exemplar.
pLog float64 // Logarithm of the previous exemplar.
)

for i, exemplar := range n.exemplars {
Expand All @@ -1726,7 +1757,7 @@ func (n *nativeExemplars) addExemplar(e *dto.Exemplar) {
}

// Find the index at which to insert new the exemplar.
if *e.Value <= *exemplar.Value && nIdx == -1 {
if nIdx == -1 && *e.Value <= *exemplar.Value {
nIdx = i
}

Expand All @@ -1738,11 +1769,13 @@ func (n *nativeExemplars) addExemplar(e *dto.Exemplar) {
}
diff := math.Abs(cLog - pLog)
if md == -1 || diff < md {
// The closest exemplar pair is at index: i-1, i.
// Choose the exemplar with the older timestamp for replacement.
md = diff
if n.exemplars[i].Timestamp.AsTime().Before(n.exemplars[i-1].Timestamp.AsTime()) {
mdIdx = i
rIdx = i
} else {
mdIdx = i - 1
rIdx = i - 1
}
}

Expand All @@ -1753,8 +1786,12 @@ func (n *nativeExemplars) addExemplar(e *dto.Exemplar) {
if nIdx == -1 {
nIdx = len(n.exemplars)
}
// Here, we have the following relationships:
// n.exemplars[nIdx-1].Value < e.Value (if nIdx > 0)
// e.Value <= n.exemplars[nIdx].Value (if nIdx < len(n.exemplars))

if otIdx != -1 && e.Timestamp.AsTime().Sub(ot) > n.ttl {
// If the oldest exemplar has expired, then replace it with the new exemplar.
rIdx = otIdx
} else {
// In the previous for loop, when calculating the closest pair of exemplars,
Expand All @@ -1764,23 +1801,26 @@ func (n *nativeExemplars) addExemplar(e *dto.Exemplar) {
if nIdx > 0 {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did is odd, why we need this, we just set it to len if it's -1, plus we know len is > 0 because we are in the logic that has to remove something. Is it because nIdx can be zero which means we have to put exemplar upfront?

Then let's fix comment on line 1766, it's wrong:

// Here, we have the following relationships:
// n.exemplars[nIdx-1].Value < e.Value <= n.exemplars[nIdx].Value

diff := math.Abs(elog - math.Log(n.exemplars[nIdx-1].GetValue()))
if diff < md {
// The value we are about to insert is closer to the previous exemplar at the insertion point than what we calculated before in rIdx.
// v--rIdx
// |-----------x-n-----------x----------------x----x-----|
// nIdx-1--^ ^--new exemplar value
// Do not make the spread worse, replace nIdx-1 and not rIdx.
md = diff
mdIdx = nIdx
if n.exemplars[nIdx-1].Timestamp.AsTime().Before(e.Timestamp.AsTime()) {
mdIdx = nIdx - 1
}
rIdx = nIdx - 1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we not forgetting to choose larger exemplar rIdx = nIdx for diff > md case? Why we have to recalculate another diff below?

}
}
if nIdx < len(n.exemplars) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this is then guarding the case when nIdx is the last element? Then again, comment 1766 is kind of wrong, or misleading.

diff := math.Abs(math.Log(n.exemplars[nIdx].GetValue()) - elog)
if diff < md {
mdIdx = nIdx
if n.exemplars[nIdx].Timestamp.AsTime().Before(e.Timestamp.AsTime()) {
mdIdx = nIdx
}
// The value we are about to insert is closer to the next exemplar at the insertion point than what we calculated before in rIdx.
// v--rIdx
// |-----------x-----------n-x----------------x----x-----|
// new exemplar value--^ ^--nIdx
// Do not make the spread worse, replace nIdx-1 and not rIdx.
rIdx = nIdx
}
}
rIdx = mdIdx
}

// Adjust the slice according to rIdx and nIdx.
Expand Down
8 changes: 6 additions & 2 deletions prometheus/histogram_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1049,10 +1049,14 @@ func TestNativeHistogramConcurrency(t *testing.T) {

go func(vals []float64) {
start.Wait()
for _, v := range vals {
for i, v := range vals {
// An observation every 1 to 10 seconds.
atomic.AddInt64(&ts, rand.Int63n(10)+1)
his.Observe(v)
if i%2 == 0 {
his.Observe(v)
} else {
his.(ExemplarObserver).ObserveWithExemplar(v, Labels{"foo": "bar"})
}
}
end.Done()
}(vals)
Expand Down