Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ring: Include unhealthy replicas in error message #66

Merged
merged 2 commits into from
Nov 3, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions ring/replication_strategy.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@ package ring

import (
"fmt"
"strings"
"time"

"github.com/pkg/errors"
)

type ReplicationStrategy interface {
Expand Down Expand Up @@ -40,10 +39,12 @@ func (s *defaultReplicationStrategy) Filter(instances []InstanceDesc, op Operati
// Skip those that have not heartbeated in a while. NB these are still
// included in the calculation of minSuccess, so if too many failed instances
// will cause the whole write to fail.
var unhealthy []string
for i := 0; i < len(instances); {
if instances[i].IsHealthy(op, heartbeatTimeout, now) {
i++
} else {
unhealthy = append(unhealthy, instances[i].Addr)
instances = append(instances[:i], instances[i+1:]...)
}
}
Expand All @@ -52,11 +53,15 @@ func (s *defaultReplicationStrategy) Filter(instances []InstanceDesc, op Operati
// after filtering out dead ones, don't even bother trying.
if len(instances) < minSuccess {
var err error
var unhealthyStr string
if len(unhealthy) > 0 {
unhealthyStr = fmt.Sprintf(" - unhealthy instances: %s", strings.Join(unhealthy, ","))
}

if zoneAwarenessEnabled {
err = fmt.Errorf("at least %d live replicas required across different availability zones, could only find %d", minSuccess, len(instances))
err = fmt.Errorf("at least %d live replicas required across different availability zones, could only find %d%s", minSuccess, len(instances), unhealthyStr)
} else {
err = fmt.Errorf("at least %d live replicas required, could only find %d", minSuccess, len(instances))
err = fmt.Errorf("at least %d live replicas required, could only find %d%s", minSuccess, len(instances), unhealthyStr)
}

return nil, 0, err
Expand All @@ -74,17 +79,23 @@ func NewIgnoreUnhealthyInstancesReplicationStrategy() ReplicationStrategy {
func (r *ignoreUnhealthyInstancesReplicationStrategy) Filter(instances []InstanceDesc, op Operation, _ int, heartbeatTimeout time.Duration, _ bool) (healthy []InstanceDesc, maxFailures int, err error) {
now := time.Now()
// Filter out unhealthy instances.
var unhealthy []string
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

go has a builtin strings.Builder type you could use for this.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@56quarters yeah, familiar with strings.Builder, but do you gain much in this case?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same thing you gain when you replace any code you've written with the stdlib 😄

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@56quarters discussed with @pracucci - he agrees that the current algorithm with strings.Join should be fine (it's dubious that fine tuning for perf will give us much), using strings.Builder would require some rewriting with uncertain gain versus effort.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My motivation was to reduce the amount of code we're responsible for more than performance, but that's fine 👍

for i := 0; i < len(instances); {
if instances[i].IsHealthy(op, heartbeatTimeout, now) {
i++
} else {
unhealthy = append(unhealthy, instances[i].Addr)
instances = append(instances[:i], instances[i+1:]...)
}
}

// We need at least 1 healthy instance no matter what is the replication factor set to.
if len(instances) == 0 {
return nil, 0, errors.New("at least 1 healthy replica required, could only find 0")
var unhealthyStr string
if len(unhealthy) > 0 {
unhealthyStr = fmt.Sprintf(" - unhealthy instances: %s", strings.Join(unhealthy, ","))
}
return nil, 0, fmt.Errorf("at least 1 healthy replica required, could only find 0%s", unhealthyStr)
}

return instances, len(instances) - 1, nil
Expand Down
12 changes: 6 additions & 6 deletions ring/replication_strategy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ func TestRingReplicationStrategy(t *testing.T) {
{
replicationFactor: 1,
deadIngesters: 1,
expectedError: "at least 1 live replicas required, could only find 0",
expectedError: "at least 1 live replicas required, could only find 0 - unhealthy instances: dead1",
},

// Ensure it works for RF=3 and 2 ingesters.
Expand Down Expand Up @@ -52,7 +52,7 @@ func TestRingReplicationStrategy(t *testing.T) {
replicationFactor: 3,
liveIngesters: 1,
deadIngesters: 2,
expectedError: "at least 2 live replicas required, could only find 1",
expectedError: "at least 2 live replicas required, could only find 1 - unhealthy instances: dead1,dead2",
},

// Ensure it works when adding / removing nodes.
Expand All @@ -75,7 +75,7 @@ func TestRingReplicationStrategy(t *testing.T) {
replicationFactor: 3,
liveIngesters: 2,
deadIngesters: 2,
expectedError: "at least 3 live replicas required, could only find 2",
expectedError: "at least 3 live replicas required, could only find 2 - unhealthy instances: dead1,dead2",
},
} {
ingesters := []InstanceDesc{}
Expand All @@ -85,7 +85,7 @@ func TestRingReplicationStrategy(t *testing.T) {
})
}
for i := 0; i < tc.deadIngesters; i++ {
ingesters = append(ingesters, InstanceDesc{})
ingesters = append(ingesters, InstanceDesc{Addr: fmt.Sprintf("dead%d", i+1)})
}

t.Run(fmt.Sprintf("[%d]", i), func(t *testing.T) {
Expand Down Expand Up @@ -137,7 +137,7 @@ func TestIgnoreUnhealthyInstancesReplicationStrategy(t *testing.T) {
liveIngesters: 0,
deadIngesters: 3,
expectedMaxFailure: 0,
expectedError: "at least 1 healthy replica required, could only find 0",
expectedError: "at least 1 healthy replica required, could only find 0 - unhealthy instances: dead1,dead2,dead3",
},
} {
ingesters := []InstanceDesc{}
Expand All @@ -147,7 +147,7 @@ func TestIgnoreUnhealthyInstancesReplicationStrategy(t *testing.T) {
})
}
for i := 0; i < tc.deadIngesters; i++ {
ingesters = append(ingesters, InstanceDesc{})
ingesters = append(ingesters, InstanceDesc{Addr: fmt.Sprintf("dead%d", i+1)})
}

t.Run(tc.name, func(t *testing.T) {
Expand Down