diff --git a/CHANGELOG.md b/CHANGELOG.md index f33b8015fdf..75c3bfb667e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ * [ENHANCEMENT] Memberlist KV: incoming messages are now processed on per-key goroutine. This may reduce loss of "maintanance" packets in busy memberlist installations, but use more CPU. New `memberlist_client_received_broadcasts_dropped_total` counter tracks number of dropped per-key messages. #1912 * [BUGFIX] Fix regexp parsing panic for regexp label matchers with start/end quantifiers. #1883 * [BUGFIX] Ingester: fixed deceiving error log "failed to update cached shipped blocks after shipper initialisation", occurring for each new tenant in the ingester. #1893 +* [BUGFIX] Ring: fix bug where ingesters may appear unhealthy even though they are not. #1933 ### Mixin diff --git a/go.mod b/go.mod index d68c2a07474..7c026b1b44d 100644 --- a/go.mod +++ b/go.mod @@ -18,7 +18,7 @@ require ( github.com/golang/snappy v0.0.4 github.com/google/gopacket v1.1.19 github.com/gorilla/mux v1.8.0 - github.com/grafana/dskit v0.0.0-20220523143435-f5a17a2c14c8 + github.com/grafana/dskit v0.0.0-20220526081034-789ec0ca4a3b github.com/grafana/e2e v0.1.1-0.20220519104354-1db01e4751fe github.com/hashicorp/golang-lru v0.5.4 github.com/json-iterator/go v1.1.12 diff --git a/go.sum b/go.sum index 861c2469839..96e673ba938 100644 --- a/go.sum +++ b/go.sum @@ -1047,8 +1047,8 @@ github.com/grafana-tools/sdk v0.0.0-20211220201350-966b3088eec9 h1:LQAhgcUPnzdjU github.com/grafana-tools/sdk v0.0.0-20211220201350-966b3088eec9/go.mod h1:AHHlOEv1+GGQ3ktHMlhuTUwo3zljV3QJbC0+8o2kn+4= github.com/grafana/dskit v0.0.0-20211021180445-3bd016e9d7f1/go.mod h1:uPG2nyK4CtgNDmWv7qyzYcdI+S90kHHRWvHnBtEMBXM= github.com/grafana/dskit v0.0.0-20220112093026-95274ccc858d/go.mod h1:M0/dlftwBvH7+hdNNpjMa/CUXD7gsew67mbkCuDlFXE= -github.com/grafana/dskit v0.0.0-20220523143435-f5a17a2c14c8 h1:u7rtjSEjrX/WOGTgRq8ZsNXOv62tRqhkb1gJtNTEzR8= -github.com/grafana/dskit
v0.0.0-20220523143435-f5a17a2c14c8/go.mod h1:9It/K30QPyj/FuTqBb/SYnaS4/BJCP5YL4SRfXB7dG0= +github.com/grafana/dskit v0.0.0-20220526081034-789ec0ca4a3b h1:9h79WowXGj6wErMzoi35pBECxnh7ucKJlQTs4Gs0yOI= +github.com/grafana/dskit v0.0.0-20220526081034-789ec0ca4a3b/go.mod h1:9It/K30QPyj/FuTqBb/SYnaS4/BJCP5YL4SRfXB7dG0= github.com/grafana/e2e v0.1.1-0.20220519104354-1db01e4751fe h1:mxrRWDjKtob43xF9nEhJthdtCzX35/800Sk7nE//YHQ= github.com/grafana/e2e v0.1.1-0.20220519104354-1db01e4751fe/go.mod h1:+26VJWpczg2OU3D0537acnHSHzhJORpxOs6F+M27tZo= github.com/grafana/memberlist v0.3.1-0.20220425183535-6b97a09b7167 h1:PgEQkGHR4YimSCEGT5IoswN9gJKZDVskf+he6UClCLw= diff --git a/pkg/alertmanager/alertmanager_ring.go b/pkg/alertmanager/alertmanager_ring.go index c66dfd23678..482759919a3 100644 --- a/pkg/alertmanager/alertmanager_ring.go +++ b/pkg/alertmanager/alertmanager_ring.go @@ -114,6 +114,7 @@ func (cfg *RingConfig) ToLifecyclerConfig(logger log.Logger) (ring.BasicLifecycl ID: cfg.InstanceID, Addr: fmt.Sprintf("%s:%d", instanceAddr, instancePort), HeartbeatPeriod: cfg.HeartbeatPeriod, + HeartbeatTimeout: cfg.HeartbeatTimeout, TokensObservePeriod: 0, Zone: cfg.InstanceZone, NumTokens: RingNumTokens, diff --git a/pkg/compactor/compactor_ring.go b/pkg/compactor/compactor_ring.go index 6e19b6d0792..b5932f52e4e 100644 --- a/pkg/compactor/compactor_ring.go +++ b/pkg/compactor/compactor_ring.go @@ -102,6 +102,7 @@ func (cfg *RingConfig) ToLifecyclerConfig() ring.LifecyclerConfig { lc.InfNames = cfg.InstanceInterfaceNames lc.UnregisterOnShutdown = true lc.HeartbeatPeriod = cfg.HeartbeatPeriod + lc.HeartbeatTimeout = cfg.HeartbeatTimeout lc.ObservePeriod = cfg.ObservePeriod lc.JoinAfter = 0 lc.MinReadyDuration = 0 diff --git a/pkg/compactor/compactor_ring_test.go b/pkg/compactor/compactor_ring_test.go index 3ccc35a583e..47e1848b636 100644 --- a/pkg/compactor/compactor_ring_test.go +++ b/pkg/compactor/compactor_ring_test.go @@ -51,6 +51,7 @@ func 
TestRingConfig_CustomConfigToLifecyclerConfig(t *testing.T) { // The lifecycler config should be generated based upon the compactor // ring config expected.HeartbeatPeriod = cfg.HeartbeatPeriod + expected.HeartbeatTimeout = cfg.HeartbeatTimeout expected.RingConfig.HeartbeatTimeout = cfg.HeartbeatTimeout expected.RingConfig.SubringCacheDisabled = true expected.RingConfig.KVStore.Store = "memberlist" diff --git a/pkg/distributor/distributor_ring.go b/pkg/distributor/distributor_ring.go index 377a9fd33f6..e19ae140c75 100644 --- a/pkg/distributor/distributor_ring.go +++ b/pkg/distributor/distributor_ring.go @@ -86,6 +86,7 @@ func (cfg *RingConfig) ToLifecyclerConfig() ring.LifecyclerConfig { lc.InfNames = cfg.InstanceInterfaceNames lc.UnregisterOnShutdown = true lc.HeartbeatPeriod = cfg.HeartbeatPeriod + lc.HeartbeatTimeout = cfg.HeartbeatTimeout lc.ObservePeriod = 0 lc.NumTokens = 1 lc.JoinAfter = 0 diff --git a/pkg/distributor/distributor_ring_test.go b/pkg/distributor/distributor_ring_test.go index 383bbe7627c..735879179c4 100644 --- a/pkg/distributor/distributor_ring_test.go +++ b/pkg/distributor/distributor_ring_test.go @@ -54,6 +54,7 @@ func TestRingConfig_CustomConfigToLifecyclerConfig(t *testing.T) { // The lifecycler config should be generated based upon the distributor // ring config expected.HeartbeatPeriod = cfg.HeartbeatPeriod + expected.HeartbeatTimeout = cfg.HeartbeatTimeout expected.RingConfig.HeartbeatTimeout = cfg.HeartbeatTimeout expected.RingConfig.KVStore.Store = "memberlist" expected.ID = cfg.InstanceID diff --git a/pkg/ingester/ingester_ring.go b/pkg/ingester/ingester_ring.go index 29c175025d7..02868a68e29 100644 --- a/pkg/ingester/ingester_ring.go +++ b/pkg/ingester/ingester_ring.go @@ -121,6 +121,7 @@ func (cfg *RingConfig) ToLifecyclerConfig() ring.LifecyclerConfig { lc.RingConfig = cfg.ToRingConfig() lc.NumTokens = cfg.NumTokens lc.HeartbeatPeriod = cfg.HeartbeatPeriod + lc.HeartbeatTimeout = cfg.HeartbeatTimeout lc.ObservePeriod = 
cfg.ObservePeriod lc.JoinAfter = cfg.JoinAfter lc.MinReadyDuration = cfg.MinReadyDuration diff --git a/pkg/ingester/ingester_ring_test.go b/pkg/ingester/ingester_ring_test.go index e26dd378b9a..39eeb395f51 100644 --- a/pkg/ingester/ingester_ring_test.go +++ b/pkg/ingester/ingester_ring_test.go @@ -65,6 +65,7 @@ func TestRingConfig_CustomConfigToLifecyclerConfig(t *testing.T) { expected.NumTokens = cfg.NumTokens expected.HeartbeatPeriod = cfg.HeartbeatPeriod + expected.HeartbeatTimeout = cfg.HeartbeatTimeout expected.ObservePeriod = cfg.ObservePeriod expected.JoinAfter = cfg.JoinAfter expected.MinReadyDuration = cfg.MinReadyDuration diff --git a/pkg/ruler/ruler_ring.go b/pkg/ruler/ruler_ring.go index 9e66d59e7c7..55274b943d3 100644 --- a/pkg/ruler/ruler_ring.go +++ b/pkg/ruler/ruler_ring.go @@ -90,6 +90,7 @@ func (cfg *RingConfig) ToLifecyclerConfig(logger log.Logger) (ring.BasicLifecycl ID: cfg.InstanceID, Addr: fmt.Sprintf("%s:%d", instanceAddr, instancePort), HeartbeatPeriod: cfg.HeartbeatPeriod, + HeartbeatTimeout: cfg.HeartbeatTimeout, TokensObservePeriod: 0, NumTokens: cfg.NumTokens, }, nil diff --git a/pkg/storegateway/gateway_ring.go b/pkg/storegateway/gateway_ring.go index d2c11348b0e..687004b9bb8 100644 --- a/pkg/storegateway/gateway_ring.go +++ b/pkg/storegateway/gateway_ring.go @@ -154,6 +154,7 @@ func (cfg *RingConfig) ToLifecyclerConfig(logger log.Logger) (ring.BasicLifecycl Addr: fmt.Sprintf("%s:%d", instanceAddr, instancePort), Zone: cfg.InstanceZone, HeartbeatPeriod: cfg.HeartbeatPeriod, + HeartbeatTimeout: cfg.HeartbeatTimeout, TokensObservePeriod: 0, NumTokens: RingNumTokens, KeepInstanceInTheRingOnShutdown: !cfg.UnregisterOnShutdown, diff --git a/vendor/github.com/grafana/dskit/ring/basic_lifecycler.go b/vendor/github.com/grafana/dskit/ring/basic_lifecycler.go index 1bb95c08370..780926c0d75 100644 --- a/vendor/github.com/grafana/dskit/ring/basic_lifecycler.go +++ b/vendor/github.com/grafana/dskit/ring/basic_lifecycler.go @@ -49,6 +49,7 @@ type 
BasicLifecyclerConfig struct { Zone string HeartbeatPeriod time.Duration + HeartbeatTimeout time.Duration TokensObservePeriod time.Duration NumTokens int @@ -512,5 +513,5 @@ func (l *BasicLifecycler) getRing(ctx context.Context) (*Desc, error) { } func (l *BasicLifecycler) ServeHTTP(w http.ResponseWriter, req *http.Request) { - newRingPageHandler(l, l.cfg.HeartbeatPeriod).handle(w, req) + newRingPageHandler(l, l.cfg.HeartbeatTimeout).handle(w, req) } diff --git a/vendor/github.com/grafana/dskit/ring/http.go b/vendor/github.com/grafana/dskit/ring/http.go index bcf3d1cc89a..26d28e3e5d0 100644 --- a/vendor/github.com/grafana/dskit/ring/http.go +++ b/vendor/github.com/grafana/dskit/ring/http.go @@ -53,14 +53,14 @@ type ringAccess interface { } type ringPageHandler struct { - r ringAccess - heartbeatPeriod time.Duration + r ringAccess + heartbeatTimeout time.Duration } -func newRingPageHandler(r ringAccess, heartbeatPeriod time.Duration) *ringPageHandler { +func newRingPageHandler(r ringAccess, heartbeatTimeout time.Duration) *ringPageHandler { return &ringPageHandler{ - r: r, - heartbeatPeriod: heartbeatPeriod, + r: r, + heartbeatTimeout: heartbeatTimeout, } } @@ -106,7 +106,7 @@ func (h *ringPageHandler) handle(w http.ResponseWriter, req *http.Request) { for _, id := range ingesterIDs { ing := ringDesc.Ingesters[id] state := ing.State.String() - if !ing.IsHealthy(Reporting, h.heartbeatPeriod, now) { + if !ing.IsHealthy(Reporting, h.heartbeatTimeout, now) { state = "UNHEALTHY" } diff --git a/vendor/github.com/grafana/dskit/ring/lifecycler.go b/vendor/github.com/grafana/dskit/ring/lifecycler.go index 2479ad03c8e..da5f603eec0 100644 --- a/vendor/github.com/grafana/dskit/ring/lifecycler.go +++ b/vendor/github.com/grafana/dskit/ring/lifecycler.go @@ -29,6 +29,7 @@ type LifecyclerConfig struct { // Config for the ingester lifecycle control NumTokens int `yaml:"num_tokens" category:"advanced"` HeartbeatPeriod time.Duration `yaml:"heartbeat_period" category:"advanced"` + 
HeartbeatTimeout time.Duration `yaml:"heartbeat_timeout" category:"advanced"` ObservePeriod time.Duration `yaml:"observe_period" category:"advanced"` JoinAfter time.Duration `yaml:"join_after" category:"advanced"` MinReadyDuration time.Duration `yaml:"min_ready_duration" category:"advanced"` @@ -70,6 +71,7 @@ func (cfg *LifecyclerConfig) RegisterFlagsWithPrefix(prefix string, f *flag.Flag f.IntVar(&cfg.NumTokens, prefix+"num-tokens", 128, "Number of tokens for each ingester.") f.DurationVar(&cfg.HeartbeatPeriod, prefix+"heartbeat-period", 5*time.Second, "Period at which to heartbeat to consul. 0 = disabled.") + f.DurationVar(&cfg.HeartbeatTimeout, prefix+"heartbeat-timeout", 1*time.Minute, "Heartbeat timeout after which instance is assumed to be unhealthy. 0 = disabled.") f.DurationVar(&cfg.JoinAfter, prefix+"join-after", 0*time.Second, "Period to wait for a claim from another member; will join automatically after this.") f.DurationVar(&cfg.ObservePeriod, prefix+"observe-period", 0*time.Second, "Observe tokens after generating to resolve collisions. Useful when using gossiping ring.") f.DurationVar(&cfg.MinReadyDuration, prefix+"min-ready-duration", 15*time.Second, "Minimum duration to wait after the internal readiness checks have passed but before succeeding the readiness endpoint. This is used to slowdown deployment controllers (eg. Kubernetes) after an instance is ready and before they proceed with a rolling update, to give the rest of the cluster instances enough time to receive ring updates.") @@ -885,7 +887,7 @@ func (i *Lifecycler) getRing(ctx context.Context) (*Desc, error) { } func (i *Lifecycler) ServeHTTP(w http.ResponseWriter, req *http.Request) { - newRingPageHandler(i, i.cfg.HeartbeatPeriod).handle(w, req) + newRingPageHandler(i, i.cfg.HeartbeatTimeout).handle(w, req) } // unregister removes our entry from consul. 
diff --git a/vendor/modules.txt b/vendor/modules.txt index e8ee582c35c..bdfcbbddeb5 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -434,7 +434,7 @@ github.com/gosimple/slug # github.com/grafana-tools/sdk v0.0.0-20211220201350-966b3088eec9 ## explicit; go 1.13 github.com/grafana-tools/sdk -# github.com/grafana/dskit v0.0.0-20220523143435-f5a17a2c14c8 +# github.com/grafana/dskit v0.0.0-20220526081034-789ec0ca4a3b ## explicit; go 1.17 github.com/grafana/dskit/backoff github.com/grafana/dskit/concurrency