diff --git a/CHANGELOG.md b/CHANGELOG.md index 527a284cde..0e15c2bdf4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ We use *breaking* word for marking changes that are not backward compatible (rel ## Unreleased. +- [#1338](https://github.com/improbable-eng/thanos/pull/1338) Querier still warns on store API duplicate, but allows a single one from duplicated set. This is gracefully warn about the problematic logic and not disrupt immediately. + ### Fixed - [#1327](https://github.com/improbable-eng/thanos/pull/1327) `/series` API end-point now properly returns an empty array just like Prometheus if there are no results @@ -42,6 +44,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel To migrate over the old `--gcloudtrace.*` configuration, your tracing configuration should look like this: ```yaml + --- type: STACKDRIVER config: @@ -54,7 +57,9 @@ The other `type` you can use is `JAEGER` now. The `config` keys and values are J ### Changed -- [#1284](https://github.com/improbable-eng/thanos/pull/1284) Add support for multiple label-sets in Info gRPC service. This deprecates the single `Labels` slice of the `InfoResponse`, in a future release backward compatible handling for the single set of Labels will be removed. Upgrading to v0.6.0 or higher is advised. +- [#1284](https://github.com/improbable-eng/thanos/pull/1284) Add support for multiple label-sets in Info gRPC service. +This deprecates the single `Labels` slice of the `InfoResponse`, in a future release backward compatible handling for the single set of Labels will be removed. Upgrading to v0.6.0 or higher is advised. +*breaking* If you run have duplicate queries in your Querier configuration with hierarchical federation of multiple Queries this PR makes Thanos Querier to detect this case and block all duplicates. Refer to 0.6.1 which at least allows for single replica to work. - [#1314](https://github.com/improbable-eng/thanos/pull/1314) Removes `http_request_duration_microseconds` (Summary) and adds `http_request_duration_seconds` (Histogram) from http server instrumentation used in Thanos APIs and UIs. @@ -70,7 +75,7 @@ The other `type` you can use is `JAEGER` now. The `config` keys and values are J - [#1227](https://github.com/improbable-eng/thanos/pull/1227) Some context handling issues were fixed in Thanos Compact; some unnecessary memory allocations were removed in the hot path of Thanos Store. -- [#1183](https://github.com/improbable-eng/thanos/pull/1183) Compactor now correctly propogates retriable/haltable errors which means that it will not unnecessarily restart if such an error occurs +- [#1183](https://github.com/improbable-eng/thanos/pull/1183) Compactor now correctly propagates retriable/haltable errors which means that it will not unnecessarily restart if such an error occurs - [#1231](https://github.com/improbable-eng/thanos/pull/1231) Receive now correctly handles SIGINT and closes without deadlocking diff --git a/pkg/query/storeset.go b/pkg/query/storeset.go index 49a36d8981..2c7909fcb5 100644 --- a/pkg/query/storeset.go +++ b/pkg/query/storeset.go @@ -240,21 +240,24 @@ func (s *StoreSet) Update(ctx context.Context) { // Add stores that are not yet in s.stores. for addr, store := range healthyStores { - // Check if it has some external labels specified. - // No external labels means strictly store gateway or ruler and it is fine to have access to multiple instances of them. + if _, ok := s.stores[addr]; ok { + s.updateStoreStatus(store, nil) + continue + } + + // Check if the store has some external labels specified and if any if there are duplicates. + // We warn and exclude duplicates because it unnecessarily puts additional load on querier, network and underlying StoreAPIs and + // it indicates misconfiguration. // - // Sidecar will error out if it will be configured with empty external labels. + // Note: No external labels means strictly store gateway or ruler and it is fine to have access to multiple instances of them. + // Any other component will error out if it will be configured with empty external labels. externalLabels := externalLabelsFromStore(store) - storesWithExternalLabels := externalLabelOccurrencesInStores[externalLabels] - if len(store.LabelSets()) > 0 && storesWithExternalLabels != 1 { + if len(store.LabelSets()) > 0 && externalLabelOccurrencesInStores[externalLabels] != 1 { store.close() s.updateStoreStatus(store, errors.New(droppingStoreMessage)) - level.Warn(s.logger).Log("msg", droppingStoreMessage, "address", addr, "extLset", externalLabels, "duplicates", storesWithExternalLabels) - continue - } - - if _, ok := s.stores[addr]; ok { - s.updateStoreStatus(store, nil) + level.Warn(s.logger).Log("msg", droppingStoreMessage, "address", addr, "extLset", externalLabels, "duplicates", externalLabelOccurrencesInStores[externalLabels]) + // We don't want to block all of them. Leave one to not disrupt in terms of migration. + externalLabelOccurrencesInStores[externalLabels]-- continue } @@ -262,6 +265,7 @@ func (s *StoreSet) Update(ctx context.Context) { s.updateStoreStatus(store, nil) level.Info(s.logger).Log("msg", "adding new store to query storeset", "address", addr) } + s.externalLabelOccurrencesInStores = externalLabelOccurrencesInStores s.storeNodeConnections.Set(float64(len(s.stores))) s.cleanUpStoreStatuses() diff --git a/pkg/query/storeset_test.go b/pkg/query/storeset_test.go index 994358f81b..b0bcef079e 100644 --- a/pkg/query/storeset_test.go +++ b/pkg/query/storeset_test.go @@ -241,38 +241,35 @@ func TestStoreSet_AllAvailable_BlockExtLsetDuplicates(t *testing.T) { storeExtLabels := [][]storepb.Label{ { - { - Name: "l1", - Value: "v1", - }, + {Name: "l1", Value: "v1"}, }, { - { - Name: "l1", - Value: "v2", - }, + {Name: "l1", Value: "v2"}, + {Name: "l2", Value: "v3"}, }, { - { - // Duplicate with above. - Name: "l1", - Value: "v2", - }, + // Duplicate with above. + {Name: "l1", Value: "v2"}, + {Name: "l2", Value: "v3"}, }, // Two store nodes, they don't have ext labels set. nil, nil, + { + // Duplicate with two others. + {Name: "l1", Value: "v2"}, + {Name: "l2", Value: "v3"}, + }, } - st, err := newTestStores(5, storeExtLabels...) + + st, err := newTestStores(6, storeExtLabels...) testutil.Ok(t, err) defer st.Close() - initialStoreAddr := st.StoreAddresses() - logger := log.NewLogfmtLogger(log.NewSyncWriter(os.Stderr)) logger = level.NewFilter(logger, level.AllowDebug()) logger = log.With(logger, "ts", log.DefaultTimestampUTC, "caller", log.DefaultCaller) - storeSet := NewStoreSet(logger, nil, specsFromAddrFunc(initialStoreAddr), testGRPCOpts, time.Minute) + storeSet := NewStoreSet(logger, nil, specsFromAddrFunc(st.StoreAddresses()), testGRPCOpts, time.Minute) storeSet.gRPCInfoCallTimeout = 2 * time.Second defer storeSet.Close() @@ -282,12 +279,9 @@ func TestStoreSet_AllAvailable_BlockExtLsetDuplicates(t *testing.T) { storeSet.Update(context.Background()) storeSet.Update(context.Background()) - storeNum := len(storeSet.stores) - expectedStoreNum := 5 - 2 - testutil.Assert(t, storeNum == expectedStoreNum, fmt.Sprintf("all services should respond just fine, but we expect duplicates being blocked. Expected %d stores, got %d", expectedStoreNum, storeNum)) + testutil.Assert(t, len(storeSet.stores) == 4, fmt.Sprintf("all services should respond just fine, but we expect duplicates being blocked. Expected %d stores, got %d", 4, len(storeSet.stores))) // Sort result to be able to compare. - var existingStoreLabels [][]storepb.Label for _, store := range storeSet.stores { for _, ls := range store.LabelSets() { @@ -298,14 +292,13 @@ func TestStoreSet_AllAvailable_BlockExtLsetDuplicates(t *testing.T) { return len(existingStoreLabels[i]) > len(existingStoreLabels[j]) }) - var i int - for _, lset := range existingStoreLabels { - testutil.Equals(t, storeExtLabels[i], lset) - - i++ - if i == 1 { - // Store 1 and 2 should be blocked, so fast forward. - i += 2 - } - } + testutil.Equals(t, [][]storepb.Label{ + { + {Name: "l1", Value: "v2"}, + {Name: "l2", Value: "v3"}, + }, + { + {Name: "l1", Value: "v1"}, + }, + }, existingStoreLabels) }