Skip to content

Commit

Permalink
Fix -ruler.alertmanager-url flag (#2989)
Browse files Browse the repository at this point in the history
* fix: `-ruler.alertmanager-url` should be a string

This was never space separated, you needed to register the flag multiple times for it to append to the list of strings.

On top of that, the previous change implied that when using a config file you would need to provide a a list instead of a string thus breaking exisiting configuration.

Signed-off-by: gotjosh <josue@grafana.com>

* Unify the use of StringSlice flags

We had two very similar flag extensions: `Strings` and `StringSlice`.

With this, we unify its use by removing `Strings` in favour of `StringSlice` but keeping the string representation favoured in `Strings`.

The difference here is that by using `Sprintf` we would get the brackets representation as part of the string.

Signed-off-by: gotjosh <josue@grafana.com>

* Rename `api_ruler_test` to `ruler_test`

This is no longer only about the API.

Signed-off-by: gotjosh <josue@grafana.com>

* Create an integration test for Alertmanager discovery

Signed-off-by: gotjosh <josue@grafana.com>

* Fix existing test

Signed-off-by: gotjosh <josue@grafana.com>
  • Loading branch information
gotjosh committed Aug 7, 2020
1 parent 9a9648f commit 15d2d14
Show file tree
Hide file tree
Showing 14 changed files with 102 additions and 71 deletions.
14 changes: 7 additions & 7 deletions docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -1011,12 +1011,12 @@ storage:
# CLI flag: -ruler.rule-path
[rule_path: <string> | default = "/rules"]

# Space-separated list of URL(s) of the Alertmanager(s) to send notifications
# Comma-separated list of URL(s) of the Alertmanager(s) to send notifications
# to. Each Alertmanager URL is treated as a separate group in the configuration.
# Multiple Alertmanagers in HA per group can be supported by using DNS
# resolution via -ruler.alertmanager-discovery.
# CLI flag: -ruler.alertmanager-url
[alertmanager_url: <list of string> | default = ]
[alertmanager_url: <string> | default = ""]

# Use DNS SRV records to discover Alertmanager hosts.
# CLI flag: -ruler.alertmanager-discovery
Expand Down Expand Up @@ -1152,7 +1152,7 @@ The `alertmanager_config` configures the Cortex alertmanager.
# Initial peers (may be repeated).
# CLI flag: -cluster.peer
[peers: <list of string> | default = ]
[peers: <list of string> | default = []]
# Time to wait between peers to send notifications.
# CLI flag: -cluster.peer-timeout
Expand Down Expand Up @@ -1888,7 +1888,7 @@ cassandra:
# If set, when authenticating with cassandra a custom authenticator will be
# expected during the handshake. This flag can be set multiple times.
# CLI flag: -cassandra.custom-authenticator
[custom_authenticators: <list of string> | default = ]
[custom_authenticators: <list of string> | default = []]

# Timeout when connecting to cassandra.
# CLI flag: -cassandra.timeout
Expand Down Expand Up @@ -2514,7 +2514,7 @@ The `memberlist_config` configures the Gossip memberlist.
# https://cortexmetrics.io/docs/configuration/arguments/#dns-service-discovery
# for more details).
# CLI flag: -memberlist.join
[join_members: <list of string> | default = ]
[join_members: <list of string> | default = []]
# Min backoff duration to join other cluster members.
# CLI flag: -memberlist.min-join-backoff
Expand Down Expand Up @@ -2552,7 +2552,7 @@ The `memberlist_config` configures the Gossip memberlist.
# IP address to listen on for gossip messages. Multiple addresses may be
# specified. Defaults to 0.0.0.0
# CLI flag: -memberlist.bind-addr
[bind_addr: <list of string> | default = ]
[bind_addr: <list of string> | default = []]
# Port to listen on for gossip messages.
# CLI flag: -memberlist.bind-port
Expand Down Expand Up @@ -2602,7 +2602,7 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s
# ingestion within the distributor and can be repeated in order to drop multiple
# labels.
# CLI flag: -distributor.drop-label
[drop_labels: <list of string> | default = ]
[drop_labels: <list of string> | default = []]
# Maximum length accepted for label names
# CLI flag: -validation.max-length-label-name
Expand Down
89 changes: 74 additions & 15 deletions integration/api_ruler_test.go → integration/ruler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ package main

import (
"path/filepath"
"strings"
"testing"

"github.com/prometheus/prometheus/pkg/labels"
Expand All @@ -20,22 +21,9 @@ func TestRulerAPI(t *testing.T) {
var (
namespaceOne = "test_/encoded_+namespace/?"
namespaceTwo = "test_/encoded_+namespace/?/two"

recordNode = yaml.Node{}
exprNode = yaml.Node{}
)
recordNode.SetString("test_rule")
exprNode.SetString("up")
ruleGroup := rulefmt.RuleGroup{
Name: "test_encoded_+\"+group_name/?",
Interval: 100,
Rules: []rulefmt.RuleNode{
{
Record: recordNode,
Expr: exprNode,
},
},
}
ruleGroup := createTestRuleGroup(t)

s, err := e2e.NewScenario(networkName)
require.NoError(t, err)
defer s.Close()
Expand Down Expand Up @@ -144,3 +132,74 @@ func TestRulerAPISingleBinary(t *testing.T) {
require.NoError(t, cortex.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"prometheus_engine_queries"}, e2e.WithLabelMatchers(
labels.MustNewMatcher(labels.MatchEqual, "engine", "ruler"))))
}

func TestRulerAlertmanager(t *testing.T) {
var namespaceOne = "test_/encoded_+namespace/?"
ruleGroup := createTestRuleGroup(t)

s, err := e2e.NewScenario(networkName)
require.NoError(t, err)
defer s.Close()

// Start dependencies.
dynamo := e2edb.NewDynamoDB()
minio := e2edb.NewMinio(9000, RulerConfigs["-ruler.storage.s3.buckets"])
require.NoError(t, s.StartAndWaitReady(minio, dynamo))

// Have at least one alertmanager configuration.
require.NoError(t, writeFileToSharedDir(s, "alertmanager_configs/user-1.yaml", []byte(cortexAlertmanagerUserConfigYaml)))

// Start Alertmanagers.
am1 := e2ecortex.NewAlertmanager("alertmanager1", mergeFlags(AlertmanagerFlags, AlertmanagerLocalFlags), "")
require.NoError(t, s.StartAndWaitReady(am1))
am2 := e2ecortex.NewAlertmanager("alertmanager2", mergeFlags(AlertmanagerFlags, AlertmanagerLocalFlags), "")
require.NoError(t, s.StartAndWaitReady(am2))

am1URL := "http://" + am1.HTTPEndpoint()
am2URL := "http://" + am2.HTTPEndpoint()

// Connect the ruler to Alertmanagers
configOverrides := map[string]string{
"-ruler.alertmanager-url": strings.Join([]string{am1URL, am2URL}, ","),
}

// Start Ruler.
require.NoError(t, writeFileToSharedDir(s, cortexSchemaConfigFile, []byte(cortexSchemaConfigYaml)))
ruler := e2ecortex.NewRuler("ruler", mergeFlags(ChunksStorageFlags, RulerConfigs, configOverrides), "")
require.NoError(t, s.StartAndWaitReady(ruler))

// Create a client with the ruler address configured
c, err := e2ecortex.NewClient("", "", "", ruler.HTTPEndpoint(), "user-1")
require.NoError(t, err)

// Set the rule group into the ruler
require.NoError(t, c.SetRuleGroup(ruleGroup, namespaceOne))

// Wait until the user manager is created
require.NoError(t, ruler.WaitSumMetrics(e2e.Equals(1), "cortex_ruler_managers_total"))

// Wait until we've discovered the alertmanagers.
require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(2), []string{"cortex_prometheus_notifications_alertmanagers_discovered"}, e2e.WaitMissingMetrics))
}

func createTestRuleGroup(t *testing.T) rulefmt.RuleGroup {
t.Helper()

var (
recordNode = yaml.Node{}
exprNode = yaml.Node{}
)

recordNode.SetString("test_rule")
exprNode.SetString("up")
return rulefmt.RuleGroup{
Name: "test_encoded_+\"+group_name/?",
Interval: 100,
Rules: []rulefmt.RuleNode{
{
Record: recordNode,
Expr: exprNode,
},
},
}
}
2 changes: 1 addition & 1 deletion pkg/compactor/compactor_ring.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {

// Instance flags
cfg.InstanceInterfaceNames = []string{"eth0", "en0"}
f.Var((*flagext.Strings)(&cfg.InstanceInterfaceNames), "compactor.ring.instance-interface", "Name of network interface to read address from.")
f.Var((*flagext.StringSlice)(&cfg.InstanceInterfaceNames), "compactor.ring.instance-interface", "Name of network interface to read address from.")
f.StringVar(&cfg.InstanceAddr, "compactor.ring.instance-addr", "", "IP address to advertise in the ring.")
f.IntVar(&cfg.InstancePort, "compactor.ring.instance-port", 0, "Port to advertise in the ring (defaults to server.grpc-listen-port).")
f.StringVar(&cfg.InstanceID, "compactor.ring.instance-id", hostname, "Instance ID to register in the ring.")
Expand Down
2 changes: 1 addition & 1 deletion pkg/distributor/distributor_ring.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {

// Instance flags
cfg.InstanceInterfaceNames = []string{"eth0", "en0"}
f.Var((*flagext.Strings)(&cfg.InstanceInterfaceNames), "distributor.ring.instance-interface", "Name of network interface to read address from.")
f.Var((*flagext.StringSlice)(&cfg.InstanceInterfaceNames), "distributor.ring.instance-interface", "Name of network interface to read address from.")
f.StringVar(&cfg.InstanceAddr, "distributor.ring.instance-addr", "", "IP address to advertise in the ring.")
f.IntVar(&cfg.InstancePort, "distributor.ring.instance-port", 0, "Port to advertise in the ring (defaults to server.grpc-listen-port).")
f.StringVar(&cfg.InstanceID, "distributor.ring.instance-id", hostname, "Instance ID to register in the ring.")
Expand Down
2 changes: 1 addition & 1 deletion pkg/ring/kv/etcd/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ type Client struct {
// RegisterFlagsWithPrefix adds the flags required to config this to the given FlagSet.
func (cfg *Config) RegisterFlagsWithPrefix(f *flag.FlagSet, prefix string) {
cfg.Endpoints = []string{}
f.Var((*flagext.Strings)(&cfg.Endpoints), prefix+"etcd.endpoints", "The etcd endpoints to connect to.")
f.Var((*flagext.StringSlice)(&cfg.Endpoints), prefix+"etcd.endpoints", "The etcd endpoints to connect to.")
f.DurationVar(&cfg.DialTimeout, prefix+"etcd.dial-timeout", 10*time.Second, "The dial timeout for the etcd connection.")
f.IntVar(&cfg.MaxRetries, prefix+"etcd.max-retries", 10, "The maximum number of retries to do for failed ops.")
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/ring/lifecycler.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ func (cfg *LifecyclerConfig) RegisterFlagsWithPrefix(prefix string, f *flag.Flag
}

cfg.InfNames = []string{"eth0", "en0"}
f.Var((*flagext.Strings)(&cfg.InfNames), prefix+"lifecycler.interface", "Name of network interface to read address from.")
f.Var((*flagext.StringSlice)(&cfg.InfNames), prefix+"lifecycler.interface", "Name of network interface to read address from.")
f.StringVar(&cfg.Addr, prefix+"lifecycler.addr", "", "IP address to advertise in consul.")
f.IntVar(&cfg.Port, prefix+"lifecycler.port", 0, "port to advertise in consul (defaults to server.grpc-listen-port).")
f.StringVar(&cfg.ID, prefix+"lifecycler.ID", hostname, "ID to register into consul.")
Expand Down
6 changes: 4 additions & 2 deletions pkg/ruler/notifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"
"net/url"
"regexp"
"strings"
"sync"

gklog "github.com/go-kit/kit/log"
Expand Down Expand Up @@ -75,10 +76,11 @@ func (rn *rulerNotifier) stop() {
// Builds a Prometheus config.Config from a ruler.Config with just the required
// options to configure notifications to Alertmanager.
func buildNotifierConfig(rulerConfig *Config) (*config.Config, error) {
validURLs := make([]*url.URL, 0, len(rulerConfig.AlertmanagerURL))
amURLs := strings.Split(rulerConfig.AlertmanagerURL, ",")
validURLs := make([]*url.URL, 0, len(amURLs))

srvDNSregexp := regexp.MustCompile(`^_.+._.+`)
for _, h := range rulerConfig.AlertmanagerURL {
for _, h := range amURLs {
url, err := url.Parse(h)
if err != nil {
return nil, err
Expand Down
20 changes: 6 additions & 14 deletions pkg/ruler/notifier_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ func TestBuildNotifierConfig(t *testing.T) {
{
name: "with a single URL and no service discovery",
cfg: &Config{
AlertmanagerURL: []string{"http://alertmanager.default.svc.cluster.local/alertmanager"},
AlertmanagerURL: "http://alertmanager.default.svc.cluster.local/alertmanager",
},
ncfg: &config.Config{
AlertingConfig: config.AlertingConfig{
Expand All @@ -49,7 +49,7 @@ func TestBuildNotifierConfig(t *testing.T) {
{
name: "with a single URL and service discovery",
cfg: &Config{
AlertmanagerURL: []string{"http://_http._tcp.alertmanager.default.svc.cluster.local/alertmanager"},
AlertmanagerURL: "http://_http._tcp.alertmanager.default.svc.cluster.local/alertmanager",
AlertmanagerDiscovery: true,
AlertmanagerRefreshInterval: time.Duration(60),
},
Expand All @@ -74,18 +74,15 @@ func TestBuildNotifierConfig(t *testing.T) {
{
name: "with service discovery and an invalid URL",
cfg: &Config{
AlertmanagerURL: []string{"http://_http.default.svc.cluster.local/alertmanager"},
AlertmanagerURL: "http://_http.default.svc.cluster.local/alertmanager",
AlertmanagerDiscovery: true,
},
err: fmt.Errorf("when alertmanager-discovery is on, host name must be of the form _portname._tcp.service.fqdn (is \"alertmanager.default.svc.cluster.local\")"),
},
{
name: "with multiple URLs and no service discovery",
cfg: &Config{
AlertmanagerURL: []string{
"http://alertmanager-0.default.svc.cluster.local/alertmanager",
"http://alertmanager-1.default.svc.cluster.local/alertmanager",
},
AlertmanagerURL: "http://alertmanager-0.default.svc.cluster.local/alertmanager,http://alertmanager-1.default.svc.cluster.local/alertmanager",
},
ncfg: &config.Config{
AlertingConfig: config.AlertingConfig{
Expand Down Expand Up @@ -113,10 +110,7 @@ func TestBuildNotifierConfig(t *testing.T) {
{
name: "with multiple URLs and service discovery",
cfg: &Config{
AlertmanagerURL: []string{
"http://_http._tcp.alertmanager-0.default.svc.cluster.local/alertmanager",
"http://_http._tcp.alertmanager-1.default.svc.cluster.local/alertmanager",
},
AlertmanagerURL: "http://_http._tcp.alertmanager-0.default.svc.cluster.local/alertmanager,http://_http._tcp.alertmanager-1.default.svc.cluster.local/alertmanager",
AlertmanagerDiscovery: true,
AlertmanagerRefreshInterval: time.Duration(60),
},
Expand Down Expand Up @@ -152,9 +146,7 @@ func TestBuildNotifierConfig(t *testing.T) {
{
name: "with Basic Authentication",
cfg: &Config{
AlertmanagerURL: []string{
"http://marco:hunter2@alertmanager-0.default.svc.cluster.local/alertmanager",
},
AlertmanagerURL: "http://marco:hunter2@alertmanager-0.default.svc.cluster.local/alertmanager",
},
ncfg: &config.Config{
AlertingConfig: config.AlertingConfig{
Expand Down
4 changes: 2 additions & 2 deletions pkg/ruler/ruler.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ type Config struct {
RulePath string `yaml:"rule_path"`

// URL of the Alertmanager to send notifications to.
AlertmanagerURL flagext.StringSlice `yaml:"alertmanager_url"`
AlertmanagerURL string `yaml:"alertmanager_url"`
// Whether to use DNS SRV records to discover Alertmanager.
AlertmanagerDiscovery bool `yaml:"enable_alertmanager_discovery"`
// How long to wait between refreshing the list of Alertmanager based on DNS service discovery.
Expand Down Expand Up @@ -132,7 +132,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
f.DurationVar(&cfg.EvaluationDelay, "ruler.evaluation-delay-duration", 0, "Duration to delay the evaluation of rules to ensure they underlying metrics have been pushed to cortex.")
f.DurationVar(&cfg.PollInterval, "ruler.poll-interval", 1*time.Minute, "How frequently to poll for rule changes")

f.Var(&cfg.AlertmanagerURL, "ruler.alertmanager-url", "Space-separated list of URL(s) of the Alertmanager(s) to send notifications to. Each Alertmanager URL is treated as a separate group in the configuration. Multiple Alertmanagers in HA per group can be supported by using DNS resolution via -ruler.alertmanager-discovery.")
f.StringVar(&cfg.AlertmanagerURL, "ruler.alertmanager-url", "", "Comma-separated list of URL(s) of the Alertmanager(s) to send notifications to. Each Alertmanager URL is treated as a separate group in the configuration. Multiple Alertmanagers in HA per group can be supported by using DNS resolution via -ruler.alertmanager-discovery.")
f.BoolVar(&cfg.AlertmanagerDiscovery, "ruler.alertmanager-discovery", false, "Use DNS SRV records to discover Alertmanager hosts.")
f.DurationVar(&cfg.AlertmanagerRefreshInterval, "ruler.alertmanager-refresh-interval", 1*time.Minute, "How long to wait between refreshing DNS resolutions of Alertmanager hosts.")
f.BoolVar(&cfg.AlertmanangerEnableV2API, "ruler.alertmanager-use-v2", false, "If enabled requests to Alertmanager will utilize the V2 API.")
Expand Down
2 changes: 1 addition & 1 deletion pkg/ruler/ruler_ring.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {

// Instance flags
cfg.InstanceInterfaceNames = []string{"eth0", "en0"}
f.Var((*flagext.Strings)(&cfg.InstanceInterfaceNames), "ruler.ring.instance-interface", "Name of network interface to read address from.")
f.Var((*flagext.StringSlice)(&cfg.InstanceInterfaceNames), "ruler.ring.instance-interface", "Name of network interface to read address from.")
f.StringVar(&cfg.InstanceAddr, "ruler.ring.instance-addr", "", "IP address to advertise in the ring.")
f.IntVar(&cfg.InstancePort, "ruler.ring.instance-port", 0, "Port to advertise in the ring (defaults to server.grpc-listen-port).")
f.StringVar(&cfg.InstanceID, "ruler.ring.instance-id", hostname, "Instance ID to register in the ring.")
Expand Down
3 changes: 1 addition & 2 deletions pkg/ruler/ruler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,7 @@ func TestNotifierSendsUserIDHeader(t *testing.T) {
cfg, cleanup := defaultRulerConfig(newMockRuleStore(nil))
defer cleanup()

err := cfg.AlertmanagerURL.Set(ts.URL)
require.NoError(t, err)
cfg.AlertmanagerURL = ts.URL
cfg.AlertmanagerDiscovery = false

r, rcleanup := newTestRuler(t, cfg)
Expand Down
2 changes: 1 addition & 1 deletion pkg/storegateway/gateway_ring.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {

// Instance flags
cfg.InstanceInterfaceNames = []string{"eth0", "en0"}
f.Var((*flagext.Strings)(&cfg.InstanceInterfaceNames), "experimental.store-gateway.sharding-ring.instance-interface", "Name of network interface to read address from.")
f.Var((*flagext.StringSlice)(&cfg.InstanceInterfaceNames), "experimental.store-gateway.sharding-ring.instance-interface", "Name of network interface to read address from.")
f.StringVar(&cfg.InstanceAddr, "experimental.store-gateway.sharding-ring.instance-addr", "", "IP address to advertise in the ring.")
f.IntVar(&cfg.InstancePort, "experimental.store-gateway.sharding-ring.instance-port", 0, "Port to advertise in the ring (defaults to server.grpc-listen-port).")
f.StringVar(&cfg.InstanceID, "experimental.store-gateway.sharding-ring.instance-id", hostname, "Instance ID to register in the ring.")
Expand Down
19 changes: 0 additions & 19 deletions pkg/util/flagext/strings.go

This file was deleted.

6 changes: 2 additions & 4 deletions pkg/util/flagext/stringslice.go
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
package flagext

import (
"strings"
)
import "fmt"

// StringSlice is a slice of strings that implements flag.Value
type StringSlice []string

// String implements flag.Value
func (v StringSlice) String() string {
return strings.Join(v, " ")
return fmt.Sprintf("%s", []string(v))
}

// Set implements flag.Value
Expand Down

0 comments on commit 15d2d14

Please sign in to comment.