Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a lifetime manager for Vault authentication tokens #7337

Merged
merged 11 commits into from
Feb 9, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
* [FEATURE] Alertmanager API: added `-alertmanager.grafana-alertmanager-compatibility-enabled` CLI flag (and respective YAML config option) to enable an experimental API endpoints that support the migration of the Grafana Alertmanager. #7057
* [FEATURE] Alertmanager: Added `-alertmanager.utf8-strict-mode-enabled` to control support for any UTF-8 character as part of Alertmanager configuration/API matchers and labels. It's default value is set to `false`. #6898
* [FEATURE] Querier: added `histogram_avg()` function support to PromQL. #7293
* [ENHANCEMENT] Vault: add lifecycle manager for token used to authenticate to Vault. This ensures the client token is always valid. #7337
* [ENHANCEMENT] Store-gateway: add no-compact details column on store-gateway tenants admin UI. #6848
* [ENHANCEMENT] PromQL: ignore small errors for bucketQuantile #6766
* [ENHANCEMENT] Distributor: improve efficiency of some errors #6785
Expand Down
7 changes: 7 additions & 0 deletions integration/images.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// SPDX-License-Identifier: AGPL-3.0-only

package integration

var (
VaultImage = "hashicorp/vault:1.13.2"
)
80 changes: 80 additions & 0 deletions integration/vault_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// SPDX-License-Identifier: AGPL-3.0-only
//go:build requires_docker

package integration

import (
"fmt"
"testing"

"github.com/grafana/e2e"
e2edb "github.com/grafana/e2e/db"
hashivault "github.com/hashicorp/vault/api"
"github.com/stretchr/testify/require"

"github.com/grafana/mimir/integration/e2emimir"
)

func TestVaultTokenRenewal(t *testing.T) {
const devToken = "dev_token"
const httpPort = 8200

s, err := e2e.NewScenario(networkName)
require.NoError(t, err)
defer s.Close()

// Initialize Vault
vault := e2e.NewHTTPService(
"vault",
VaultImage,
nil,
e2e.NewHTTPReadinessProbe(httpPort, "/v1/sys/health", 200, 200),
httpPort,
)
vault.SetEnvVars(map[string]string{"VAULT_DEV_ROOT_TOKEN_ID": devToken})
require.NoError(t, s.StartAndWaitReady(vault))

cli, err := hashivault.NewClient(&hashivault.Config{Address: fmt.Sprintf("http://%s", vault.HTTPEndpoint())})
require.NoError(t, err)

cli.SetToken(devToken)

err = cli.Sys().EnableAuthWithOptions("userpass", &hashivault.EnableAuthOptions{
Type: "userpass",
})
require.NoError(t, err)

_, err = cli.Logical().Write("auth/userpass/users/foo", map[string]interface{}{
"password": "bar",
"ttl": "5s",
"max_ttl": "10s",
})
require.NoError(t, err)

consul := e2edb.NewConsul()
minio := e2edb.NewMinio(9000, blocksBucketName)
require.NoError(t, s.StartAndWaitReady(consul, minio))

flags := mergeFlags(
BlocksStorageFlags(),
BlocksStorageS3Flags(),
map[string]string{
"-vault.enabled": "true",
"-vault.url": fmt.Sprintf("http://%s", vault.NetworkHTTPEndpoint()),
"-vault.mount-path": "secret",
"-vault.auth.type": "userpass",
"-vault.auth.userpass.username": "foo",
"-vault.auth.userpass.password": "bar",
"-log.level": "debug",
},
)

// Start Mimir
mimir := e2emimir.NewSingleBinary("mimir-1", e2e.MergeFlags(DefaultSingleBinaryFlags(), flags))
require.NoError(t, s.StartAndWaitReady(mimir))

// Check that the token lease has been updated before hitting max_ttl
require.NoError(t, mimir.WaitSumMetrics(e2e.GreaterOrEqual(2), "cortex_vault_token_lease_renewal_total"))
// Check that re-authentication occurred
require.NoError(t, mimir.WaitSumMetrics(e2e.Equals(2), "cortex_vault_auth_total"))
}
11 changes: 9 additions & 2 deletions pkg/mimir/modules.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ func (t *Mimir) initVault() (services.Service, error) {
return nil, nil
}

v, err := vault.NewVault(t.Cfg.Vault)
v, err := vault.NewVault(t.Cfg.Vault, util_log.Logger, t.Registerer)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -264,7 +264,14 @@ func (t *Mimir) initVault() (services.Service, error) {
}
}

return nil, nil
runFunc := func(ctx context.Context) error {
err := t.Vault.KeepRenewingTokenLease(ctx)
// We don't want to turn Mimir into an unready state if Vault fails here
<-ctx.Done()
return err
}

return services.NewBasicService(nil, runFunc, nil), nil
}

func (t *Mimir) initSanityCheck() (services.Service, error) {
Expand Down
114 changes: 100 additions & 14 deletions pkg/vault/vault.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,16 @@ import (
"errors"
"flag"
"fmt"
"time"

"github.com/go-kit/log"
"github.com/go-kit/log/level"
hashivault "github.com/hashicorp/vault/api"
"github.com/hashicorp/vault/api/auth/approle"
"github.com/hashicorp/vault/api/auth/kubernetes"
"github.com/hashicorp/vault/api/auth/userpass"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)

// Config for the Vault used to fetch secrets
Expand Down Expand Up @@ -57,13 +62,22 @@ type SecretsEngine interface {
}

type Vault struct {
KVStore SecretsEngine
kvStore SecretsEngine
auth AuthConfig

client *hashivault.Client
token *hashivault.Secret
logger log.Logger

authTotal prometheus.Counter
authLeaseRenewalSuccessTotal prometheus.Counter
authLeaseRenewalFailureTotal prometheus.Counter
}

func NewVault(cfg Config) (*Vault, error) {
func NewVault(cfg Config, l log.Logger, registerer prometheus.Registerer) (*Vault, error) {
if cfg.Mock != nil {
return &Vault{
KVStore: cfg.Mock,
kvStore: cfg.Mock,
}, nil
}

Expand All @@ -75,26 +89,36 @@ func NewVault(cfg Config) (*Vault, error) {
return nil, err
}

authMethod, err := cfg.Auth.authMethod()
if err != nil {
return nil, err
}

authFac := authFactoryReal{}
_, err = authMethod.authenticate(context.Background(), &authFac, client)
authToken, err := getAuthToken(context.Background(), &cfg.Auth, client)
if err != nil {
return nil, fmt.Errorf("error authenticating to vault: %w", err)
return nil, fmt.Errorf("failed to get auth token from vault: %v", err)
}

vault := &Vault{
KVStore: client.KVv2(cfg.MountPath),
kvStore: client.KVv2(cfg.MountPath),
auth: cfg.Auth,
token: authToken,
client: client,
logger: l,
authTotal: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
Name: "cortex_vault_auth_total",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Total number of times authentication to Vault happened during token lifecycle management

I'm not sure this description is correct. It's only how many times we started renewal process, but we don't know how many authentication attempts there were.

From user's perspective, how is this metric different from authLeaseRenewalSuccessTotal?

I wonder if we should have a single "gauge" metric that's set to 1 as long as renewal is running, and we drop cortex_vault_auth_total and cortex_vault_token_lease_renewal_failure_total.
metrics. (However I'd keep cortex_vault_token_lease_renewal_success_total)

WDYT?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So cortex_vault_auth_total is only incremented upon re-authentication after the call to getAuthToken - authLeaseRenewalSuccessTotal gets incremented within manageTokenLifecycle (there is no return statement in the case of RenewCh being received, which is why they differ)

The thought process behind this was to have a metric to track token renewal, and a metric to track re-auth.

Help: "Total number of times authentication to Vault happened during token lifecycle management",
}),
authLeaseRenewalSuccessTotal: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
Name: "cortex_vault_token_lease_renewal_total",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Name: "cortex_vault_token_lease_renewal_total",
Name: "cortex_vault_token_lease_renewal_success_total",

Help: "Total number of times the auth token was renewed successfully",
}),
authLeaseRenewalFailureTotal: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
Name: "cortex_vault_token_lease_renewal_failure_total",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion: cortex_vault_token_lease_renewal_failure_total -> cortex_vault_token_lease_renewal_failures_total

Help: "Total number of times auth token lease renewal failed",
}),
}

return vault, nil
}

func (v *Vault) ReadSecret(path string) ([]byte, error) {
secret, err := v.KVStore.Get(context.Background(), path)
secret, err := v.kvStore.Get(context.Background(), path)

if err != nil {
return nil, fmt.Errorf("unable to read secret from vault: %v", err)
Expand All @@ -106,14 +130,76 @@ func (v *Vault) ReadSecret(path string) ([]byte, error) {

data, ok := secret.Data["value"].(string)
if !ok {
return nil, fmt.Errorf("secret data type is not string, found %T value: %#v", secret.Data["value"], secret.Data["value"])
return nil, fmt.Errorf("secret data type is not string, found %T value: %#v at path: %s", secret.Data["value"], secret.Data["value"], path)
}

return []byte(data), nil
}

func (v *Vault) manageTokenLifecycle(ctx context.Context, authTokenWatcher *hashivault.LifetimeWatcher) {
go authTokenWatcher.Start()
defer authTokenWatcher.Stop()

for {
select {
case <-ctx.Done():
return

case <-authTokenWatcher.DoneCh():
// Token failed to renew (e.g expired), re-auth required
return

case renewalInfo := <-authTokenWatcher.RenewCh():
// Token was successfully renewed
if renewalInfo.Secret.Auth != nil {
level.Debug(v.logger).Log("msg", "token renewed", "leaseDuration", time.Duration(renewalInfo.Secret.Auth.LeaseDuration)*time.Second)
v.authLeaseRenewalSuccessTotal.Inc()
}
}
}
}

func (v *Vault) KeepRenewingTokenLease(ctx context.Context) error {
for ctx.Err() == nil {
authTokenWatcher, err := v.client.NewLifetimeWatcher(&hashivault.LifetimeWatcherInput{
Secret: v.token,
})
if err != nil {
v.authLeaseRenewalFailureTotal.Inc()
return fmt.Errorf("error initializing auth token lifetime watcher: %v", err)
}

v.manageTokenLifecycle(ctx, authTokenWatcher)

if ctx.Err() != nil {
return ctx.Err()
}

newAuthToken, err := getAuthToken(ctx, &v.auth, v.client)
if err != nil {
level.Error(v.logger).Log("msg", "error during re-authentication after token expiry", "err", err)
v.authLeaseRenewalFailureTotal.Inc()
return err
}

v.authTotal.Inc()
v.token = newAuthToken
}

return nil
}

type authFactoryReal struct{}

func getAuthToken(ctx context.Context, authCfg *AuthConfig, client *hashivault.Client) (*hashivault.Secret, error) {
am, err := authCfg.authMethod()
if err != nil {
return nil, err
}

return am.authenticate(ctx, &authFactoryReal{}, client)
}

func (af *authFactoryReal) NewAppRoleAuth(roleID string, secretID *approle.SecretID, opts ...approle.LoginOption) (*approle.AppRoleAuth, error) {
return approle.NewAppRoleAuth(
roleID,
Expand Down
2 changes: 1 addition & 1 deletion pkg/vault/vault_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import (
func TestReadSecret(t *testing.T) {
mockKVStore := newMockKVStore()
mimirVaultClient := Vault{
KVStore: mockKVStore,
kvStore: mockKVStore,
}

tests := map[string]struct {
Expand Down
3 changes: 3 additions & 0 deletions tools/pre-pull-images/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ func main() {
fmt.Println(images.Memcached)
fmt.Println(images.Kafka)

// vault image
fmt.Println(integration.VaultImage)

// images from previous releases
for image := range integration.DefaultPreviousVersionImages {
fmt.Println(image)
Expand Down
Loading