Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a lifetime manager for Vault authentication tokens #7337

Merged
merged 11 commits into from
Feb 9, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
* [FEATURE] Alertmanager API: added `-alertmanager.grafana-alertmanager-compatibility-enabled` CLI flag (and respective YAML config option) to enable experimental API endpoints that support the migration of the Grafana Alertmanager. #7057
* [FEATURE] Alertmanager: Added `-alertmanager.utf8-strict-mode-enabled` to control support for any UTF-8 character as part of Alertmanager configuration/API matchers and labels. Its default value is set to `false`. #6898
* [FEATURE] Querier: added `histogram_avg()` function support to PromQL. #7293
* [ENHANCEMENT] Vault: add lifecycle manager for token used to authenticate to Vault. #7337
pstibrany marked this conversation as resolved.
Show resolved Hide resolved
* [ENHANCEMENT] Store-gateway: add no-compact details column on store-gateway tenants admin UI. #6848
* [ENHANCEMENT] PromQL: ignore small errors for bucketQuantile #6766
* [ENHANCEMENT] Distributor: improve efficiency of some errors #6785
Expand Down
77 changes: 77 additions & 0 deletions integration/vault_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// SPDX-License-Identifier: AGPL-3.0-only
//go:build requires_docker

package integration

import (
"fmt"
"testing"

"github.com/grafana/e2e"
e2edb "github.com/grafana/e2e/db"
hashivault "github.com/hashicorp/vault/api"
"github.com/stretchr/testify/require"

"github.com/grafana/mimir/integration/e2emimir"
)

func TestVaultTokenRenewal(t *testing.T) {
s, err := e2e.NewScenario(networkName)
require.NoError(t, err)
defer s.Close()

// Initialize Vault
vault := e2e.NewHTTPService(
"vault",
"hashicorp/vault:1.13.2",
pstibrany marked this conversation as resolved.
Show resolved Hide resolved
nil,
e2e.NewHTTPReadinessProbe(8200, "/v1/sys/health", 200, 200),
8200,
pstibrany marked this conversation as resolved.
Show resolved Hide resolved
)
vault.SetEnvVars(map[string]string{"VAULT_DEV_ROOT_TOKEN_ID": "dev_token"})
require.NoError(t, s.StartAndWaitReady(vault))

cli, err := hashivault.NewClient(&hashivault.Config{Address: fmt.Sprintf("http://%s", vault.HTTPEndpoint())})
require.NoError(t, err)

cli.SetToken("dev_token")
pstibrany marked this conversation as resolved.
Show resolved Hide resolved

err = cli.Sys().EnableAuthWithOptions("userpass", &hashivault.EnableAuthOptions{
Type: "userpass",
})
require.NoError(t, err)

_, err = cli.Logical().Write("auth/userpass/users/foo", map[string]interface{}{
"password": "bar",
"ttl": "5s",
"max_ttl": "10s",
})
require.NoError(t, err)

consul := e2edb.NewConsul()
minio := e2edb.NewMinio(9000, blocksBucketName)
require.NoError(t, s.StartAndWaitReady(consul, minio))

flags := mergeFlags(
BlocksStorageFlags(),
BlocksStorageS3Flags(),
map[string]string{
"-vault.enabled": "true",
"-vault.url": fmt.Sprintf("http://%s", vault.NetworkHTTPEndpoint()),
"-vault.mount-path": "secret",
"-vault.auth.type": "userpass",
"-vault.auth.userpass.username": "foo",
"-vault.auth.userpass.password": "bar",
"-log.level": "debug",
},
)

// Start Mimir
mimir := e2emimir.NewSingleBinary("mimir-1", e2e.MergeFlags(DefaultSingleBinaryFlags(), flags))
require.NoError(t, s.StartAndWaitReady(mimir))

// Check that the token lease has been updated before hitting max_ttl
require.NoError(t, mimir.WaitSumMetrics(e2e.GreaterOrEqual(2), "cortex_vault_token_lease_renewal_total"))
// Check that re-authentication occurred
require.NoError(t, mimir.WaitSumMetrics(e2e.Equals(2), "cortex_vault_auth_total"))
}
8 changes: 6 additions & 2 deletions pkg/mimir/modules.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ func (t *Mimir) initVault() (services.Service, error) {
return nil, nil
}

v, err := vault.NewVault(t.Cfg.Vault)
v, err := vault.NewVault(t.Cfg.Vault, util_log.Logger, prometheus.WrapRegistererWithPrefix("cortex_", t.Registerer))
pstibrany marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -264,7 +264,11 @@ func (t *Mimir) initVault() (services.Service, error) {
}
}

return nil, nil
runFunc := func(ctx context.Context) error {
return t.Vault.RenewTokenLease(ctx)
}

return services.NewBasicService(nil, runFunc, nil), nil
}

func (t *Mimir) initSanityCheck() (services.Service, error) {
Expand Down
29 changes: 29 additions & 0 deletions pkg/vault/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// SPDX-License-Identifier: AGPL-3.0-only

package vault

import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)

// metrics groups the Prometheus counters maintained by the Vault token
// lifecycle management code.
type metrics struct {
// authTotal counts authentications to Vault performed during token lifecycle management.
authTotal prometheus.Counter
// authLeaseRenewalTotal counts renewals of the auth token.
authLeaseRenewalTotal prometheus.Counter
}
pstibrany marked this conversation as resolved.
Show resolved Hide resolved

// newMetrics builds the Vault counters and registers them with r through promauto.
func newMetrics(r prometheus.Registerer) *metrics {
	factory := promauto.With(r)

	// Fields are evaluated in source order, so authTotal is registered first.
	return &metrics{
		authTotal: factory.NewCounter(prometheus.CounterOpts{
			Name: "vault_auth_total",
			Help: "Total number of times authentication to Vault happened during token lifecycle management",
		}),
		authLeaseRenewalTotal: factory.NewCounter(prometheus.CounterOpts{
			Name: "vault_token_lease_renewal_total",
			Help: "Total number of times the auth token was renewed",
		}),
	}
}
95 changes: 85 additions & 10 deletions pkg/vault/vault.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,13 @@ import (
"flag"
"fmt"

"github.com/go-kit/log"
"github.com/go-kit/log/level"
hashivault "github.com/hashicorp/vault/api"
"github.com/hashicorp/vault/api/auth/approle"
"github.com/hashicorp/vault/api/auth/kubernetes"
"github.com/hashicorp/vault/api/auth/userpass"
"github.com/prometheus/client_golang/prometheus"
)

// Config for the Vault used to fetch secrets
Expand Down Expand Up @@ -58,9 +61,15 @@ type SecretsEngine interface {

// Vault bundles the secrets engine with the client state needed to keep the
// authentication token alive.
type Vault struct {
// KVStore is the secrets engine; NewVault sets it to the configured mock or to
// the client's KV v2 engine at the configured mount path.
KVStore SecretsEngine
// Auth is the auth configuration, reused by RenewTokenLease to re-authenticate.
Auth AuthConfig
pstibrany marked this conversation as resolved.
Show resolved Hide resolved

// client is the Vault API client used for the lifetime watcher and re-authentication.
client *hashivault.Client
// token is the current auth secret; RenewTokenLease replaces it after re-authenticating.
token *hashivault.Secret
// logger receives token lifecycle log messages.
logger log.Logger
// metrics holds the renewal and authentication counters.
metrics *metrics
}

func NewVault(cfg Config) (*Vault, error) {
func NewVault(cfg Config, l log.Logger, registerer prometheus.Registerer) (*Vault, error) {
if cfg.Mock != nil {
return &Vault{
KVStore: cfg.Mock,
Expand All @@ -75,19 +84,18 @@ func NewVault(cfg Config) (*Vault, error) {
return nil, err
}

authMethod, err := cfg.Auth.authMethod()
if err != nil {
return nil, err
}

authFac := authFactoryReal{}
_, err = authMethod.authenticate(context.Background(), &authFac, client)
authToken, err := getAuthToken(context.Background(), &cfg.Auth, client)
if err != nil {
return nil, fmt.Errorf("error authenticating to vault: %w", err)
return nil, fmt.Errorf("failed to get auth token from vault: %v", err)
}

vault := &Vault{
KVStore: client.KVv2(cfg.MountPath),
Auth: cfg.Auth,
token: authToken,
client: client,
logger: l,
metrics: newMetrics(registerer),
}

return vault, nil
Expand All @@ -106,14 +114,81 @@ func (v *Vault) ReadSecret(path string) ([]byte, error) {

data, ok := secret.Data["value"].(string)
if !ok {
return nil, fmt.Errorf("secret data type is not string, found %T value: %#v", secret.Data["value"], secret.Data["value"])
return nil, fmt.Errorf("secret data type is not string, found %T value: %#v at path: %s", secret.Data["value"], secret.Data["value"], path)
}

return []byte(data), nil
}

func (v *Vault) manageTokenLifecycle(ctx context.Context) error {
authTokenWatcher, err := v.client.NewLifetimeWatcher(&hashivault.LifetimeWatcherInput{
Secret: v.token,
})
if err != nil {
return fmt.Errorf("error initializing auth token lifetime watcher: %v", err)
pstibrany marked this conversation as resolved.
Show resolved Hide resolved
}

go authTokenWatcher.Start()
defer authTokenWatcher.Stop()

for {
select {
case <-ctx.Done():
return nil

case <-authTokenWatcher.DoneCh():
// Token failed to renew (e.g expired), re-auth required
return nil

case renewalInfo := <-authTokenWatcher.RenewCh():
// Token was successfully renewed
if renewalInfo.Secret.Auth != nil {
level.Debug(v.logger).Log("msg", fmt.Sprintf("token renewed, new lease: %d", renewalInfo.Secret.Auth.LeaseDuration))
pstibrany marked this conversation as resolved.
Show resolved Hide resolved
v.metrics.authLeaseRenewalTotal.Inc()
}
}
}
}

func (v *Vault) RenewTokenLease(ctx context.Context) error {
for ctx.Err() == nil {
lfcErr := v.manageTokenLifecycle(ctx)
if lfcErr != nil {
level.Error(v.logger).Log("msg", fmt.Sprintf("unable to manage token lifecycle: %v", lfcErr))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
level.Error(v.logger).Log("msg", fmt.Sprintf("unable to manage token lifecycle: %v", lfcErr))
level.Error(v.logger).Log("msg", "unable to manage token lifecycle", "err", lfcErr)

// We don't want to turn Mimir into an unready state if Vault fails here
<-ctx.Done()
pstibrany marked this conversation as resolved.
Show resolved Hide resolved
return lfcErr
}

if ctx.Err() != nil {
return ctx.Err()
}

newAuthToken, err := getAuthToken(ctx, &v.Auth, v.client)
if err != nil {
level.Error(v.logger).Log("msg", fmt.Sprintf("error during re-authentication after token expiry: %v", err))
pstibrany marked this conversation as resolved.
Show resolved Hide resolved
<-ctx.Done()
return err
}

v.metrics.authTotal.Inc()
v.token = newAuthToken
}

return nil
}

// authFactoryReal is the auth factory that getAuthToken passes to authenticate;
// its NewAppRoleAuth method delegates to approle.NewAppRoleAuth.
type authFactoryReal struct{}

// getAuthToken resolves the auth method described by authCfg and authenticates
// against Vault through client, returning the secret produced by the login.
// An error from resolving the auth method is returned unchanged.
func getAuthToken(ctx context.Context, authCfg *AuthConfig, client *hashivault.Client) (*hashivault.Secret, error) {
	method, methodErr := authCfg.authMethod()
	if methodErr != nil {
		return nil, methodErr
	}

	var factory authFactoryReal
	return method.authenticate(ctx, &factory, client)
}

func (af *authFactoryReal) NewAppRoleAuth(roleID string, secretID *approle.SecretID, opts ...approle.LoginOption) (*approle.AppRoleAuth, error) {
return approle.NewAppRoleAuth(
roleID,
Expand Down
Loading