diff --git a/cmake/oss.cmake.in b/cmake/oss.cmake.in index e74bcdb8a3482..9529c2b8da3e2 100644 --- a/cmake/oss.cmake.in +++ b/cmake/oss.cmake.in @@ -179,7 +179,7 @@ ExternalProject_Add(fmt ExternalProject_Add(seastar GIT_REPOSITORY https://github.com/redpanda-data/seastar.git - GIT_TAG 16d4456f86e344d6c240c431045957e111ec213f + GIT_TAG 8f98d69bcbd2473eb9915204bd8fd1665e609739 INSTALL_DIR @REDPANDA_DEPS_INSTALL_DIR@ CMAKE_COMMAND ${CMAKE_COMMAND} -E env ${cmake_build_env} ${CMAKE_COMMAND} CMAKE_ARGS diff --git a/conf/redpanda.yaml b/conf/redpanda.yaml index fee180320f1ae..4dd0f404a883f 100644 --- a/conf/redpanda.yaml +++ b/conf/redpanda.yaml @@ -45,10 +45,10 @@ redpanda: developer_mode: true # Enable Pandaproxy -pandaproxy: +pandaproxy: {} # Enable Schema Registry -schema_registry: +schema_registry: {} rpk: # TLS configuration. diff --git a/src/go/k8s/README.md b/src/go/k8s/README.md index 22a5de3759c42..5cf39eec5e625 100644 --- a/src/go/k8s/README.md +++ b/src/go/k8s/README.md @@ -20,7 +20,7 @@ Official Kubernetes quick start documentation can be found at * kustomize v3.8.7 or newer * cert-manager v1.0.0 or newer -Optionaly to run operator locally: +Optionally to run operator locally: * kind v0.9.0 or newer @@ -30,7 +30,7 @@ Optionaly to run operator locally: Create local Kubernetes cluster using KIND -``` +```bash export KUBECONFIG=your/path/to/kubeconfig.yaml kind create cluster --config kind.yaml ``` @@ -47,19 +47,19 @@ resources. 
To verify that cert manager is ready please follow You can simply deploy the Redpanda operator with webhook (recommended) by running the following command -``` +```bash kubectl apply -k https://github.com/redpanda-data/redpanda/src/go/k8s/config/default ``` You can deploy the Redpanda operator without webhook by running the following command: -``` +```bash kubectl apply -k https://github.com/redpanda-data/redpanda/src/go/k8s/config/without-webhook ``` Install sample RedpandaCluster custom resource -``` +```bash kubectl apply -f https://raw.githubusercontent.com/redpanda-data/redpanda/dev/src/go/k8s/config/samples/one_node_cluster.yaml ``` @@ -68,26 +68,26 @@ kubectl apply -f https://raw.githubusercontent.com/redpanda-data/redpanda/dev/sr Create kind cluster -``` +```bash make kind-create ``` Install cert manager -``` +```bash make certmanager-install ``` Build docker images for manager and configurator -``` +```bash make docker-build make docker-build-configurator ``` Deploy operator to kind -``` +```bash make deploy-to-kind ``` @@ -96,6 +96,6 @@ make deploy-to-kind To remove all resources even the running Redpanda cluster please run the following command: -``` +```bash kubectl delete -k https://github.com/redpanda-data/redpanda/src/go/k8s/config/default ``` diff --git a/src/go/rpk/pkg/api/admin/admin.go b/src/go/rpk/pkg/api/admin/admin.go index e255975e2c283..c86b0cac8fb47 100644 --- a/src/go/rpk/pkg/api/admin/admin.go +++ b/src/go/rpk/pkg/api/admin/admin.go @@ -499,16 +499,23 @@ func maybeUnmarshalRespInto( // sendAndReceive sends a request and returns the response. If body is // non-nil, this json encodes the body and sends it with the request. +// If the body is already an io.Reader, the reader is used directly +// without marshaling. 
func (a *AdminAPI) sendAndReceive( ctx context.Context, method, url string, body interface{}, retryable bool, ) (*http.Response, error) { var r io.Reader if body != nil { - bs, err := json.Marshal(body) - if err != nil { - return nil, fmt.Errorf("unable to encode request body for %s %s: %w", method, url, err) // should not happen + // We might be passing io reader already as body, e.g: license file. + if v, ok := body.(io.Reader); ok { + r = v + } else { + bs, err := json.Marshal(body) + if err != nil { + return nil, fmt.Errorf("unable to encode request body for %s %s: %w", method, url, err) // should not happen + } + r = bytes.NewBuffer(bs) } - r = bytes.NewBuffer(bs) } req, err := http.NewRequestWithContext(ctx, method, url, r) @@ -549,7 +556,7 @@ func (a *AdminAPI) sendAndReceive( if err != nil { return nil, fmt.Errorf("request %s %s failed: %s, unable to read body: %w", method, url, status, err) } - return nil, &HTTPResponseError{Response: res, Body: resBody} + return nil, &HTTPResponseError{Response: res, Body: resBody, Method: method, URL: url} } return res, nil @@ -562,6 +569,6 @@ func (he HTTPResponseError) DecodeGenericErrorBody() (GenericErrorBody, error) { } func (he HTTPResponseError) Error() string { - return fmt.Sprintf("request %s %s failed: %s, body: %q", + return fmt.Sprintf("request %s %s failed: %s, body: %q\n", he.Method, he.URL, http.StatusText(he.Response.StatusCode), he.Body) } diff --git a/src/go/rpk/pkg/api/admin/api_features.go b/src/go/rpk/pkg/api/admin/api_features.go index 27ba6578c244b..15cce28e4a0f2 100644 --- a/src/go/rpk/pkg/api/admin/api_features.go +++ b/src/go/rpk/pkg/api/admin/api_features.go @@ -38,6 +38,18 @@ type FeaturesResponse struct { Features []Feature `json:"features"` } +type License struct { + Loaded bool `json:"loaded"` + Properties LicenseProperties `json:"license"` +} + +type LicenseProperties struct { + Version int `json:"format_version"` + Organization string `json:"org"` + Type string `json:"type"` + Expires int 
`json:"expires"` +} + // GetFeatures returns information about the available features. func (a *AdminAPI) GetFeatures(ctx context.Context) (FeaturesResponse, error) { var features FeaturesResponse @@ -48,3 +60,12 @@ func (a *AdminAPI) GetFeatures(ctx context.Context) (FeaturesResponse, error) { nil, &features) } + +func (a *AdminAPI) GetLicenseInfo(ctx context.Context) (License, error) { + var license License + return license, a.sendAny(ctx, http.MethodGet, "/v1/features/license", nil, &license) +} + +func (a *AdminAPI) SetLicense(ctx context.Context, license interface{}) error { + return a.sendToLeader(ctx, http.MethodPut, "/v1/features/license", license, nil) +} diff --git a/src/go/rpk/pkg/cli/cmd/acl.go b/src/go/rpk/pkg/cli/cmd/acl.go index aec3aa0148265..702ed32d3658b 100644 --- a/src/go/rpk/pkg/cli/cmd/acl.go +++ b/src/go/rpk/pkg/cli/cmd/acl.go @@ -38,7 +38,7 @@ func NewACLCommand(fs afero.Fs) *cobra.Command { ) command := &cobra.Command{ Use: "acl", - Short: "Manage ACLs and SASL users.", + Short: "Manage ACLs and SASL users", Long: helpACLs, Args: cobra.ExactArgs(0), Run: func(cmd *cobra.Command, _ []string) { @@ -50,7 +50,7 @@ func NewACLCommand(fs afero.Fs) *cobra.Command { }, } - command.Flags().BoolVar(&helpOperations, "help-operations", false, "Print more help about ACL operations.") + command.Flags().BoolVar(&helpOperations, "help-operations", false, "Print more help about ACL operations") common.AddKafkaFlags( command, diff --git a/src/go/rpk/pkg/cli/cmd/acl/create.go b/src/go/rpk/pkg/cli/cmd/acl/create.go index 5cc31654635d3..5ca682b998a4a 100644 --- a/src/go/rpk/pkg/cli/cmd/acl/create.go +++ b/src/go/rpk/pkg/cli/cmd/acl/create.go @@ -25,7 +25,7 @@ func NewCreateCommand(fs afero.Fs) *cobra.Command { var a acls cmd := &cobra.Command{ Use: "create", - Short: "Create ACLs.", + Short: "Create ACLs", Long: `Create ACLs. See the 'rpk acl' help text for a full write up on ACLs. 
Following the @@ -88,17 +88,17 @@ Allow write permissions to user buzz to transactional id "txn": func (a *acls) addCreateFlags(cmd *cobra.Command) { a.addDeprecatedFlags(cmd) - cmd.Flags().StringSliceVar(&a.topics, topicFlag, nil, "topic to grant ACLs for (repeatable)") - cmd.Flags().StringSliceVar(&a.groups, groupFlag, nil, "group to grant ACLs for (repeatable)") - cmd.Flags().BoolVar(&a.cluster, clusterFlag, false, "whether to grant ACLs to the cluster") - cmd.Flags().StringSliceVar(&a.txnIDs, txnIDFlag, nil, "transactional IDs to grant ACLs for (repeatable)") + cmd.Flags().StringSliceVar(&a.topics, topicFlag, nil, "Topic to grant ACLs for (repeatable)") + cmd.Flags().StringSliceVar(&a.groups, groupFlag, nil, "Group to grant ACLs for (repeatable)") + cmd.Flags().BoolVar(&a.cluster, clusterFlag, false, "Whether to grant ACLs to the cluster") + cmd.Flags().StringSliceVar(&a.txnIDs, txnIDFlag, nil, "Transactional IDs to grant ACLs for (repeatable)") - cmd.Flags().StringVar(&a.resourcePatternType, patternFlag, "literal", "pattern to use when matching resource names (literal or prefixed)") + cmd.Flags().StringVar(&a.resourcePatternType, patternFlag, "literal", "Pattern to use when matching resource names (literal or prefixed)") - cmd.Flags().StringSliceVar(&a.operations, operationFlag, nil, "operation to grant (repeatable)") + cmd.Flags().StringSliceVar(&a.operations, operationFlag, nil, "Operation to grant (repeatable)") - cmd.Flags().StringSliceVar(&a.allowPrincipals, allowPrincipalFlag, nil, "principals for which these permissions will be granted (repeatable)") - cmd.Flags().StringSliceVar(&a.allowHosts, allowHostFlag, nil, "hosts from which access will be granted (repeatable)") - cmd.Flags().StringSliceVar(&a.denyPrincipals, denyPrincipalFlag, nil, "principal for which these permissions will be denied (repeatable)") - cmd.Flags().StringSliceVar(&a.denyHosts, denyHostFlag, nil, "hosts from from access will be denied (repeatable)") + 
cmd.Flags().StringSliceVar(&a.allowPrincipals, allowPrincipalFlag, nil, "Principals for which these permissions will be granted (repeatable)") + cmd.Flags().StringSliceVar(&a.allowHosts, allowHostFlag, nil, "Hosts from which access will be granted (repeatable)") + cmd.Flags().StringSliceVar(&a.denyPrincipals, denyPrincipalFlag, nil, "Principal for which these permissions will be denied (repeatable)") + cmd.Flags().StringSliceVar(&a.denyHosts, denyHostFlag, nil, "Hosts from from access will be denied (repeatable)") } diff --git a/src/go/rpk/pkg/cli/cmd/acl/delete.go b/src/go/rpk/pkg/cli/cmd/acl/delete.go index 30f27f989b855..93576b57a6dca 100644 --- a/src/go/rpk/pkg/cli/cmd/acl/delete.go +++ b/src/go/rpk/pkg/cli/cmd/acl/delete.go @@ -31,7 +31,7 @@ func NewDeleteCommand(fs afero.Fs) *cobra.Command { ) cmd := &cobra.Command{ Use: "delete", - Short: "Delete ACLs.", + Short: "Delete ACLs", Long: `Delete ACLs. See the 'rpk acl' help text for a full write up on ACLs. Delete flags work in a @@ -94,28 +94,28 @@ resource names: }, } a.addDeleteFlags(cmd) - cmd.Flags().BoolVarP(&printAllFilters, "print-filters", "f", false, "print the filters that were requested (failed filters are always printed)") - cmd.Flags().BoolVarP(&dry, "dry", "d", false, "dry run: validate what would be deleted") - cmd.Flags().BoolVar(&noConfirm, "no-confirm", false, "disable confirmation prompt") + cmd.Flags().BoolVarP(&printAllFilters, "print-filters", "f", false, "Print the filters that were requested (failed filters are always printed)") + cmd.Flags().BoolVarP(&dry, "dry", "d", false, "Dry run: validate what would be deleted") + cmd.Flags().BoolVar(&noConfirm, "no-confirm", false, "Disable confirmation prompt") return cmd } func (a *acls) addDeleteFlags(cmd *cobra.Command) { a.addDeprecatedFlags(cmd) - cmd.Flags().StringSliceVar(&a.topics, topicFlag, nil, "topic to remove ACLs for (repeatable)") - cmd.Flags().StringSliceVar(&a.groups, groupFlag, nil, "group to remove ACLs for (repeatable)") - 
cmd.Flags().BoolVar(&a.cluster, clusterFlag, false, "whether to remove ACLs to the cluster") - cmd.Flags().StringSliceVar(&a.txnIDs, txnIDFlag, nil, "transactional IDs to remove ACLs for (repeatable)") + cmd.Flags().StringSliceVar(&a.topics, topicFlag, nil, "Topic to remove ACLs for (repeatable)") + cmd.Flags().StringSliceVar(&a.groups, groupFlag, nil, "Group to remove ACLs for (repeatable)") + cmd.Flags().BoolVar(&a.cluster, clusterFlag, false, "Whether to remove ACLs to the cluster") + cmd.Flags().StringSliceVar(&a.txnIDs, txnIDFlag, nil, "Transactional IDs to remove ACLs for (repeatable)") - cmd.Flags().StringVar(&a.resourcePatternType, patternFlag, "any", "pattern to use when matching resource names (any, match, literal, or prefixed)") + cmd.Flags().StringVar(&a.resourcePatternType, patternFlag, "any", "Pattern to use when matching resource names (any, match, literal, or prefixed)") - cmd.Flags().StringSliceVar(&a.operations, operationFlag, nil, "operation to remove (repeatable)") + cmd.Flags().StringSliceVar(&a.operations, operationFlag, nil, "Operation to remove (repeatable)") - cmd.Flags().StringSliceVar(&a.allowPrincipals, allowPrincipalFlag, nil, "allowed principal ACLs to remove (repeatable)") - cmd.Flags().StringSliceVar(&a.allowHosts, allowHostFlag, nil, "allowed host ACLs to remove (repeatable)") - cmd.Flags().StringSliceVar(&a.denyPrincipals, denyPrincipalFlag, nil, "denied principal ACLs to remove (repeatable)") - cmd.Flags().StringSliceVar(&a.denyHosts, denyHostFlag, nil, "denied host ACLs to remove (repeatable)") + cmd.Flags().StringSliceVar(&a.allowPrincipals, allowPrincipalFlag, nil, "Allowed principal ACLs to remove (repeatable)") + cmd.Flags().StringSliceVar(&a.allowHosts, allowHostFlag, nil, "Allowed host ACLs to remove (repeatable)") + cmd.Flags().StringSliceVar(&a.denyPrincipals, denyPrincipalFlag, nil, "Denied principal ACLs to remove (repeatable)") + cmd.Flags().StringSliceVar(&a.denyHosts, denyHostFlag, nil, "Denied host ACLs to remove 
(repeatable)") } func deleteReqResp( diff --git a/src/go/rpk/pkg/cli/cmd/acl/list.go b/src/go/rpk/pkg/cli/cmd/acl/list.go index 26461412cace1..6ecdb61df3163 100644 --- a/src/go/rpk/pkg/cli/cmd/acl/list.go +++ b/src/go/rpk/pkg/cli/cmd/acl/list.go @@ -28,7 +28,7 @@ func NewListCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "list", Aliases: []string{"ls", "describe"}, - Short: "List ACLs.", + Short: "List ACLs", Long: `List ACLs. See the 'rpk acl' help text for a full write up on ACLs. List flags work in a @@ -64,7 +64,7 @@ resource names: }, } a.addListFlags(cmd) - cmd.Flags().BoolVarP(&printAllFilters, "print-filters", "f", false, "print the filters that were requested (failed filters are always printed)") + cmd.Flags().BoolVarP(&printAllFilters, "print-filters", "f", false, "Print the filters that were requested (failed filters are always printed)") return cmd } @@ -79,19 +79,19 @@ func (a *acls) addListFlags(cmd *cobra.Command) { cmd.Flags().MarkDeprecated("principal", "use --{allow,deny}-{host,principal}") cmd.Flags().MarkDeprecated("host", "use --{allow,deny}-{host,principal}") - cmd.Flags().StringSliceVar(&a.topics, topicFlag, nil, "topic to match ACLs for (repeatable)") - cmd.Flags().StringSliceVar(&a.groups, groupFlag, nil, "group to match ACLs for (repeatable)") - cmd.Flags().BoolVar(&a.cluster, clusterFlag, false, "whether to match ACLs to the cluster") - cmd.Flags().StringSliceVar(&a.txnIDs, txnIDFlag, nil, "transactional IDs to match ACLs for (repeatable)") + cmd.Flags().StringSliceVar(&a.topics, topicFlag, nil, "Topic to match ACLs for (repeatable)") + cmd.Flags().StringSliceVar(&a.groups, groupFlag, nil, "Group to match ACLs for (repeatable)") + cmd.Flags().BoolVar(&a.cluster, clusterFlag, false, "Whether to match ACLs to the cluster") + cmd.Flags().StringSliceVar(&a.txnIDs, txnIDFlag, nil, "Transactional IDs to match ACLs for (repeatable)") - cmd.Flags().StringVar(&a.resourcePatternType, patternFlag, "any", "pattern to use when 
matching resource names (any, match, literal, or prefixed)") + cmd.Flags().StringVar(&a.resourcePatternType, patternFlag, "any", "Pattern to use when matching resource names (any, match, literal, or prefixed)") - cmd.Flags().StringSliceVar(&a.operations, operationFlag, nil, "operation to match (repeatable)") + cmd.Flags().StringSliceVar(&a.operations, operationFlag, nil, "Operation to match (repeatable)") - cmd.Flags().StringSliceVar(&a.allowPrincipals, allowPrincipalFlag, nil, "allowed principal ACLs to match (repeatable)") - cmd.Flags().StringSliceVar(&a.allowHosts, allowHostFlag, nil, "allowed host ACLs to match (repeatable)") - cmd.Flags().StringSliceVar(&a.denyPrincipals, denyPrincipalFlag, nil, "denied principal ACLs to match (repeatable)") - cmd.Flags().StringSliceVar(&a.denyHosts, denyHostFlag, nil, "denied host ACLs to match (repeatable)") + cmd.Flags().StringSliceVar(&a.allowPrincipals, allowPrincipalFlag, nil, "Allowed principal ACLs to match (repeatable)") + cmd.Flags().StringSliceVar(&a.allowHosts, allowHostFlag, nil, "Allowed host ACLs to match (repeatable)") + cmd.Flags().StringSliceVar(&a.denyPrincipals, denyPrincipalFlag, nil, "Denied principal ACLs to match (repeatable)") + cmd.Flags().StringSliceVar(&a.denyHosts, denyHostFlag, nil, "Denied host ACLs to match (repeatable)") } func describeReqResp( diff --git a/src/go/rpk/pkg/cli/cmd/acl/user.go b/src/go/rpk/pkg/cli/cmd/acl/user.go index dd26aad511a21..847808b951a66 100644 --- a/src/go/rpk/pkg/cli/cmd/acl/user.go +++ b/src/go/rpk/pkg/cli/cmd/acl/user.go @@ -24,7 +24,7 @@ func NewUserCommand(fs afero.Fs) *cobra.Command { var apiUrls []string cmd := &cobra.Command{ Use: "user", - Short: "Manage SASL users.", + Short: "Manage SASL users", Long: `Manage SASL users. If SASL is enabled, a SASL user is what you use to talk to Redpanda, and ACLs @@ -39,7 +39,7 @@ redpanda section of your redpanda.yaml. 
config.FlagAdminHosts2, []string{}, "The comma-separated list of Admin API addresses (:)."+ - " You must specify one for each node.", + " You must specify one for each node", ) cmd.AddCommand(NewCreateUserCommand(fs)) @@ -59,7 +59,7 @@ func NewCreateUserCommand(fs afero.Fs) *cobra.Command { var userOld, pass, passOld, mechanism string cmd := &cobra.Command{ Use: "create [USER] -p [PASS]", - Short: "Create a SASL user.", + Short: "Create a SASL user", Long: `Create a SASL user. This command creates a single SASL user with the given password, optionally @@ -120,11 +120,11 @@ acl help text for more info. } cmd.Flags().StringVar(&userOld, "new-username", "", "") - cmd.Flags().MarkDeprecated("new-username", "the username now does not require a flag") // Oct 2021 + cmd.Flags().MarkDeprecated("new-username", "The username now does not require a flag") // Oct 2021 - cmd.Flags().StringVarP(&pass, "password", "p", "", "new user's password") + cmd.Flags().StringVarP(&pass, "password", "p", "", "New user's password") cmd.Flags().StringVar(&passOld, "new-password", "", "") - cmd.Flags().MarkDeprecated("new-password", "renamed to --password") // Oct 2021 + cmd.Flags().MarkDeprecated("new-password", "Renamed to --password") // Oct 2021 cmd.Flags().StringVar( &mechanism, @@ -140,7 +140,7 @@ func NewDeleteUserCommand(fs afero.Fs) *cobra.Command { var oldUser string cmd := &cobra.Command{ Use: "delete [USER]", - Short: "Delete a SASL user.", + Short: "Delete a SASL user", Long: `Delete a SASL user. This command deletes the specified SASL account from Redpanda. This does not @@ -174,7 +174,7 @@ delete any ACLs that may exist for this user. 
} cmd.Flags().StringVar(&oldUser, "delete-username", "", "The user to be deleted") - cmd.Flags().MarkDeprecated("delete-username", "the username now does not require a flag") + cmd.Flags().MarkDeprecated("delete-username", "The username now does not require a flag") return cmd } @@ -183,7 +183,7 @@ func NewListUsersCommand(fs afero.Fs) *cobra.Command { return &cobra.Command{ Use: "list", Aliases: []string{"ls"}, - Short: "List SASL users.", + Short: "List SASL users", Run: func(cmd *cobra.Command, _ []string) { p := config.ParamsFromCommand(cmd) cfg, err := p.Load(fs) diff --git a/src/go/rpk/pkg/cli/cmd/cluster.go b/src/go/rpk/pkg/cli/cmd/cluster.go index c20e6dbdc51c1..1c11c28ad7a22 100644 --- a/src/go/rpk/pkg/cli/cmd/cluster.go +++ b/src/go/rpk/pkg/cli/cmd/cluster.go @@ -12,6 +12,7 @@ package cmd import ( "github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster" "github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster/config" + "github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster/license" "github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster/maintenance" "github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/common" "github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/group" @@ -33,7 +34,7 @@ func NewClusterCommand(fs afero.Fs) *cobra.Command { ) command := &cobra.Command{ Use: "cluster", - Short: "Interact with a Redpanda cluster.", + Short: "Interact with a Redpanda cluster", } // backcompat: until we switch to -X, we need these flags. 
common.AddKafkaFlags( @@ -54,11 +55,13 @@ func NewClusterCommand(fs afero.Fs) *cobra.Command { offsets.Deprecated = "replaced by 'rpk group describe'" offsets.Hidden = true offsets.Use = "offsets" - command.AddCommand(offsets) - - command.AddCommand(config.NewConfigCommand(fs)) - command.AddCommand(maintenance.NewMaintenanceCommand(fs)) - command.AddCommand(cluster.NewHealthOverviewCommand(fs)) + command.AddCommand( + cluster.NewHealthOverviewCommand(fs), + config.NewConfigCommand(fs), + license.NewLicenseCommand(fs), + maintenance.NewMaintenanceCommand(fs), + offsets, + ) return command } diff --git a/src/go/rpk/pkg/cli/cmd/cluster/config/config.go b/src/go/rpk/pkg/cli/cmd/cluster/config/config.go index 938932c17223a..517d15e9f23a5 100644 --- a/src/go/rpk/pkg/cli/cmd/cluster/config/config.go +++ b/src/go/rpk/pkg/cli/cmd/cluster/config/config.go @@ -29,7 +29,7 @@ func NewConfigCommand(fs afero.Fs) *cobra.Command { command := &cobra.Command{ Use: "config", Args: cobra.ExactArgs(0), - Short: "Interact with cluster configuration properties.", + Short: "Interact with cluster configuration properties", Long: `Interact with cluster configuration properties. Cluster properties are redpanda settings which apply to all nodes in @@ -68,7 +68,7 @@ different redpanda version that does not recognize certain properties.`, &all, "all", false, - "Include all properties, including tunables.", + "Include all properties, including tunables", ) command.AddCommand( diff --git a/src/go/rpk/pkg/cli/cmd/cluster/config/edit.go b/src/go/rpk/pkg/cli/cmd/cluster/config/edit.go index 3a99001f122a2..f5c55004a0b76 100644 --- a/src/go/rpk/pkg/cli/cmd/cluster/config/edit.go +++ b/src/go/rpk/pkg/cli/cmd/cluster/config/edit.go @@ -26,7 +26,7 @@ import ( func newEditCommand(fs afero.Fs, all *bool) *cobra.Command { cmd := &cobra.Command{ Use: "edit", - Short: "Edit cluster configuration properties.", + Short: "Edit cluster configuration properties", Long: `Edit cluster-wide configuration properties. 
This command opens a text editor to modify the cluster's configuration. diff --git a/src/go/rpk/pkg/cli/cmd/cluster/config/export.go b/src/go/rpk/pkg/cli/cmd/cluster/config/export.go index 4e9fb008cbe0c..8653c05d7259e 100644 --- a/src/go/rpk/pkg/cli/cmd/cluster/config/export.go +++ b/src/go/rpk/pkg/cli/cmd/cluster/config/export.go @@ -135,7 +135,7 @@ func newExportCommand(fs afero.Fs, all *bool) *cobra.Command { cmd := &cobra.Command{ Use: "export", - Short: "Export cluster configuration.", + Short: "Export cluster configuration", Long: `Export cluster configuration. Writes out a YAML representation of the cluster configuration to a file, @@ -166,11 +166,12 @@ to include all properties including these low level tunables. var file *os.File if filename == "" { file, err = ioutil.TempFile("/tmp", "config_*.yaml") + filename = "/tmp/config_*.yaml" } else { file, err = os.Create(filename) } - out.MaybeDie(err, "unable to create file %q: %v", file.Name(), err) + out.MaybeDie(err, "unable to create file %q: %v", filename, err) err = exportConfig(file, schema, currentConfig, *all) out.MaybeDie(err, "failed to write out config %q: %v", file.Name(), err) err = file.Close() diff --git a/src/go/rpk/pkg/cli/cmd/cluster/config/import.go b/src/go/rpk/pkg/cli/cmd/cluster/config/import.go index 78fcc62ed266a..eff288565068c 100644 --- a/src/go/rpk/pkg/cli/cmd/cluster/config/import.go +++ b/src/go/rpk/pkg/cli/cmd/cluster/config/import.go @@ -216,7 +216,7 @@ func newImportCommand(fs afero.Fs, all *bool) *cobra.Command { var filename string cmd := &cobra.Command{ Use: "import", - Short: "Import cluster configuration from a file.", + Short: "Import cluster configuration from a file", Long: `Import cluster configuration from a file. 
Import configuration from a YAML file, usually generated with diff --git a/src/go/rpk/pkg/cli/cmd/cluster/config/lint.go b/src/go/rpk/pkg/cli/cmd/cluster/config/lint.go index 6f1d59ebd1442..c3b9e6bb5d862 100644 --- a/src/go/rpk/pkg/cli/cmd/cluster/config/lint.go +++ b/src/go/rpk/pkg/cli/cmd/cluster/config/lint.go @@ -24,7 +24,7 @@ import ( func newLintCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "lint", - Short: "Remove any deprecated content from redpanda.yaml.", + Short: "Remove any deprecated content from redpanda.yaml", Long: `Remove any deprecated content from redpanda.yaml. Deprecated content includes properties which were set via redpanda.yaml diff --git a/src/go/rpk/pkg/cli/cmd/cluster/config/reset.go b/src/go/rpk/pkg/cli/cmd/cluster/config/reset.go index 53d5b2e833196..18cd1e5054a99 100644 --- a/src/go/rpk/pkg/cli/cmd/cluster/config/reset.go +++ b/src/go/rpk/pkg/cli/cmd/cluster/config/reset.go @@ -23,7 +23,7 @@ func newForceResetCommand(fs afero.Fs) *cobra.Command { var configCacheFile string cmd := &cobra.Command{ Use: "force-reset [PROPERTY...]", - Short: "Forcibly clear a cluster configuration property on this node.", + Short: "Forcibly clear a cluster configuration property on this node", Long: `Forcibly clear a cluster configuration property on this node. This command is not for general changes to cluster configuration: use this only diff --git a/src/go/rpk/pkg/cli/cmd/cluster/health.go b/src/go/rpk/pkg/cli/cmd/cluster/health.go index 03a5d0a13766e..2092151ed88b0 100644 --- a/src/go/rpk/pkg/cli/cmd/cluster/health.go +++ b/src/go/rpk/pkg/cli/cmd/cluster/health.go @@ -35,7 +35,7 @@ func NewHealthOverviewCommand(fs afero.Fs) *cobra.Command { ) cmd := &cobra.Command{ Use: "health", - Short: "Queries cluster for health overview.", + Short: "Queries cluster for health overview", Long: `Queries health overview. 
Health overview is created based on the health reports collected periodically @@ -84,8 +84,8 @@ following conditions are met: &adminCAFile, ) - cmd.Flags().BoolVarP(&watch, "watch", "w", false, "blocks and writes out all cluster health changes") - cmd.Flags().BoolVarP(&exit, "exit-when-healthy", "e", false, "when used with watch, exits after cluster is back in healthy state") + cmd.Flags().BoolVarP(&watch, "watch", "w", false, "Blocks and writes out all cluster health changes") + cmd.Flags().BoolVarP(&exit, "exit-when-healthy", "e", false, "When used with watch, exits after cluster is back in healthy state") return cmd } diff --git a/src/go/rpk/pkg/cli/cmd/cluster/license/info.go b/src/go/rpk/pkg/cli/cmd/cluster/license/info.go new file mode 100644 index 0000000000000..03153aa33f1e7 --- /dev/null +++ b/src/go/rpk/pkg/cli/cmd/cluster/license/info.go @@ -0,0 +1,68 @@ +package license + +import ( + "encoding/json" + "fmt" + + "github.com/redpanda-data/redpanda/src/go/rpk/pkg/api/admin" + "github.com/redpanda-data/redpanda/src/go/rpk/pkg/config" + "github.com/redpanda-data/redpanda/src/go/rpk/pkg/out" + "github.com/spf13/afero" + "github.com/spf13/cobra" +) + +func newInfoCommand(fs afero.Fs) *cobra.Command { + var format string + command := &cobra.Command{ + Use: "info", + Args: cobra.ExactArgs(0), + Short: "Retrieve license information", + Long: `Retrieve license information: + + Organization: Organization the license was generated for. + Type: Type of license: free, enterprise, etc. + Expires: Number of days the license is valid until or -1 if is expired. + Version: License schema version. 
+`, + Run: func(cmd *cobra.Command, args []string) { + p := config.ParamsFromCommand(cmd) + cfg, err := p.Load(fs) + out.MaybeDie(err, "unable to load config: %v", err) + + cl, err := admin.NewClient(fs, cfg) + out.MaybeDie(err, "unable to initialize admin client: %v", err) + + info, err := cl.GetLicenseInfo(cmd.Context()) + out.MaybeDie(err, "unable to retrieve license info: %v", err) + + if !info.Loaded { + out.Die("this cluster is missing a license") + } + + if info.Properties != (admin.LicenseProperties{}) { + if format == "json" { + props, err := json.MarshalIndent(info.Properties, "", " ") + out.MaybeDie(err, "unable to print license information as json: %v", err) + fmt.Printf("%s\n", props) + } else { + printLicenseInfo(info.Properties) + } + } else { + out.Die("no license loaded") + } + }, + } + + command.Flags().StringVar(&format, "format", "text", "Output format (text, json)") + return command +} + +func printLicenseInfo(p admin.LicenseProperties) { + out.Section("LICENSE INFORMATION") + licenseFormat := `Organization: %v +Type: %v +Expires: %v days +Version: %v +` + fmt.Printf(licenseFormat, p.Organization, p.Type, p.Expires, p.Version) +} diff --git a/src/go/rpk/pkg/cli/cmd/cluster/license/license.go b/src/go/rpk/pkg/cli/cmd/cluster/license/license.go new file mode 100644 index 0000000000000..7f5d02cd6196f --- /dev/null +++ b/src/go/rpk/pkg/cli/cmd/cluster/license/license.go @@ -0,0 +1,44 @@ +package license + +import ( + "github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/common" + "github.com/redpanda-data/redpanda/src/go/rpk/pkg/config" + "github.com/spf13/afero" + "github.com/spf13/cobra" +) + +func NewLicenseCommand(fs afero.Fs) *cobra.Command { + var ( + adminURL string + adminEnableTLS bool + adminCertFile string + adminKeyFile string + adminCAFile string + ) + + cmd := &cobra.Command{ + Use: "license", + Args: cobra.ExactArgs(0), + Short: "Manage cluster license.", + } + + common.AddAdminAPITLSFlags(cmd, + &adminEnableTLS, + 
&adminCertFile, + &adminKeyFile, + &adminCAFile, + ) + + cmd.AddCommand( + newInfoCommand(fs), + newSetCommand(fs), + ) + + cmd.PersistentFlags().StringVar( + &adminURL, + config.FlagAdminHosts2, + "", + "Comma-separated list of admin API addresses (:)") + + return cmd +} diff --git a/src/go/rpk/pkg/cli/cmd/cluster/license/set.go b/src/go/rpk/pkg/cli/cmd/cluster/license/set.go new file mode 100644 index 0000000000000..8a8b0ade048cf --- /dev/null +++ b/src/go/rpk/pkg/cli/cmd/cluster/license/set.go @@ -0,0 +1,71 @@ +package license + +import ( + "fmt" + "io" + "strings" + + "github.com/redpanda-data/redpanda/src/go/rpk/pkg/api/admin" + "github.com/redpanda-data/redpanda/src/go/rpk/pkg/config" + "github.com/redpanda-data/redpanda/src/go/rpk/pkg/out" + "github.com/spf13/afero" + "github.com/spf13/cobra" +) + +func newSetCommand(fs afero.Fs) *cobra.Command { + var licPath string + cmd := &cobra.Command{ + Use: "set", + Args: cobra.MaximumNArgs(1), + Short: "Upload license to the cluster", + Long: `Upload license to the cluster + +You can either provide a path to a file containing the license: + + rpk cluster license set --path /home/organization/redpanda.license + +Or inline the license string: + + rpk cluster license set + +If neither are present, rpk will look for the license in the +default location '/etc/redpanda/redpanda.license'. 
+`, + + Run: func(cmd *cobra.Command, args []string) { + if licPath != "" && len(args) > 0 { + out.Die("inline license cannot be passed if flag '--path' is set") + } + if licPath == "" && len(args) == 0 { + fmt.Println("Neither license file nor inline license was provided, checking '/etc/redpanda/redpanda.license'.") + licPath = "/etc/redpanda/redpanda.license" + } + + p := config.ParamsFromCommand(cmd) + cfg, err := p.Load(fs) + out.MaybeDie(err, "unable to load config: %v", err) + + cl, err := admin.NewClient(fs, cfg) + out.MaybeDie(err, "unable to initialize admin client: %v", err) + + var r io.Reader + if licPath != "" { + file, err := fs.Open(licPath) + out.MaybeDie(err, "unable to open %q: %v", licPath, err) + defer file.Close() + r = file + } else { + r = strings.NewReader(args[0]) + } + + err = cl.SetLicense(cmd.Context(), r) + out.MaybeDie(err, "unable to set license: %v", err) + + fmt.Println("Successfully uploaded license.") + }, + } + + cmd.Flags().StringVar(&licPath, "path", "", "Path to the license file") + + return cmd +} diff --git a/src/go/rpk/pkg/cli/cmd/cluster/maintenance/disable.go b/src/go/rpk/pkg/cli/cmd/cluster/maintenance/disable.go index 390d102e844b6..5f1ea43a4ff35 100644 --- a/src/go/rpk/pkg/cli/cmd/cluster/maintenance/disable.go +++ b/src/go/rpk/pkg/cli/cmd/cluster/maintenance/disable.go @@ -24,7 +24,7 @@ import ( func newDisableCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "disable ", - Short: "Disable maintenance mode for a node.", + Short: "Disable maintenance mode for a node", Long: `Disable maintenance mode for a node.`, Args: cobra.ExactArgs(1), Run: func(cmd *cobra.Command, args []string) { diff --git a/src/go/rpk/pkg/cli/cmd/cluster/maintenance/enable.go b/src/go/rpk/pkg/cli/cmd/cluster/maintenance/enable.go index df59c5dc94a50..f94fcaa71e4b9 100644 --- a/src/go/rpk/pkg/cli/cmd/cluster/maintenance/enable.go +++ b/src/go/rpk/pkg/cli/cmd/cluster/maintenance/enable.go @@ -26,7 +26,7 @@ func newEnableCommand(fs 
afero.Fs) *cobra.Command { var wait bool cmd := &cobra.Command{ Use: "enable ", - Short: "Enable maintenance mode for a node.", + Short: "Enable maintenance mode for a node", Long: `Enable maintenance mode for a node. This command enables maintenance mode for the node with the specified ID. If a diff --git a/src/go/rpk/pkg/cli/cmd/cluster/maintenance/maintenance.go b/src/go/rpk/pkg/cli/cmd/cluster/maintenance/maintenance.go index 9f883e3da2d29..cd265d223d150 100644 --- a/src/go/rpk/pkg/cli/cmd/cluster/maintenance/maintenance.go +++ b/src/go/rpk/pkg/cli/cmd/cluster/maintenance/maintenance.go @@ -27,7 +27,7 @@ func NewMaintenanceCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "maintenance", - Short: "Toggle a node's maintenance mode.", + Short: "Toggle a node's maintenance mode", Long: `Interact with cluster maintenance mode. Maintenance mode is a state that a node may be placed into in which the node diff --git a/src/go/rpk/pkg/cli/cmd/cluster/maintenance/status.go b/src/go/rpk/pkg/cli/cmd/cluster/maintenance/status.go index 1135802517a48..5b562e3a0258b 100644 --- a/src/go/rpk/pkg/cli/cmd/cluster/maintenance/status.go +++ b/src/go/rpk/pkg/cli/cmd/cluster/maintenance/status.go @@ -40,7 +40,7 @@ func addBrokerMaintenanceReport(table *out.TabWriter, b admin.Broker) { func newStatusCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "status", - Short: "Report maintenance status.", + Short: "Report maintenance status", Long: `Report maintenance status. This command reports maintenance status for each node in the cluster. 
The output diff --git a/src/go/rpk/pkg/cli/cmd/cluster/metadata.go b/src/go/rpk/pkg/cli/cmd/cluster/metadata.go index 2b8486150c8b9..a77384b2b3a51 100644 --- a/src/go/rpk/pkg/cli/cmd/cluster/metadata.go +++ b/src/go/rpk/pkg/cli/cmd/cluster/metadata.go @@ -36,7 +36,7 @@ func NewMetadataCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "metadata", Aliases: []string{"status", "info"}, - Short: "Request broker metadata.", + Short: "Request broker metadata", Long: `Request broker metadata. The Kafka protocol's metadata contains information about brokers, topics, and @@ -120,11 +120,11 @@ In the broker section, the controller node is suffixed with *. }, } - cmd.Flags().BoolVarP(&cluster, "print-cluster", "c", false, "print cluster section") - cmd.Flags().BoolVarP(&brokers, "print-brokers", "b", false, "print brokers section") - cmd.Flags().BoolVarP(&topics, "print-topics", "t", false, "print topics section (implied if any topics are specified)") - cmd.Flags().BoolVarP(&internal, "print-internal-topics", "i", false, "print internal topics (if all topics requested, implies -t)") - cmd.Flags().BoolVarP(&detailed, "print-detailed-topics", "d", false, "print per-partition information for topics (implies -t)") + cmd.Flags().BoolVarP(&cluster, "print-cluster", "c", false, "Print cluster section") + cmd.Flags().BoolVarP(&brokers, "print-brokers", "b", false, "Print brokers section") + cmd.Flags().BoolVarP(&topics, "print-topics", "t", false, "Print topics section (implied if any topics are specified)") + cmd.Flags().BoolVarP(&internal, "print-internal-topics", "i", false, "Print internal topics (if all topics requested, implies -t)") + cmd.Flags().BoolVarP(&detailed, "print-detailed-topics", "d", false, "Print per-partition information for topics (implies -t)") return cmd } diff --git a/src/go/rpk/pkg/cli/cmd/common/common.go b/src/go/rpk/pkg/cli/cmd/common/common.go index 1d60cf7c78c88..a44701a1e1cee 100644 --- a/src/go/rpk/pkg/cli/cmd/common/common.go +++ 
b/src/go/rpk/pkg/cli/cmd/common/common.go @@ -39,7 +39,7 @@ func AddKafkaFlags( "Comma-separated list of broker ip:port pairs (e.g."+ " --brokers '192.168.78.34:9092,192.168.78.35:9092,192.179.23.54:9092' )."+ " Alternatively, you may set the REDPANDA_BROKERS environment"+ - " variable with the comma-separated list of broker addresses.", + " variable with the comma-separated list of broker addresses", ) command.PersistentFlags().StringVar( configFile, @@ -52,19 +52,19 @@ func AddKafkaFlags( user, "user", "", - "SASL user to be used for authentication.", + "SASL user to be used for authentication", ) command.PersistentFlags().StringVar( password, "password", "", - "SASL password to be used for authentication.", + "SASL password to be used for authentication", ) command.PersistentFlags().StringVar( saslMechanism, config.FlagSASLMechanism, "", - "The authentication mechanism to use. Supported values: SCRAM-SHA-256, SCRAM-SHA-512.", + "The authentication mechanism to use. Supported values: SCRAM-SHA-256, SCRAM-SHA-512", ) AddTLSFlags(command, enableTLS, certFile, keyFile, truststoreFile) @@ -81,25 +81,25 @@ func AddTLSFlags( enableTLS, config.FlagEnableTLS, false, - "Enable TLS for the Kafka API (not necessary if specifying custom certs).", + "Enable TLS for the Kafka API (not necessary if specifying custom certs)", ) command.PersistentFlags().StringVar( certFile, config.FlagTLSCert, "", - "The certificate to be used for TLS authentication with the broker.", + "The certificate to be used for TLS authentication with the broker", ) command.PersistentFlags().StringVar( keyFile, config.FlagTLSKey, "", - "The certificate key to be used for TLS authentication with the broker.", + "The certificate key to be used for TLS authentication with the broker", ) command.PersistentFlags().StringVar( truststoreFile, config.FlagTLSCA, "", - "The truststore to be used for TLS communication with the broker.", + "The truststore to be used for TLS communication with the broker", ) return 
command @@ -114,25 +114,25 @@ func AddAdminAPITLSFlags( enableTLS, config.FlagEnableAdminTLS, false, - "Enable TLS for the Admin API (not necessary if specifying custom certs).", + "Enable TLS for the Admin API (not necessary if specifying custom certs)", ) command.PersistentFlags().StringVar( certFile, config.FlagAdminTLSCert, "", - "The certificate to be used for TLS authentication with the Admin API.", + "The certificate to be used for TLS authentication with the Admin API", ) command.PersistentFlags().StringVar( keyFile, config.FlagAdminTLSKey, "", - "The certificate key to be used for TLS authentication with the Admin API.", + "The certificate key to be used for TLS authentication with the Admin API", ) command.PersistentFlags().StringVar( truststoreFile, config.FlagAdminTLSCA, "", - "The truststore to be used for TLS communication with the Admin API.", + "The truststore to be used for TLS communication with the Admin API", ) return command diff --git a/src/go/rpk/pkg/cli/cmd/container.go b/src/go/rpk/pkg/cli/cmd/container.go index 6229d38d0ad0d..2301d8a59338f 100644 --- a/src/go/rpk/pkg/cli/cmd/container.go +++ b/src/go/rpk/pkg/cli/cmd/container.go @@ -17,7 +17,7 @@ import ( func NewContainerCommand() *cobra.Command { command := &cobra.Command{ Use: "container", - Short: "Manage a local container cluster.", + Short: "Manage a local container cluster", } command.AddCommand(container.Start()) diff --git a/src/go/rpk/pkg/cli/cmd/container/purge.go b/src/go/rpk/pkg/cli/cmd/container/purge.go index 8447e8f7574d2..dfe4252a5633d 100644 --- a/src/go/rpk/pkg/cli/cmd/container/purge.go +++ b/src/go/rpk/pkg/cli/cmd/container/purge.go @@ -22,7 +22,7 @@ import ( func Purge() *cobra.Command { command := &cobra.Command{ Use: "purge", - Short: "Stop and remove an existing local container cluster's data.", + Short: "Stop and remove an existing local container cluster's data", RunE: func(_ *cobra.Command, _ []string) error { c, err := common.NewDockerClient() if err != nil { 
diff --git a/src/go/rpk/pkg/cli/cmd/container/start.go b/src/go/rpk/pkg/cli/cmd/container/start.go index 5fab06e45473f..f69ec1a5d9f58 100644 --- a/src/go/rpk/pkg/cli/cmd/container/start.go +++ b/src/go/rpk/pkg/cli/cmd/container/start.go @@ -58,7 +58,7 @@ func Start() *cobra.Command { ) command := &cobra.Command{ Use: "start", - Short: "Start a local container cluster.", + Short: "Start a local container cluster", FParseErrWhitelist: cobra.FParseErrWhitelist{ // Allow unknown flags so that arbitrary flags can be passed // through to the containers without the need to pass '--' diff --git a/src/go/rpk/pkg/cli/cmd/container/stop.go b/src/go/rpk/pkg/cli/cmd/container/stop.go index 260f386a2fffb..4f250b4ec6791 100644 --- a/src/go/rpk/pkg/cli/cmd/container/stop.go +++ b/src/go/rpk/pkg/cli/cmd/container/stop.go @@ -22,7 +22,7 @@ import ( func Stop() *cobra.Command { command := &cobra.Command{ Use: "stop", - Short: "Stop an existing local container cluster.", + Short: "Stop an existing local container cluster", RunE: func(_ *cobra.Command, _ []string) error { c, err := common.NewDockerClient() if err != nil { diff --git a/src/go/rpk/pkg/cli/cmd/debug/bundle.go b/src/go/rpk/pkg/cli/cmd/debug/bundle.go index 40ea7f333f690..3515dc95d1dc2 100644 --- a/src/go/rpk/pkg/cli/cmd/debug/bundle.go +++ b/src/go/rpk/pkg/cli/cmd/debug/bundle.go @@ -53,7 +53,7 @@ func newBundleCommand(fs afero.Fs) *cobra.Command { ) command := &cobra.Command{ Use: "bundle", - Short: "Collect environment data and create a bundle file for the Redpanda Data support team to inspect.", + Short: "Collect environment data and create a bundle file for the Redpanda Data support team to inspect", Long: bundleHelpText, Run: func(cmd *cobra.Command, args []string) { p := config.ParamsFromCommand(cmd) @@ -78,7 +78,7 @@ func newBundleCommand(fs afero.Fs) *cobra.Command { &adminURL, "admin-url", "", - "The address to the broker's admin API. 
Defaults to the one in the config file.", + "The address to the broker's admin API. Defaults to the one in the config file", ) command.Flags().DurationVar( &timeout, @@ -102,7 +102,7 @@ func newBundleCommand(fs afero.Fs) *cobra.Command { &logsSizeLimit, "logs-size-limit", "100MiB", - "Read the logs until the given size is reached. Multipliers are also supported, e.g. 3MB, 1GiB.", + "Read the logs until the given size is reached. Multipliers are also supported, e.g. 3MB, 1GiB", ) common.AddKafkaFlags( diff --git a/src/go/rpk/pkg/cli/cmd/debug/debug.go b/src/go/rpk/pkg/cli/cmd/debug/debug.go index 067f2c8a672a8..b8e965f53f556 100644 --- a/src/go/rpk/pkg/cli/cmd/debug/debug.go +++ b/src/go/rpk/pkg/cli/cmd/debug/debug.go @@ -17,7 +17,7 @@ import ( func NewCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "debug", - Short: "Debug the local Redpanda process.", + Short: "Debug the local Redpanda process", } cmd.AddCommand( diff --git a/src/go/rpk/pkg/cli/cmd/debug/info.go b/src/go/rpk/pkg/cli/cmd/debug/info.go index f6ac657dd19f7..96c8fa080f541 100644 --- a/src/go/rpk/pkg/cli/cmd/debug/info.go +++ b/src/go/rpk/pkg/cli/cmd/debug/info.go @@ -34,7 +34,7 @@ func NewInfoCommand(fs afero.Fs) *cobra.Command { ) cmd := &cobra.Command{ Use: "info", - Short: "Send usage stats to Redpanda Data.", + Short: "Send usage stats to Redpanda Data", Hidden: true, Aliases: []string{"status"}, Args: cobra.ExactArgs(0), @@ -95,7 +95,7 @@ func NewInfoCommand(fs afero.Fs) *cobra.Command { }, } cmd.Flags().StringVar(&configFile, "config", "", "Redpanda config file, if not set the file will be searched for in the default locations") - cmd.Flags().BoolVar(&send, "send", false, "If true, send resource usage data to Vectorzed.") - cmd.Flags().DurationVar(&timeout, "timeout", 2*time.Second, "How long to wait to calculate the Redpanda CPU % utilization.") + cmd.Flags().BoolVar(&send, "send", false, "If true, send resource usage data to Redpanda") + cmd.Flags().DurationVar(&timeout, 
"timeout", 2*time.Second, "How long to wait to calculate the Redpanda CPU % utilization") return cmd } diff --git a/src/go/rpk/pkg/cli/cmd/generate.go b/src/go/rpk/pkg/cli/cmd/generate.go index dde60c4a9ce66..e52ac000372d4 100644 --- a/src/go/rpk/pkg/cli/cmd/generate.go +++ b/src/go/rpk/pkg/cli/cmd/generate.go @@ -18,7 +18,7 @@ import ( func NewGenerateCommand(fs afero.Fs) *cobra.Command { command := &cobra.Command{ Use: "generate [template]", - Short: "Generate a configuration template for related services.", + Short: "Generate a configuration template for related services", } command.AddCommand(generate.NewGrafanaDashboardCmd()) command.AddCommand(generate.NewPrometheusConfigCmd(fs)) diff --git a/src/go/rpk/pkg/cli/cmd/generate/autocomplete.go b/src/go/rpk/pkg/cli/cmd/generate/autocomplete.go index d664e0482918a..b0f7139475dd7 100644 --- a/src/go/rpk/pkg/cli/cmd/generate/autocomplete.go +++ b/src/go/rpk/pkg/cli/cmd/generate/autocomplete.go @@ -19,7 +19,7 @@ import ( func NewShellCompletionCommand() *cobra.Command { return &cobra.Command{ Use: "shell-completion", - Short: "Generate shell completion commands.", + Short: "Generate shell completion commands", Long: ` Shell completion can help autocomplete rpk commands when you press tab. 
diff --git a/src/go/rpk/pkg/cli/cmd/generate/grafana.go b/src/go/rpk/pkg/cli/cmd/generate/grafana.go index 99f3af6987a97..8bfc2daebb3ec 100644 --- a/src/go/rpk/pkg/cli/cmd/generate/grafana.go +++ b/src/go/rpk/pkg/cli/cmd/generate/grafana.go @@ -65,7 +65,7 @@ func NewGrafanaDashboardCmd() *cobra.Command { var metricsEndpoint string command := &cobra.Command{ Use: "grafana-dashboard", - Short: "Generate a Grafana dashboard for redpanda metrics.", + Short: "Generate a Grafana dashboard for redpanda metrics", RunE: func(ccmd *cobra.Command, args []string) error { if !(strings.HasPrefix(metricsEndpoint, "http://") || strings.HasPrefix(metricsEndpoint, "https://")) { diff --git a/src/go/rpk/pkg/cli/cmd/generate/prometheus.go b/src/go/rpk/pkg/cli/cmd/generate/prometheus.go index 8d78a55234039..045deb5a08db4 100644 --- a/src/go/rpk/pkg/cli/cmd/generate/prometheus.go +++ b/src/go/rpk/pkg/cli/cmd/generate/prometheus.go @@ -46,7 +46,7 @@ func NewPrometheusConfigCmd(fs afero.Fs) *cobra.Command { ) command := &cobra.Command{ Use: "prometheus-config", - Short: "Generate the Prometheus configuration to scrape redpanda nodes.", + Short: "Generate the Prometheus configuration to scrape redpanda nodes", Long: ` Generate the Prometheus configuration to scrape redpanda nodes. This command's output should be added to the 'scrape_configs' array in your Prometheus diff --git a/src/go/rpk/pkg/cli/cmd/group/describe.go b/src/go/rpk/pkg/cli/cmd/group/describe.go index 7b295f63262cf..bbca2fa14bcf5 100644 --- a/src/go/rpk/pkg/cli/cmd/group/describe.go +++ b/src/go/rpk/pkg/cli/cmd/group/describe.go @@ -28,7 +28,7 @@ func NewDescribeCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "describe [GROUPS...]", - Short: "Describe group offset status & lag.", + Short: "Describe group offset status & lag", Long: `Describe group offset status & lag. 
This command describes group members, calculates their lag, and prints detailed diff --git a/src/go/rpk/pkg/cli/cmd/group/group.go b/src/go/rpk/pkg/cli/cmd/group/group.go index 4f51ab9e2bdb1..c411375222f84 100644 --- a/src/go/rpk/pkg/cli/cmd/group/group.go +++ b/src/go/rpk/pkg/cli/cmd/group/group.go @@ -25,7 +25,7 @@ func NewCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "group", Aliases: []string{"g"}, - Short: `Describe, list, and delete consumer groups and manage their offsets.`, + Short: `Describe, list, and delete consumer groups and manage their offsets`, Long: `Describe, list, and delete consumer groups and manage their offsets. Consumer groups allow you to horizontally scale consuming from topics. A @@ -108,7 +108,7 @@ func newListCommand(fs afero.Fs) *cobra.Command { return &cobra.Command{ Use: "list", Aliases: []string{"ls"}, - Short: "List all groups.", + Short: "List all groups", Long: `List all groups. This command lists all groups currently known to Redpanda, including empty @@ -144,7 +144,7 @@ groups, or to list groups that need to be cleaned up. func newDeleteCommand(fs afero.Fs) *cobra.Command { return &cobra.Command{ Use: "delete [GROUPS...]", - Short: "Delete groups from brokers.", + Short: "Delete groups from brokers", Long: `Delete groups from brokers. Older versions of the Kafka protocol included a retention_millis field in diff --git a/src/go/rpk/pkg/cli/cmd/group/seek.go b/src/go/rpk/pkg/cli/cmd/group/seek.go index 8118b994a158e..4caa965013bb4 100644 --- a/src/go/rpk/pkg/cli/cmd/group/seek.go +++ b/src/go/rpk/pkg/cli/cmd/group/seek.go @@ -35,7 +35,7 @@ func newSeekCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "seek [GROUP] --to (start|end|timestamp) --to-group ... --topics ...", - Short: "Modify a group's current offsets.", + Short: "Modify a group's current offsets", Long: `Modify a group's current offsets. This command allows you to modify a group's offsets. 
Sometimes, you may need to diff --git a/src/go/rpk/pkg/cli/cmd/iotune.go b/src/go/rpk/pkg/cli/cmd/iotune.go index edc33f7712028..c6405e05e1012 100644 --- a/src/go/rpk/pkg/cli/cmd/iotune.go +++ b/src/go/rpk/pkg/cli/cmd/iotune.go @@ -33,7 +33,7 @@ func NewIoTuneCmd(fs afero.Fs) *cobra.Command { ) command := &cobra.Command{ Use: "iotune", - Short: "Measure filesystem performance and create IO configuration file.", + Short: "Measure filesystem performance and create IO configuration file", Run: func(cmd *cobra.Command, args []string) { timeout += duration p := config.ParamsFromCommand(cmd) @@ -57,7 +57,7 @@ func NewIoTuneCmd(fs afero.Fs) *cobra.Command { "config", "", "Redpanda config file, if not set the file will be searched for"+ - " in the default locations.", + " in the default locations", ) command.Flags().StringVar( &outputFile, diff --git a/src/go/rpk/pkg/cli/cmd/plugin/plugin.go b/src/go/rpk/pkg/cli/cmd/plugin/plugin.go index 634f5175e4a5d..daac9d785ec03 100644 --- a/src/go/rpk/pkg/cli/cmd/plugin/plugin.go +++ b/src/go/rpk/pkg/cli/cmd/plugin/plugin.go @@ -17,7 +17,7 @@ const urlBase = "https://vectorized-public.s3.us-west-2.amazonaws.com/rpk-plugin func NewCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "plugin", - Short: "List, download, update, and remove rpk plugins.", + Short: "List, download, update, and remove rpk plugins", Long: `List, download, update, and remove rpk plugins. Plugins augment rpk with new commands. @@ -76,7 +76,7 @@ func newListCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "list", - Short: "List all available plugins.", + Short: "List all available plugins", Long: `List all available plugins. 
By default, this command fetches the remote manifest and prints plugins @@ -156,7 +156,7 @@ func newInstallCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "install [PLUGIN]", Aliases: []string{"download"}, - Short: "Install an rpk plugin.", + Short: "Install an rpk plugin", Long: `Install an rpk plugin. An rpk plugin must be saved in a directory that is in your $PATH. By default, @@ -246,7 +246,7 @@ func newUninstallCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "uninstall [NAME]", Aliases: []string{"rm"}, - Short: "Uninstall / remove an existing local plugin.", + Short: "Uninstall / remove an existing local plugin", Long: `Uninstall / remove an existing local plugin. This command lists locally installed plugins and removes the first plugin that diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/admin/admin.go b/src/go/rpk/pkg/cli/cmd/redpanda/admin/admin.go index da326b218c4b5..de87c05e1e8b3 100644 --- a/src/go/rpk/pkg/cli/cmd/redpanda/admin/admin.go +++ b/src/go/rpk/pkg/cli/cmd/redpanda/admin/admin.go @@ -24,7 +24,7 @@ import ( func NewCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "admin", - Short: "Talk to the Redpanda admin listener.", + Short: "Talk to the Redpanda admin listener", Args: cobra.ExactArgs(0), } @@ -50,7 +50,7 @@ func NewCommand(fs afero.Fs) *cobra.Command { config.FlagAdminHosts1, []string{}, "A comma-separated list of Admin API addresses (:)."+ - " You must specify one for each node.", + " You must specify one for each node", ) common.AddAdminAPITLSFlags( diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/admin/brokers/brokers.go b/src/go/rpk/pkg/cli/cmd/redpanda/admin/brokers/brokers.go index bd08a9d4e17c6..fa09963ee7fae 100644 --- a/src/go/rpk/pkg/cli/cmd/redpanda/admin/brokers/brokers.go +++ b/src/go/rpk/pkg/cli/cmd/redpanda/admin/brokers/brokers.go @@ -26,7 +26,7 @@ import ( func NewCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "brokers", - Short: "View and configure Redpanda 
brokers through the admin listener.", + Short: "View and configure Redpanda brokers through the admin listener", Args: cobra.ExactArgs(0), } cmd.AddCommand( @@ -41,7 +41,7 @@ func newListCommand(fs afero.Fs) *cobra.Command { return &cobra.Command{ Use: "list", Aliases: []string{"ls"}, - Short: "List the brokers in your cluster.", + Short: "List the brokers in your cluster", Args: cobra.ExactArgs(0), Run: func(cmd *cobra.Command, _ []string) { p := config.ParamsFromCommand(cmd) @@ -82,7 +82,7 @@ func newListCommand(fs afero.Fs) *cobra.Command { func newDecommissionBroker(fs afero.Fs) *cobra.Command { return &cobra.Command{ Use: "decommission [BROKER ID]", - Short: "Decommission the given broker.", + Short: "Decommission the given broker", Long: `Decommission the given broker. Decommissioning a broker removes it from the cluster. @@ -116,7 +116,7 @@ leader handles the request. func newRecommissionBroker(fs afero.Fs) *cobra.Command { return &cobra.Command{ Use: "recommission [BROKER ID]", - Short: "Recommission the given broker if it is still decommissioning.", + Short: "Recommission the given broker if it is still decommissioning", Long: `Recommission the given broker if is is still decommissioning. Recommissioning can stop an active decommission. 
diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/admin/config/config.go b/src/go/rpk/pkg/cli/cmd/redpanda/admin/config/config.go index fdcd40f81b55e..a73cf6839480b 100644 --- a/src/go/rpk/pkg/cli/cmd/redpanda/admin/config/config.go +++ b/src/go/rpk/pkg/cli/cmd/redpanda/admin/config/config.go @@ -20,7 +20,7 @@ import ( func NewCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "config", - Short: "View or modify Redpanda configuration through the admin listener.", + Short: "View or modify Redpanda configuration through the admin listener", Args: cobra.ExactArgs(0), } cmd.AddCommand( @@ -35,7 +35,7 @@ func newPrintCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "print", Aliases: []string{"dump", "list", "ls", "display"}, - Short: "Display the current Redpanda configuration.", + Short: "Display the current Redpanda configuration", Args: cobra.ExactArgs(0), Run: func(cmd *cobra.Command, _ []string) { p := config.ParamsFromCommand(cmd) @@ -65,7 +65,7 @@ func newPrintCommand(fs afero.Fs) *cobra.Command { func newLogLevelCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "log-level", - Short: "Manage a broker's log level.", + Short: "Manage a broker's log level", Args: cobra.ExactArgs(0), } cmd.AddCommand( @@ -81,7 +81,7 @@ func newLogLevelSetCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "set [LOGGERS...]", - Short: "Set broker logger's log level.", + Short: "Set broker logger's log level", Long: `Set broker logger's log level. This command temporarily changes a broker logger's log level. 
Each Redpanda diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/admin/partitions/partitions.go b/src/go/rpk/pkg/cli/cmd/redpanda/admin/partitions/partitions.go index 6254e621f90a5..6bee4efa0f7cb 100644 --- a/src/go/rpk/pkg/cli/cmd/redpanda/admin/partitions/partitions.go +++ b/src/go/rpk/pkg/cli/cmd/redpanda/admin/partitions/partitions.go @@ -27,7 +27,7 @@ import ( func NewCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "partitions", - Short: "View and configure Redpanda partitions through the admin listener.", + Short: "View and configure Redpanda partitions through the admin listener", Args: cobra.ExactArgs(0), } cmd.AddCommand( @@ -41,7 +41,7 @@ func newListCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "list [BROKER ID]", Aliases: []string{"ls"}, - Short: "List the partitions in a broker in the cluster.", + Short: "List the partitions in a broker in the cluster", Args: cobra.ExactArgs(1), Run: func(cmd *cobra.Command, args []string) { brokerID, err := strconv.Atoi(args[0]) diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/check.go b/src/go/rpk/pkg/cli/cmd/redpanda/check.go index 1b69922123130..36b7466d20e7d 100644 --- a/src/go/rpk/pkg/cli/cmd/redpanda/check.go +++ b/src/go/rpk/pkg/cli/cmd/redpanda/check.go @@ -34,7 +34,7 @@ func NewCheckCommand(fs afero.Fs) *cobra.Command { ) command := &cobra.Command{ Use: "check", - Short: "Check if system meets redpanda requirements.", + Short: "Check if system meets redpanda requirements", Run: func(cmd *cobra.Command, args []string) { p := config.ParamsFromCommand(cmd) cfg, err := p.Load(fs) diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/config.go b/src/go/rpk/pkg/cli/cmd/redpanda/config.go index 8412479faae90..c5be52368df2b 100644 --- a/src/go/rpk/pkg/cli/cmd/redpanda/config.go +++ b/src/go/rpk/pkg/cli/cmd/redpanda/config.go @@ -33,7 +33,7 @@ const ( func NewConfigCommand(fs afero.Fs) *cobra.Command { root := &cobra.Command{ Use: "config ", - Short: "Edit configuration.", + Short: "Edit 
configuration", } root.AddCommand(set(fs)) root.AddCommand(bootstrap(fs)) @@ -67,6 +67,7 @@ partial json/yaml config objects: p := config.ParamsFromCommand(cmd) cfg, err := p.Load(fs) out.MaybeDie(err, "unable to load config: %v", err) + cfg = cfg.FileOrDefaults() // we set fields in the raw file without writing env / flag overrides if format == "single" { fmt.Println("'--format single' is deprecated, either remove it or use yaml/json") @@ -104,6 +105,7 @@ func bootstrap(fs afero.Fs) *cobra.Command { p := config.ParamsFromCommand(cmd) cfg, err := p.Load(fs) out.MaybeDie(err, "unable to load config: %v", err) + cfg = cfg.FileOrDefaults() // we modify fields in the raw file without writing env / flag overrides seeds, err := parseSeedIPs(ips) out.MaybeDieErr(err) @@ -167,6 +169,7 @@ func initNode(fs afero.Fs) *cobra.Command { p := config.ParamsFromCommand(cmd) cfg, err := p.Load(fs) out.MaybeDie(err, "unable to load config: %v", err) + cfg = cfg.FileOrDefaults() // we modify fields in the raw file without writing env / flag overrides // Don't reset the node's UUID if it has already been set. if cfg.NodeUUID == "" { diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/config_test.go b/src/go/rpk/pkg/cli/cmd/redpanda/config_test.go index 7891c9b3dfed4..7f9d716684aea 100644 --- a/src/go/rpk/pkg/cli/cmd/redpanda/config_test.go +++ b/src/go/rpk/pkg/cli/cmd/redpanda/config_test.go @@ -184,3 +184,151 @@ func TestInitNode(t *testing.T) { }) } } + +// This is a top level command test, individual cases for set are +// tested in 'rpk/pkg/config/config_test.go'. 
+func TestSetCommand(t *testing.T) { + for _, test := range []struct { + name string + cfgFile string + exp string + args []string + }{ + { + name: "set without config file on disk", + exp: `config_file: /etc/redpanda/redpanda.yaml +redpanda: + data_directory: /var/lib/redpanda/data + node_id: 0 + rack: redpanda-rack + seed_servers: [] + rpc_server: + address: 0.0.0.0 + port: 33145 + kafka_api: + - address: 0.0.0.0 + port: 9092 + admin: + - address: 0.0.0.0 + port: 9644 + developer_mode: true +rpk: + enable_usage_stats: false + tune_network: false + tune_disk_scheduler: false + tune_disk_nomerges: false + tune_disk_write_cache: false + tune_disk_irq: false + tune_fstrim: false + tune_cpu: false + tune_aio_events: false + tune_clocksource: false + tune_swappiness: false + tune_transparent_hugepages: false + enable_memory_locking: false + tune_coredump: false + coredump_dir: /var/lib/redpanda/coredump + tune_ballast_file: false + overprovisioned: false +pandaproxy: {} +schema_registry: {} +`, + args: []string{"redpanda.rack", "redpanda-rack"}, + }, + { + name: "set with loaded config", + cfgFile: `config_file: /etc/redpanda/redpanda.yaml +redpanda: + data_directory: "" + node_id: 0 + rack: redpanda-rack + seed_servers: [] + rpc_server: + address: 0.0.0.0 + port: 33145 + kafka_api: + - address: 0.0.0.0 + port: 9092 + admin: + - address: 0.0.0.0 + port: 9644 + developer_mode: true +rpk: + enable_usage_stats: false + tune_network: false + tune_disk_scheduler: false + tune_disk_nomerges: false + tune_disk_write_cache: false + tune_disk_irq: false + tune_fstrim: false + tune_cpu: false + tune_aio_events: false + tune_clocksource: false + tune_swappiness: false + tune_transparent_hugepages: false + enable_memory_locking: false + tune_coredump: false + tune_ballast_file: false + overprovisioned: false +`, + exp: `config_file: /etc/redpanda/redpanda.yaml +redpanda: + node_id: 0 + rack: redpanda-rack + seed_servers: [] + rpc_server: + address: 0.0.0.0 + port: 33145 + 
kafka_api: + - address: 0.0.0.0 + port: 9092 + admin: + - address: 0.0.0.0 + port: 9644 + developer_mode: true +rpk: + enable_usage_stats: true + tune_network: false + tune_disk_scheduler: false + tune_disk_nomerges: false + tune_disk_write_cache: false + tune_disk_irq: false + tune_fstrim: false + tune_cpu: false + tune_aio_events: false + tune_clocksource: false + tune_swappiness: false + tune_transparent_hugepages: false + enable_memory_locking: false + tune_coredump: false + tune_ballast_file: false + overprovisioned: false +`, + args: []string{"rpk.enable_usage_stats", "true"}, + }, + } { + fs := afero.NewMemMapFs() + + // We create a config file in default redpanda location + if test.cfgFile != "" { + err := afero.WriteFile(fs, "/etc/redpanda/redpanda.yaml", []byte(test.cfgFile), 0o644) + if err != nil { + t.Errorf("unexpected failure writing passed config file: %v", err) + } + } + + c := set(fs) + c.SetArgs(test.args) + err := c.Execute() + if err != nil { + t.Errorf("error during command execution: %v", err) + } + + // Read back from that default location and compare. 
+ file, err := afero.ReadFile(fs, "/etc/redpanda/redpanda.yaml") + if err != nil { + t.Errorf("unexpected failure reading config file: %v", err) + } + require.Equal(t, test.exp, string(file)) + } +} diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/mode.go b/src/go/rpk/pkg/cli/cmd/redpanda/mode.go index 8019b1fcc6095..a487737ecfd1f 100644 --- a/src/go/rpk/pkg/cli/cmd/redpanda/mode.go +++ b/src/go/rpk/pkg/cli/cmd/redpanda/mode.go @@ -26,7 +26,7 @@ func NewModeCommand(fs afero.Fs) *cobra.Command { var configFile string command := &cobra.Command{ Use: "mode ", - Short: "Enable a default configuration mode.", + Short: "Enable a default configuration mode", Long: "", Args: func(_ *cobra.Command, args []string) error { if len(args) < 1 { @@ -56,6 +56,7 @@ func executeMode(fs afero.Fs, cmd *cobra.Command, mode string) error { if err != nil { return fmt.Errorf("unable to load config: %v", err) } + cfg = cfg.FileOrDefaults() // we modify fields in the raw file without writing env / flag overrides cfg, err = config.SetMode(mode, cfg) if err != nil { return err diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/mode_test.go b/src/go/rpk/pkg/cli/cmd/redpanda/mode_test.go index 5d556bffc1329..85eec5c677384 100644 --- a/src/go/rpk/pkg/cli/cmd/redpanda/mode_test.go +++ b/src/go/rpk/pkg/cli/cmd/redpanda/mode_test.go @@ -42,10 +42,6 @@ func fillRpkConfig(path, mode string) *config.Config { Overprovisioned: !val, TuneBallastFile: val, } - // Unset defaults that get added after command execution, needed to compare - // expected config with loaded config. 
- conf.Rpk.KafkaAPI = config.RpkKafkaAPI{Brokers: []string{"0.0.0.0:9092"}} - conf.Rpk.AdminAPI = config.RpkAdminAPI{Addresses: []string{"127.0.0.1:9644"}} return conf } diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/start.go b/src/go/rpk/pkg/cli/cmd/redpanda/start.go index fe9f5e779dcd6..29bd6805541ec 100644 --- a/src/go/rpk/pkg/cli/cmd/redpanda/start.go +++ b/src/go/rpk/pkg/cli/cmd/redpanda/start.go @@ -134,7 +134,7 @@ func NewStartCommand(fs afero.Fs, launcher rp.Launcher) *cobra.Command { command := &cobra.Command{ Use: "start", - Short: "Start redpanda.", + Short: "Start redpanda", FParseErrWhitelist: cobra.FParseErrWhitelist{ // Allow unknown flags so that arbitrary flags can be passed // through to redpanda/seastar without the need to pass '--' @@ -152,6 +152,13 @@ func NewStartCommand(fs afero.Fs, launcher rp.Launcher) *cobra.Command { if err != nil { return fmt.Errorf("unable to load config file: %s", err) } + // We set fields in the raw file without writing rpk specific env + // or flag overrides. This command itself has all redpanda specific + // flags installed, and handles redpanda specific env vars itself. + // The magic `--set` flag is what modifies any redpanda.yaml fields. + // Thus, we can ignore any env / flags that would come from rpk + // configuration itself. 
+ cfg = cfg.FileOrDefaults() if len(configKvs) > 0 { if err = setConfig(cfg, configKvs); err != nil { @@ -335,6 +342,11 @@ func NewStartCommand(fs afero.Fs, launcher rp.Launcher) *cobra.Command { sendEnv(fs, env, cfg, !prestartCfg.checkEnabled, err) return err } + + if cfg.Redpanda.Directory == "" { + cfg.Redpanda.Directory = config.Default().Redpanda.Directory + } + checkPayloads, tunerPayloads, err := prestart( fs, rpArgs, diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/start_test.go b/src/go/rpk/pkg/cli/cmd/redpanda/start_test.go index ac7a3b536760b..cfcfde0a3556d 100644 --- a/src/go/rpk/pkg/cli/cmd/redpanda/start_test.go +++ b/src/go/rpk/pkg/cli/cmd/redpanda/start_test.go @@ -14,9 +14,7 @@ package redpanda import ( "bytes" - "net" "os" - "strconv" "testing" "github.com/redpanda-data/redpanda/src/go/rpk/pkg/config" @@ -194,10 +192,6 @@ func TestStartCommand(t *testing.T) { path, ) c := config.Default() - // Adding unset default that get added on first load. - b0 := c.Redpanda.KafkaAPI[0] - c.Rpk.KafkaAPI.Brokers = []string{net.JoinHostPort(b0.Address, strconv.Itoa(b0.Port))} - c.Rpk.AdminAPI.Addresses = []string{"127.0.0.1:9644"} conf, err := new(config.Params).Load(fs) require.NoError(st, err) @@ -441,6 +435,27 @@ func TestStartCommand(t *testing.T) { // Check that the generated config is as expected. require.Exactly(st, config.Default().Redpanda.ID, conf.Redpanda.ID) }, + }, { + name: "it should write default data_directory if loaded config doesn't have one", + args: []string{ + "--config", config.Default().ConfigFile, + "--install-dir", "/var/lib/redpanda", + }, + before: func(fs afero.Fs) error { + conf := config.Default() + conf.Redpanda.Directory = "" + return conf.Write(fs) + }, + postCheck: func( + fs afero.Fs, + _ *redpanda.RedpandaArgs, + st *testing.T, + ) { + conf, err := new(config.Params).Load(fs) + require.NoError(st, err) + // Check that the generated config is as expected. 
+ require.Exactly(st, config.Default().Redpanda.Directory, conf.Redpanda.Directory) + }, }, { name: "it should leave redpanda.node_id untouched if --node-id wasn't passed", args: []string{ diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/stop.go b/src/go/rpk/pkg/cli/cmd/redpanda/stop.go index 837b3f6177e28..f82aabdb61d80 100644 --- a/src/go/rpk/pkg/cli/cmd/redpanda/stop.go +++ b/src/go/rpk/pkg/cli/cmd/redpanda/stop.go @@ -34,7 +34,7 @@ func NewStopCommand(fs afero.Fs) *cobra.Command { ) command := &cobra.Command{ Use: "stop", - Short: "Stop redpanda.", + Short: "Stop redpanda", Long: `Stop a local redpanda process. 'rpk stop' first sends SIGINT, and waits for the specified timeout. Then, if redpanda hasn't stopped, it sends SIGTERM. Lastly, it sends SIGKILL if it's still diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/tune/help.go b/src/go/rpk/pkg/cli/cmd/redpanda/tune/help.go index 658e27389e31d..e22a0065edf11 100644 --- a/src/go/rpk/pkg/cli/cmd/redpanda/tune/help.go +++ b/src/go/rpk/pkg/cli/cmd/redpanda/tune/help.go @@ -34,7 +34,7 @@ func newHelpCommand() *cobra.Command { return &cobra.Command{ Use: "help ", - Short: "Display detailed information about the tuner.", + Short: "Display detailed information about the tuner", Args: func(_ *cobra.Command, args []string) error { if len(args) != 1 { return errors.New("requires the tuner name") diff --git a/src/go/rpk/pkg/cli/cmd/root.go b/src/go/rpk/pkg/cli/cmd/root.go index 947ed95fe0598..f630bab2a9885 100644 --- a/src/go/rpk/pkg/cli/cmd/root.go +++ b/src/go/rpk/pkg/cli/cmd/root.go @@ -56,11 +56,11 @@ func Execute() { root := &cobra.Command{ Use: "rpk", - Short: "rpk is the Redpanda CLI & toolbox.", + Short: "rpk is the Redpanda CLI & toolbox", Long: "", } root.PersistentFlags().BoolVarP(&verbose, config.FlagVerbose, - "v", false, "Enable verbose logging (default: false).") + "v", false, "Enable verbose logging (default: false)") root.AddCommand( NewGenerateCommand(fs), @@ -114,6 +114,13 @@ func Execute() { } } + // Cobra 
creates help flag as: help for <command> if you want to override + // that message (capitalize the first letter) then this is the way. + // See: spf13/cobra#480 + walk(root, func(c *cobra.Command) { + c.Flags().BoolP("help", "h", false, "Help for "+c.Name()) + }) + err := root.Execute() if len(os.Args) > 1 { switch os.Args[1] { @@ -423,3 +430,11 @@ func (*osPluginHandler) exec(path string, args []string) error { } return syscall.Exec(path, args, env) } + +// walk calls f for c and all of its children. +func walk(c *cobra.Command, f func(*cobra.Command)) { + f(c) + for _, c := range c.Commands() { + walk(c, f) + } +} diff --git a/src/go/rpk/pkg/cli/cmd/topic.go b/src/go/rpk/pkg/cli/cmd/topic.go index c523988c26abf..0948f382934b0 100644 --- a/src/go/rpk/pkg/cli/cmd/topic.go +++ b/src/go/rpk/pkg/cli/cmd/topic.go @@ -30,7 +30,7 @@ func NewTopicCommand(fs afero.Fs) *cobra.Command { ) command := &cobra.Command{ Use: "topic", - Short: "Create, delete, produce to and consume from Redpanda topics.", + Short: "Create, delete, produce to and consume from Redpanda topics", } common.AddKafkaFlags(command, &configFile, &user, &password, &mechanism, &enableTLS, &certFile, &keyFile, &truststoreFile, &brokers) diff --git a/src/go/rpk/pkg/cli/cmd/topic/add_partitions.go b/src/go/rpk/pkg/cli/cmd/topic/add_partitions.go index d6b208d78aa08..d7b0c20da8ac2 100644 --- a/src/go/rpk/pkg/cli/cmd/topic/add_partitions.go +++ b/src/go/rpk/pkg/cli/cmd/topic/add_partitions.go @@ -27,7 +27,7 @@ func NewAddPartitionsCommand(fs afero.Fs) *cobra.Command { var num int cmd := &cobra.Command{ Use: "add-partitions [TOPICS...]
--num [#]", - Short: "Add partitions to existing topics.", + Short: "Add partitions to existing topics", Args: cobra.MinimumNArgs(1), Long: `Add partitions to existing topics.`, Run: func(cmd *cobra.Command, topics []string) { @@ -70,6 +70,6 @@ func NewAddPartitionsCommand(fs afero.Fs) *cobra.Command { } }, } - cmd.Flags().IntVarP(&num, "num", "n", 0, "numer of partitions to add to each topic") + cmd.Flags().IntVarP(&num, "num", "n", 0, "Number of partitions to add to each topic") return cmd } diff --git a/src/go/rpk/pkg/cli/cmd/topic/config.go b/src/go/rpk/pkg/cli/cmd/topic/config.go index 7f1fdb4c1aa29..51eaa56b1c27d 100644 --- a/src/go/rpk/pkg/cli/cmd/topic/config.go +++ b/src/go/rpk/pkg/cli/cmd/topic/config.go @@ -33,7 +33,7 @@ func NewAlterConfigCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "alter-config [TOPICS...] --set key=value --delete key2,key3", - Short: `Set, delete, add, and remove key/value configs for a topic.`, + Short: `Set, delete, add, and remove key/value configs for a topic`, Long: `Set, delete, add, and remove key/value configs for a topic. This command allows you to incrementally alter the configuration for multiple @@ -132,7 +132,7 @@ valid, but does not apply it. 
cmd.Flags().StringArrayVar(&appends, "append", nil, "key=value; Value to append to a list-of-values key (repeatable)") cmd.Flags().StringArrayVar(&subtracts, "subtract", nil, "key=value; Value to remove from list-of-values key (repeatable)") - cmd.Flags().BoolVar(&dry, "dry", false, "dry run: validate the alter request, but do not apply") + cmd.Flags().BoolVar(&dry, "dry", false, "Dry run: validate the alter request, but do not apply") return cmd } diff --git a/src/go/rpk/pkg/cli/cmd/topic/consume.go b/src/go/rpk/pkg/cli/cmd/topic/consume.go index 13423edcc3c67..71c39ca179c65 100644 --- a/src/go/rpk/pkg/cli/cmd/topic/consume.go +++ b/src/go/rpk/pkg/cli/cmd/topic/consume.go @@ -66,7 +66,7 @@ func NewConsumeCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "consume TOPICS...", - Short: "Consume records from topics.", + Short: "Consume records from topics", Long: helpConsume, Args: cobra.MinimumNArgs(1), Run: func(cmd *cobra.Command, topics []string) { @@ -126,7 +126,7 @@ func NewConsumeCommand(fs afero.Fs) *cobra.Command { cmd.Flags().Int32SliceVarP(&c.partitions, "partitions", "p", nil, "Comma delimited list of specific partitions to consume") cmd.Flags().BoolVarP(&c.regex, "regex", "r", false, "Parse topics as regex; consume any topic that matches any expression") - cmd.Flags().StringVarP(&c.group, "group", "g", "", "group to use for consuming (incompatible with -p)") + cmd.Flags().StringVarP(&c.group, "group", "g", "", "Group to use for consuming (incompatible with -p)") cmd.Flags().StringVarP(&c.balancer, "balancer", "b", "cooperative-sticky", "Group balancer to use if group consuming (range, roundrobin, sticky, cooperative-sticky)") cmd.Flags().Int32Var(&c.fetchMaxBytes, "fetch-max-bytes", 1<<20, "Maximum amount of bytes per fetch request per broker") @@ -140,7 +140,7 @@ func NewConsumeCommand(fs afero.Fs) *cobra.Command { // Deprecated. 
cmd.Flags().BoolVar(new(bool), "commit", false, "") - cmd.Flags().MarkDeprecated("commit", "group consuming always commits") + cmd.Flags().MarkDeprecated("commit", "Group consuming always commits") return cmd } diff --git a/src/go/rpk/pkg/cli/cmd/topic/create.go b/src/go/rpk/pkg/cli/cmd/topic/create.go index 2ff8f4cb652c6..e21e4a09189db 100644 --- a/src/go/rpk/pkg/cli/cmd/topic/create.go +++ b/src/go/rpk/pkg/cli/cmd/topic/create.go @@ -34,7 +34,7 @@ func NewCreateCommand(fs afero.Fs) *cobra.Command { ) cmd := &cobra.Command{ Use: "create [TOPICS...]", - Short: "Create topics.", + Short: "Create topics", Args: cobra.MinimumNArgs(1), Long: `Create topics. @@ -117,11 +117,11 @@ the cleanup.policy=compact config option set. cmd.Flags().StringArrayVarP(&configKVs, "topic-config", "c", nil, "key=value; Config parameters (repeatable; e.g. -c cleanup.policy=compact)") cmd.Flags().Int32VarP(&partitions, "partitions", "p", -1, "Number of partitions to create per topic; -1 defaults to the cluster's default_topic_partitions") cmd.Flags().Int16VarP(&replicas, "replicas", "r", -1, "Replication factor (must be odd); -1 defaults to the cluster's default_topic_replications") - cmd.Flags().BoolVarP(&dry, "dry", "d", false, "dry run: validate the topic creation request; do not create topics") + cmd.Flags().BoolVarP(&dry, "dry", "d", false, "Dry run: validate the topic creation request; do not create topics") // Sept 2021 - cmd.Flags().BoolVar(&compact, "compact", false, "alias for -c cleanup.policy=compact") - cmd.Flags().MarkDeprecated("compact", "use -c cleanup.policy=compact") + cmd.Flags().BoolVar(&compact, "compact", false, "Alias for -c cleanup.policy=compact") + cmd.Flags().MarkDeprecated("compact", "Use -c cleanup.policy=compact") return cmd } diff --git a/src/go/rpk/pkg/cli/cmd/topic/delete.go b/src/go/rpk/pkg/cli/cmd/topic/delete.go index 56290062694fd..08d6d547e4aed 100644 --- a/src/go/rpk/pkg/cli/cmd/topic/delete.go +++ b/src/go/rpk/pkg/cli/cmd/topic/delete.go @@ -23,7 
+23,7 @@ func NewDeleteCommand(fs afero.Fs) *cobra.Command { var re bool cmd := &cobra.Command{ Use: "delete [TOPICS...]", - Short: "Delete topics.", + Short: "Delete topics", Long: `Delete topics. This command deletes all requested topics, printing the success or fail status @@ -75,6 +75,6 @@ For example, } }, } - cmd.Flags().BoolVarP(&re, "regex", "r", false, "parse topics as regex; delete any topic that matches any input topic expression") + cmd.Flags().BoolVarP(&re, "regex", "r", false, "Parse topics as regex; delete any topic that matches any input topic expression") return cmd } diff --git a/src/go/rpk/pkg/cli/cmd/topic/describe.go b/src/go/rpk/pkg/cli/cmd/topic/describe.go index 77bc9062d2c7d..ba2308eea8afb 100644 --- a/src/go/rpk/pkg/cli/cmd/topic/describe.go +++ b/src/go/rpk/pkg/cli/cmd/topic/describe.go @@ -37,7 +37,7 @@ func NewDescribeCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "describe [TOPIC]", Aliases: []string{"info"}, - Short: "Describe a topic.", + Short: "Describe a topic", Long: `Describe a topic. This command prints detailed information about a topic. There are three @@ -179,10 +179,10 @@ partitions section. By default, the summary and configs sections are printed. 
cmd.Flags().MarkDeprecated("watermarks", "deprecated - watermarks are always printed if the partition section is requested") cmd.Flags().MarkDeprecated("detailed", "deprecated - info has been merged into describe, use -p to print detailed information") - cmd.Flags().BoolVarP(&summary, "print-summary", "s", false, "print the summary section") - cmd.Flags().BoolVarP(&configs, "print-configs", "c", false, "print the config section") - cmd.Flags().BoolVarP(&partitions, "print-partitions", "p", false, "print the detailed partitions section") - cmd.Flags().BoolVarP(&all, "print-all", "a", false, "print all sections") + cmd.Flags().BoolVarP(&summary, "print-summary", "s", false, "Print the summary section") + cmd.Flags().BoolVarP(&configs, "print-configs", "c", false, "Print the config section") + cmd.Flags().BoolVarP(&partitions, "print-partitions", "p", false, "Print the detailed partitions section") + cmd.Flags().BoolVarP(&all, "print-all", "a", false, "Print all sections") return cmd } diff --git a/src/go/rpk/pkg/cli/cmd/topic/list.go b/src/go/rpk/pkg/cli/cmd/topic/list.go index 77e1c86727630..07383965160ad 100644 --- a/src/go/rpk/pkg/cli/cmd/topic/list.go +++ b/src/go/rpk/pkg/cli/cmd/topic/list.go @@ -29,7 +29,7 @@ func NewListCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "list", Aliases: []string{"ls"}, - Short: "List topics, optionally listing specific topics.", + Short: "List topics, optionally listing specific topics", Long: `List topics, optionally listing specific topics. This command lists all topics that you have access to by default. If specifying @@ -78,8 +78,8 @@ information. 
}, } - cmd.Flags().BoolVarP(&detailed, "detailed", "d", false, "print per-partition information for topics") - cmd.Flags().BoolVarP(&internal, "internal", "i", false, "print internal topics") - cmd.Flags().BoolVarP(&re, "regex", "r", false, "parse topics as regex; list any topic that matches any input topic expression") + cmd.Flags().BoolVarP(&detailed, "detailed", "d", false, "Print per-partition information for topics") + cmd.Flags().BoolVarP(&internal, "internal", "i", false, "Print internal topics") + cmd.Flags().BoolVarP(&re, "regex", "r", false, "Parse topics as regex; list any topic that matches any input topic expression") return cmd } diff --git a/src/go/rpk/pkg/cli/cmd/topic/produce.go b/src/go/rpk/pkg/cli/cmd/topic/produce.go index 4f95227a9eaa1..6a648b7fa38de 100644 --- a/src/go/rpk/pkg/cli/cmd/topic/produce.go +++ b/src/go/rpk/pkg/cli/cmd/topic/produce.go @@ -43,7 +43,7 @@ func NewProduceCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "produce [TOPIC]", - Short: "Produce records to a topic.", + Short: "Produce records to a topic", Long: helpProduce, Args: cobra.MaximumNArgs(1), Run: func(cmd *cobra.Command, args []string) { @@ -160,10 +160,10 @@ func NewProduceCommand(fs afero.Fs) *cobra.Command { // The following flags require parsing before we initialize our client. 
cmd.Flags().StringVarP(&compression, "compression", "z", "snappy", "Compression to use for producing batches (none, gzip, snapy, lz4, zstd)") cmd.Flags().IntVar(&acks, "acks", -1, "Number of acks required for producing (-1=all, 0=none, 1=leader)") - cmd.Flags().DurationVar(&timeout, "delivery-timeout", 0, "per-record delivery timeout, if non-zero, min 1s") - cmd.Flags().Int32VarP(&partition, "partition", "p", -1, "partition to directly produce to, if non-negative (also allows %p parsing to set partitions)") + cmd.Flags().DurationVar(&timeout, "delivery-timeout", 0, "Per-record delivery timeout, if non-zero, min 1s") + cmd.Flags().Int32VarP(&partition, "partition", "p", -1, "Partition to directly produce to, if non-negative (also allows %p parsing to set partitions)") - cmd.Flags().StringVarP(&inFormat, "format", "f", "%v\n", "input record format") + cmd.Flags().StringVarP(&inFormat, "format", "f", "%v\n", "Input record format") cmd.Flags().StringVarP( &outFormat, "output-format", @@ -173,15 +173,15 @@ func NewProduceCommand(fs afero.Fs) *cobra.Command { ) cmd.Flags().StringArrayVarP(&recHeaders, "header", "H", nil, "Headers in format key:value to add to each record (repeatable)") cmd.Flags().StringVarP(&key, "key", "k", "", "A fixed key to use for each record (parsed input keys take precedence)") - cmd.Flags().BoolVarP(&tombstone, "tombstone", "Z", false, "produce empty values as tombstones") + cmd.Flags().BoolVarP(&tombstone, "tombstone", "Z", false, "Produce empty values as tombstones") // Deprecated cmd.Flags().IntVarP(new(int), "num", "n", 1, "") - cmd.Flags().MarkDeprecated("num", "invoke rpk multiple times if you wish to repeat records") + cmd.Flags().MarkDeprecated("num", "Invoke rpk multiple times if you wish to repeat records") cmd.Flags().BoolVarP(new(bool), "jvm-partitioner", "j", false, "") - cmd.Flags().MarkDeprecated("jvm-partitioner", "the default is now the jvm-partitioner") + cmd.Flags().MarkDeprecated("jvm-partitioner", "The default is now the 
jvm-partitioner") cmd.Flags().StringVarP(new(string), "timestamp", "t", "", "") - cmd.Flags().MarkDeprecated("timestamp", "record timestamps are set when producing") + cmd.Flags().MarkDeprecated("timestamp", "Record timestamps are set when producing") return cmd } diff --git a/src/go/rpk/pkg/cli/cmd/version.go b/src/go/rpk/pkg/cli/cmd/version.go index de67ec4cc4e61..cd8c251bc3fdf 100644 --- a/src/go/rpk/pkg/cli/cmd/version.go +++ b/src/go/rpk/pkg/cli/cmd/version.go @@ -19,7 +19,7 @@ import ( func NewVersionCommand() *cobra.Command { command := &cobra.Command{ Use: "version", - Short: "Check the current version.", + Short: "Check the current version", Long: "", Run: func(_ *cobra.Command, _ []string) { log.SetFormatter(cli.NewNoopFormatter()) diff --git a/src/go/rpk/pkg/cli/cmd/wasm.go b/src/go/rpk/pkg/cli/cmd/wasm.go index 59199c2f03875..1492125907c82 100644 --- a/src/go/rpk/pkg/cli/cmd/wasm.go +++ b/src/go/rpk/pkg/cli/cmd/wasm.go @@ -31,7 +31,7 @@ func NewWasmCommand(fs afero.Fs) *cobra.Command { command := &cobra.Command{ Use: "wasm", - Short: "Deploy and remove inline WASM engine scripts.", + Short: "Deploy and remove inline WASM engine scripts", } common.AddKafkaFlags( command, diff --git a/src/go/rpk/pkg/cli/cmd/wasm/deploy.go b/src/go/rpk/pkg/cli/cmd/wasm/deploy.go index e7ce367753c34..a88eaf48db01d 100644 --- a/src/go/rpk/pkg/cli/cmd/wasm/deploy.go +++ b/src/go/rpk/pkg/cli/cmd/wasm/deploy.go @@ -19,7 +19,7 @@ func NewDeployCommand(fs afero.Fs) *cobra.Command { ) cmd := &cobra.Command{ Use: "deploy [PATH]", - Short: "Deploy inline WASM function.", + Short: "Deploy inline WASM function", Args: cobra.ExactArgs(1), Run: func(cmd *cobra.Command, args []string) { p := config.ParamsFromCommand(cmd) @@ -49,9 +49,9 @@ func NewDeployCommand(fs afero.Fs) *cobra.Command { }, } - cmd.Flags().StringVar(&description, "description", "", "optional description about what the wasm function does") + cmd.Flags().StringVar(&description, "description", "", "Optional description 
about what the wasm function does") cmd.Flags().StringVar(&coprocType, "type", "async", "WASM engine type (async, data-policy)") - cmd.Flags().StringVar(&name, "name", "", "unique deploy identifier attached to the instance of this script") + cmd.Flags().StringVar(&name, "name", "", "Unique deploy identifier attached to the instance of this script") cmd.MarkFlagRequired("name") return cmd } diff --git a/src/go/rpk/pkg/cli/cmd/wasm/generate.go b/src/go/rpk/pkg/cli/cmd/wasm/generate.go index f9f8532a4ac95..c345aa3649927 100644 --- a/src/go/rpk/pkg/cli/cmd/wasm/generate.go +++ b/src/go/rpk/pkg/cli/cmd/wasm/generate.go @@ -30,7 +30,7 @@ func NewGenerateCommand(fs afero.Fs) *cobra.Command { var skipVersion bool cmd := &cobra.Command{ Use: "generate [PROJECT DIRECTORY]", - Short: "Create a npm template project for inline WASM engine.", + Short: "Create a npm template project for inline WASM engine", Args: cobra.ExactArgs(1), Run: func(_ *cobra.Command, args []string) { path, err := filepath.Abs(args[0]) @@ -39,7 +39,7 @@ func NewGenerateCommand(fs afero.Fs) *cobra.Command { out.MaybeDie(err, "unable to generate all manifest files: %v", err) }, } - cmd.Flags().BoolVar(&skipVersion, "skip-version", false, "omit wasm-api version check from npm, use default instead") + cmd.Flags().BoolVar(&skipVersion, "skip-version", false, "Omit wasm-api version check from npm, use default instead") return cmd } diff --git a/src/go/rpk/pkg/cli/cmd/wasm/remove.go b/src/go/rpk/pkg/cli/cmd/wasm/remove.go index 01c522568c523..c35fd1057b0eb 100644 --- a/src/go/rpk/pkg/cli/cmd/wasm/remove.go +++ b/src/go/rpk/pkg/cli/cmd/wasm/remove.go @@ -15,7 +15,7 @@ func NewRemoveCommand(fs afero.Fs) *cobra.Command { cmd := &cobra.Command{ Use: "remove [NAME]", - Short: "Remove inline WASM function.", + Short: "Remove inline WASM function", Args: cobra.ExactArgs(1), Run: func(cmd *cobra.Command, args []string) { p := config.ParamsFromCommand(cmd) diff --git a/src/go/rpk/pkg/config/config.go 
b/src/go/rpk/pkg/config/config.go index 0881da098d619..06d439df26caa 100644 --- a/src/go/rpk/pkg/config/config.go +++ b/src/go/rpk/pkg/config/config.go @@ -143,6 +143,21 @@ func AvailableModes() []string { } } +// FileOrDefaults returns the configuration as read from the file or +// the default configuration if there is no file loaded. +func (c *Config) FileOrDefaults() *Config { + if c.File() != nil { + cfg := c.File() + cfg.loadedPath = c.loadedPath + cfg.ConfigFile = c.ConfigFile // preserve loaded ConfigFile property. + return cfg + } else { + cfg := Default() + cfg.ConfigFile = c.ConfigFile + return cfg // no file, write the defaults + } +} + // Check checks if the redpanda and rpk configuration is valid before running // the tuners. See: redpanda_checkers. func (c *Config) Check() (bool, []error) { diff --git a/src/go/rpk/pkg/config/params_test.go b/src/go/rpk/pkg/config/params_test.go index 111a1ebaf1898..e25f0e4a03492 100644 --- a/src/go/rpk/pkg/config/params_test.go +++ b/src/go/rpk/pkg/config/params_test.go @@ -1,10 +1,12 @@ package config import ( + "os" "strings" "testing" "github.com/spf13/afero" + "github.com/stretchr/testify/require" ) func TestParams_Write(t *testing.T) { @@ -56,7 +58,6 @@ redpanda: }, exp: `config_file: /etc/redpanda/redpanda.yaml redpanda: - data_directory: "" node_id: 6 rack: my_rack `, @@ -130,3 +131,102 @@ rpk: }) } } + +func TestRedpandaSampleFile(t *testing.T) { + // Config from 'redpanda/conf/redpanda.yaml'.
+ sample, err := os.ReadFile("../../../../../conf/redpanda.yaml") + if err != nil { + t.Errorf("unexpected error while reading sample config file: %s", err) + return + } + fs := afero.NewMemMapFs() + err = afero.WriteFile(fs, "/etc/redpanda/redpanda.yaml", sample, 0o644) + if err != nil { + t.Errorf("unexpected error while writing sample config file: %s", err) + return + } + expCfg := &Config{ + ConfigFile: "/etc/redpanda/redpanda.yaml", + loadedPath: "/etc/redpanda/redpanda.yaml", + Redpanda: RedpandaConfig{ + Directory: "/var/lib/redpanda/data", + RPCServer: SocketAddress{ + Address: "0.0.0.0", + Port: 33145, + }, + KafkaAPI: []NamedSocketAddress{{ + Address: "0.0.0.0", + Port: 9092, + }}, + AdminAPI: []NamedSocketAddress{{ + Address: "0.0.0.0", + Port: 9644, + }}, + ID: 1, + SeedServers: []SeedServer{}, + DeveloperMode: true, + }, + Rpk: RpkConfig{ + CoredumpDir: "/var/lib/redpanda/coredump", + EnableUsageStats: true, + }, + Pandaproxy: &Pandaproxy{}, + SchemaRegistry: &SchemaRegistry{}, + } + // Load and check we load it correctly + cfg, err := new(Params).Load(fs) + if err != nil { + t.Errorf("unexpected error while loading sample config file: %s", err) + return + } + cfg = cfg.FileOrDefaults() // we want to check that we correctly load the raw file + require.Equal(t, expCfg, cfg) + + // Write to the file and check we don't mangle the config properties + err = cfg.Write(fs) + if err != nil { + t.Errorf("unexpected error while writing config file: %s", err) + return + } + file, err := afero.ReadFile(fs, "/etc/redpanda/redpanda.yaml") + if err != nil { + t.Errorf("unexpected error while reading config file from fs: %s", err) + return + } + require.Equal(t, `config_file: /etc/redpanda/redpanda.yaml +redpanda: + data_directory: /var/lib/redpanda/data + node_id: 1 + seed_servers: [] + rpc_server: + address: 0.0.0.0 + port: 33145 + kafka_api: + - address: 0.0.0.0 + port: 9092 + admin: + - address: 0.0.0.0 + port: 9644 + developer_mode: true +rpk: + 
enable_usage_stats: true + tune_network: false + tune_disk_scheduler: false + tune_disk_nomerges: false + tune_disk_write_cache: false + tune_disk_irq: false + tune_fstrim: false + tune_cpu: false + tune_aio_events: false + tune_clocksource: false + tune_swappiness: false + tune_transparent_hugepages: false + enable_memory_locking: false + tune_coredump: false + coredump_dir: /var/lib/redpanda/coredump + tune_ballast_file: false + overprovisioned: false +pandaproxy: {} +schema_registry: {} +`, string(file)) +} diff --git a/src/go/rpk/pkg/config/schema.go b/src/go/rpk/pkg/config/schema.go index c3440460a4f1c..f3e7ebfdd5be4 100644 --- a/src/go/rpk/pkg/config/schema.go +++ b/src/go/rpk/pkg/config/schema.go @@ -44,7 +44,7 @@ func (c *Config) File() *Config { } type RedpandaConfig struct { - Directory string `yaml:"data_directory" json:"data_directory"` + Directory string `yaml:"data_directory,omitempty" json:"data_directory"` ID int `yaml:"node_id" json:"node_id"` Rack string `yaml:"rack,omitempty" json:"rack"` SeedServers []SeedServer `yaml:"seed_servers" json:"seed_servers"` diff --git a/src/go/rpk/pkg/tuners/ReadMe.md b/src/go/rpk/pkg/tuners/ReadMe.md index 4f82a46515be9..180ba97067ca6 100644 --- a/src/go/rpk/pkg/tuners/ReadMe.md +++ b/src/go/rpk/pkg/tuners/ReadMe.md @@ -10,7 +10,7 @@ The following tuners are supported The disk IRQs tuner binds all disk IRQs to requested set of CPUs. This tuner uses `hwloc` library to compute CPU masks. Prevent IRQ Balance from moving tuned devices IRQs. CPU set that is used by the tuner can be limited by CPU mask parameter. If mask parameter is provided then only those CPUs that are masked will be considered as available. Mask covering all host CPUs is used as a default. -IRQs are disstributed according to the following rules: +IRQs are distributed according to the following rules: - Distribute NVMe disks IRQs equally among all available CPUs. 
- Distribute non-NVMe disks IRQs equally among designated CPUs or among all available CPUs in the `mq` mode. diff --git a/src/go/rpk/pkg/tuners/check.go b/src/go/rpk/pkg/tuners/check.go index 453018698b854..cc7aea793165a 100644 --- a/src/go/rpk/pkg/tuners/check.go +++ b/src/go/rpk/pkg/tuners/check.go @@ -10,6 +10,7 @@ package tuners import ( + "fmt" "path/filepath" "sort" "time" @@ -30,16 +31,26 @@ func Check( return results, err } - for _, checkers := range checkersMap { + // We use a sorted list of the checker's ID present in the checkersMap to + // run in a consistent order. + var ids []int + for id := range checkersMap { + ids = append(ids, int(id)) + } + sort.Ints(ids) + + for _, id := range ids { + checkers := checkersMap[CheckerID(id)] for _, c := range checkers { + log.Debugf("Starting checker %q", c.GetDesc()) result := c.Check() if result.Err != nil { if c.GetSeverity() == Fatal { - return results, result.Err + return results, fmt.Errorf("fatal error during checker %q execution: %v", c.GetDesc(), result.Err) } - log.Warnf("System check '%s' failed with non-fatal error '%s'", c.GetDesc(), result.Err) + fmt.Printf("System check %q failed with non-fatal error %q\n", c.GetDesc(), result.Err) } - log.Debugf("Checker '%s' result %+v", c.GetDesc(), result) + log.Debugf("Finished checker %q; result %+v", c.GetDesc(), result) results = append(results, *result) } } diff --git a/src/go/rpk/pkg/tuners/redpanda_checkers.go b/src/go/rpk/pkg/tuners/redpanda_checkers.go index f176a637dfdff..b907b94b6d40b 100644 --- a/src/go/rpk/pkg/tuners/redpanda_checkers.go +++ b/src/go/rpk/pkg/tuners/redpanda_checkers.go @@ -13,8 +13,10 @@ package tuners import ( "errors" + "fmt" "time" + "github.com/hashicorp/go-multierror" "github.com/redpanda-data/redpanda/src/go/rpk/pkg/cloud" "github.com/redpanda-data/redpanda/src/go/rpk/pkg/cloud/gcp" "github.com/redpanda-data/redpanda/src/go/rpk/pkg/config" @@ -71,8 +73,14 @@ func NewConfigChecker(conf *config.Config) Checker { Fatal, true, 
func() (interface{}, error) { - ok, _ := conf.Check() - return ok, nil + ok, errs := conf.Check() + var err error + if len(errs) > 0 { + s := multierror.ListFormatFunc(errs) + err = fmt.Errorf("config file checker error: %v", s) + } + + return ok, err }) } diff --git a/src/v/README.md b/src/v/README.md index 08778a84a0a6f..09c8023d1eb47 100644 --- a/src/v/README.md +++ b/src/v/README.md @@ -11,11 +11,11 @@ platform | Machine dependent settings like ssse3 instructions | coproc | WASM / Coprocessor engine for lambda transforms | resource_mgmt | CPU and IO priority | utils | code utils | -hashing | hashing utility adaptors often used in cryptography or checksuming | +hashing | hashing utility adaptors often used in cryptography or checksumming | storage | low level bits of the storage api | redpanda | high level program - main entry point | finjector | failure injector framework for testing and correctness | -json | json manipulation utlities | +json | json manipulation utilities | http | HTTP conversion and utilities | kafka | Kafka compatibility protocol layer | compression | utilities for supporting multiple compressor types | diff --git a/src/v/archival/ntp_archiver_service.cc b/src/v/archival/ntp_archiver_service.cc index 7c8c301914b19..fa181a7199da5 100644 --- a/src/v/archival/ntp_archiver_service.cc +++ b/src/v/archival/ntp_archiver_service.cc @@ -14,6 +14,7 @@ #include "archival/logger.h" #include "cloud_storage/partition_manifest.h" #include "cloud_storage/remote.h" +#include "cloud_storage/tx_range_manifest.h" #include "cloud_storage/types.h" #include "cluster/partition_manager.h" #include "model/metadata.h" @@ -67,6 +68,9 @@ ntp_archiver::ntp_archiver( , _manifest_upload_timeout(conf.manifest_upload_timeout) , _upload_loop_initial_backoff(conf.upload_loop_initial_backoff) , _upload_loop_max_backoff(conf.upload_loop_max_backoff) + , _sync_manifest_timeout( + config::shard_local_cfg() + .cloud_storage_readreplica_manifest_sync_timeout_ms.bind()) , 
_upload_sg(conf.upload_scheduling_group) , _io_priority(conf.upload_io_priority) { vassert( @@ -81,6 +85,30 @@ ntp_archiver::ntp_archiver( _start_term); } +void ntp_archiver::run_sync_manifest_loop() { + vassert( + !_sync_manifest_loop_started, + "sync manifest loop for ntp {} already started", + _ntp); + _sync_manifest_loop_started = true; + + // NOTE: not using ssx::spawn_with_gate_then here because we want to log + // inside the gate (so that _rtclog is guaranteed to be alive). + ssx::spawn_with_gate(_gate, [this] { + return sync_manifest_loop() + .handle_exception_type([](const ss::abort_requested_exception&) {}) + .handle_exception_type([](const ss::sleep_aborted&) {}) + .handle_exception_type([](const ss::gate_closed_exception&) {}) + .handle_exception([this](std::exception_ptr e) { + vlog(_rtclog.error, "sync manifest loop error: {}", e); + }) + .finally([this] { + vlog(_rtclog.debug, "sync manifest loop stopped"); + _sync_manifest_loop_stopped = true; + }); + }); +} + void ntp_archiver::run_upload_loop() { vassert( !_upload_loop_started, "upload loop for ntp {} already started", _ntp); @@ -150,12 +178,72 @@ ss::future<> ntp_archiver::upload_loop() { } } +ss::future<> ntp_archiver::sync_manifest_loop() { + while (sync_manifest_loop_can_continue()) { + cloud_storage::download_result result = co_await sync_manifest(); + + if (result != cloud_storage::download_result::success) { + // The logic in class `remote` already does retries: if we get here, + // it means the download failed after several retries, indicating + // something non-transient may be wrong. Hence error severity. 
+ vlog( + _rtclog.error, + "Failed to download manifest {}", + _manifest.get_manifest_path()); + } else { + vlog( + _rtclog.debug, + "Successfully downloaded manifest {}", + _manifest.get_manifest_path()); + } + co_await ss::sleep_abortable(_sync_manifest_timeout(), _as); + } +} + +ss::future ntp_archiver::sync_manifest() { + cloud_storage::download_result r = co_await download_manifest(); + if (r == cloud_storage::download_result::success) { + vlog(_rtclog.debug, "Downloading manifest in read-replica mode"); + if (_partition->archival_meta_stm()) { + vlog( + _rtclog.debug, + "Updating the archival_meta_stm in read-replica mode"); + auto deadline = ss::lowres_clock::now() + _manifest_upload_timeout; + auto error = co_await _partition->archival_meta_stm()->add_segments( + _manifest, deadline, _as); + if ( + error != cluster::errc::success + && error != cluster::errc::not_leader) { + vlog( + _rtclog.warn, + "archival metadata STM update failed: {}", + error); + } + auto last_offset + = _partition->archival_meta_stm()->manifest().get_last_offset(); + vlog(_rtclog.debug, "manifest last_offset: {}", last_offset); + } + } else { + vlog( + _rtclog.error, + "Failed to download partition manifest in read-replica mode"); + } + co_return r; +} + bool ntp_archiver::upload_loop_can_continue() const { return !_as.abort_requested() && !_gate.is_closed() && _partition->is_elected_leader() && _partition->term() == _start_term; } +bool ntp_archiver::sync_manifest_loop_can_continue() const { + // todo: think about it + return !_as.abort_requested() && !_gate.is_closed() + && _partition->is_elected_leader() + && _partition->term() == _start_term; +} + ss::future<> ntp_archiver::stop() { _as.request_abort(); return _gate.close(); @@ -240,6 +328,34 @@ ntp_archiver::upload_segment(upload_candidate candidate) { _bucket, path, candidate.content_length, reset_func, fib); } +ss::future +ntp_archiver::upload_tx(upload_candidate candidate) { + gate_guard guard{_gate}; + retry_chain_node fib( +
_segment_upload_timeout, _cloud_storage_initial_backoff, &_rtcnode); + retry_chain_logger ctxlog(archival_log, fib, _ntp.path()); + + vlog( + ctxlog.debug, "Uploading segment's tx range {}", candidate.exposed_name); + + auto tx_range = co_await _partition->aborted_transactions( + candidate.starting_offset, candidate.final_offset); + + if (tx_range.empty()) { + // The actual upload only happens if tx_range is not empty. + // The remote_segment should act as if the tx_range is empty if the + // request returned NoSuchKey error. + co_return cloud_storage::upload_result::success; + } + + auto path = cloud_storage::generate_remote_segment_path( + _ntp, _rev, candidate.exposed_name, _start_term); + + cloud_storage::tx_range_manifest manifest(path, tx_range); + + co_return co_await _remote.upload_manifest(_bucket, manifest, fib); +} + ss::future ntp_archiver::schedule_single_upload( model::offset start_upload_offset, model::offset last_stable_offset) { std::optional log = _partition_manager.log(_ntp); @@ -328,8 +444,24 @@ ss::future ntp_archiver::schedule_single_upload( start_upload_offset = offset + model::offset(1); auto delta = base - _partition->get_offset_translator_state()->from_log_offset(base); + // The upload is successful only if both segment and tx_range are uploaded. 
+ auto upl_fut + = ss::when_all(upload_segment(upload), upload_tx(upload)) + .then([](auto tup) { + auto [fs, ftx] = std::move(tup); + auto rs = fs.get(); + auto rtx = ftx.get(); + if ( + rs == cloud_storage::upload_result::success + && rtx == cloud_storage::upload_result::success) { + return rs; + } else if (rs != cloud_storage::upload_result::success) { + return rs; + } + return rtx; + }); co_return scheduled_upload{ - .result = upload_segment(upload), + .result = std::move(upl_fut), .inclusive_last_offset = offset, .meta = cloud_storage::partition_manifest::segment_meta{ .is_compacted = upload.source->is_compacted_segment(), diff --git a/src/v/archival/ntp_archiver_service.h b/src/v/archival/ntp_archiver_service.h index 1cad7145c8227..c36a12c3fae6b 100644 --- a/src/v/archival/ntp_archiver_service.h +++ b/src/v/archival/ntp_archiver_service.h @@ -73,6 +73,8 @@ class ntp_archiver { /// storage. Can be started only once. void run_upload_loop(); + void run_sync_manifest_loop(); + /// Stop archiver. /// /// \return future that will become ready when all async operation will be @@ -80,6 +82,9 @@ class ntp_archiver { ss::future<> stop(); bool upload_loop_stopped() const { return _upload_loop_stopped; } + bool sync_manifest_loop_stopped() const { + return _sync_manifest_loop_stopped; + } /// Get NTP const model::ntp& get_ntp() const; @@ -112,6 +117,8 @@ class ntp_archiver { ss::future upload_next_candidates( std::optional last_stable_offset_override = std::nullopt); + ss::future sync_manifest(); + uint64_t estimate_backlog_size(); /// \brief Probe remote storage and truncate the manifest if needed @@ -156,17 +163,27 @@ class ntp_archiver { /// Upload individual segment to S3. /// - /// \return true on success and false otherwise + /// \return error code ss::future upload_segment(upload_candidate candidate); + /// Upload segment's transactions metadata to S3. 
+ /// + /// \return error code + ss::future + upload_tx(upload_candidate candidate); + /// Upload manifest to the pre-defined S3 location ss::future upload_manifest(); /// Launch the upload loop fiber. ss::future<> upload_loop(); + /// Launch the sync manifest loop fiber. + ss::future<> sync_manifest_loop(); + bool upload_loop_can_continue() const; + bool sync_manifest_loop_can_continue() const; ntp_level_probe _probe; model::ntp _ntp; @@ -190,6 +207,7 @@ class ntp_archiver { ss::semaphore _mutex{1}; ss::lowres_clock::duration _upload_loop_initial_backoff; ss::lowres_clock::duration _upload_loop_max_backoff; + config::binding _sync_manifest_timeout; simple_time_jitter _backoff_jitter{100ms}; size_t _concurrency{4}; ss::lowres_clock::time_point _last_upload_time; @@ -197,6 +215,9 @@ class ntp_archiver { ss::io_priority_class _io_priority; bool _upload_loop_started = false; bool _upload_loop_stopped = false; + + bool _sync_manifest_loop_started = false; + bool _sync_manifest_loop_stopped = false; }; } // namespace archival diff --git a/src/v/archival/service.cc b/src/v/archival/service.cc index b0d474c5a089d..6060f5d0532f3 100644 --- a/src/v/archival/service.cc +++ b/src/v/archival/service.cc @@ -231,48 +231,56 @@ ss::future<> scheduler_service_impl::add_ntp_archiver( if (_gate.is_closed()) { return ss::now(); } - return archiver->download_manifest().then( - [this, archiver](cloud_storage::download_result result) { - auto ntp = archiver->get_ntp(); - switch (result) { - case cloud_storage::download_result::success: - vlog( - _rtclog.info, - "Found manifest for partition {}", - archiver->get_ntp()); - _probe.start_archiving_ntp(); - - _archivers.emplace(archiver->get_ntp(), archiver); - archiver->run_upload_loop(); - - return ss::now(); - case cloud_storage::download_result::notfound: - vlog( - _rtclog.info, - "Start archiving new partition {}", - archiver->get_ntp()); - // Start topic manifest upload - // asynchronously - if (ntp.tp.partition == 0) { - // Upload 
manifest once per topic. GCS has strict - // limits for single object updates. - (void)upload_topic_manifest( - model::topic_namespace(ntp.ns, ntp.tp.topic), - archiver->get_revision_id()); - } - _probe.start_archiving_ntp(); - - _archivers.emplace(archiver->get_ntp(), archiver); - archiver->run_upload_loop(); + return archiver->download_manifest().then([this, archiver]( + cloud_storage::download_result + result) { + auto ntp = archiver->get_ntp(); + auto part = _partition_manager.local().get(ntp); + switch (result) { + case cloud_storage::download_result::success: + vlog(_rtclog.info, "Found manifest for partition {}", ntp); + + if (part->get_ntp_config().is_read_replica_mode_enabled()) { + archiver->run_sync_manifest_loop(); + } else { + _probe.start_archiving_ntp(); + archiver->run_upload_loop(); + } + _archivers.emplace(ntp, archiver); + + return ss::now(); + case cloud_storage::download_result::notfound: + if (part->get_ntp_config().is_read_replica_mode_enabled()) { + vlog( + _rtclog.info, + "Couldn't download manifest for partition {} in read replica", + ntp); + archiver->run_sync_manifest_loop(); + } else { + vlog(_rtclog.info, "Start archiving new partition {}", ntp); + // Start topic manifest upload + // asynchronously + if (ntp.tp.partition == 0) { + // Upload manifest once per topic. GCS has strict + // limits for single object updates. 
+ (void)upload_topic_manifest( + model::topic_namespace(ntp.ns, ntp.tp.topic), + archiver->get_revision_id()); + } + _probe.start_archiving_ntp(); + + archiver->run_upload_loop(); + } + _archivers.emplace(ntp, archiver); - return ss::now(); - case cloud_storage::download_result::failed: - case cloud_storage::download_result::timedout: - vlog(_rtclog.warn, "Manifest download failed"); - return ss::make_exception_future<>(ss::timed_out_error()); - } - return ss::now(); - }); + return ss::now(); + case cloud_storage::download_result::failed: + case cloud_storage::download_result::timedout: + vlog(_rtclog.warn, "Manifest download failed"); + return ss::make_exception_future<>(ss::timed_out_error()); + } + return ss::now(); + }); } ss::future<> @@ -284,9 +292,13 @@ scheduler_service_impl::create_archivers(std::vector to_create) { std::move(to_create), concurrency, [this](const model::ntp& ntp) { auto log = _partition_manager.local().log(ntp); auto part = _partition_manager.local().get(ntp); - if (log.has_value() && part && part->is_elected_leader() - && (part->get_ntp_config().is_archival_enabled() - || config::shard_local_cfg().cloud_storage_enable_remote_write())) { + if (!log.has_value() || !part || !part->is_elected_leader()) { + return ss::now(); + } + if ( + part->get_ntp_config().is_archival_enabled() + || part->get_ntp_config().is_read_replica_mode_enabled() + || config::shard_local_cfg().cloud_storage_enable_remote_write()) { auto archiver = ss::make_lw_shared( log->config(), _partition_manager.local(), @@ -325,7 +337,9 @@ ss::future<> scheduler_service_impl::reconcile_archivers() { // find archivers that have already stopped for (const auto& [ntp, archiver] : _archivers) { auto p = pm.get(ntp); - if (!p || archiver->upload_loop_stopped()) { + if ( + !p + || (archiver->upload_loop_stopped() && archiver->sync_manifest_loop_stopped())) { to_remove.push_back(ntp); } } diff --git a/src/v/cloud_storage/CMakeLists.txt b/src/v/cloud_storage/CMakeLists.txt index 
205c3f7e67383..d3dbfa1325571 100644 --- a/src/v/cloud_storage/CMakeLists.txt +++ b/src/v/cloud_storage/CMakeLists.txt @@ -17,6 +17,7 @@ v_cc_library( remote_segment.cc remote_partition.cc remote_segment_index.cc + tx_range_manifest.cc DEPS Seastar::seastar v::bytes diff --git a/src/v/cloud_storage/base_manifest.h b/src/v/cloud_storage/base_manifest.h index e3169cb6c8973..96d37fec16cb4 100644 --- a/src/v/cloud_storage/base_manifest.h +++ b/src/v/cloud_storage/base_manifest.h @@ -27,6 +27,7 @@ struct serialized_json_stream { enum class manifest_type { topic, partition, + tx_range, }; class base_manifest { diff --git a/src/v/cloud_storage/probe.cc b/src/v/cloud_storage/probe.cc index af7a8f3a8badc..398d690ef7163 100644 --- a/src/v/cloud_storage/probe.cc +++ b/src/v/cloud_storage/probe.cc @@ -99,7 +99,7 @@ remote_probe::remote_probe( } if (!public_disabled) { - auto direction_label = sm::label("direction"); + auto direction_label = ssx::metrics::make_namespaced_label("direction"); _public_metrics.add_group( prometheus_sanitize::metrics_name("cloud_storage"), diff --git a/src/v/cloud_storage/probe.h b/src/v/cloud_storage/probe.h index cc59faf668fd8..7acf0e5dafb2f 100644 --- a/src/v/cloud_storage/probe.h +++ b/src/v/cloud_storage/probe.h @@ -59,6 +59,22 @@ class remote_probe { return _cnt_partition_manifest_downloads; } + /// Register manifest (re)upload + void txrange_manifest_upload() { _cnt_tx_manifest_uploads++; } + + /// Get manifest (re)upload + uint64_t get_txrange_manifest_uploads() const { + return _cnt_tx_manifest_uploads; + } + + /// Register manifest download + void txrange_manifest_download() { _cnt_tx_manifest_downloads++; } + + /// Get manifest download + uint64_t get_txrange_manifest_downloads() const { + return _cnt_tx_manifest_downloads; + } + /// Register backof invocation during manifest upload void manifest_upload_backoff() { _cnt_manifest_upload_backoff++; } @@ -166,6 +182,10 @@ class remote_probe { uint64_t _cnt_bytes_sent{0}; /// Number of bytes 
being successfully received from S3 uint64_t _cnt_bytes_received{0}; + /// Number of tx-range manifest uploads + uint64_t _cnt_tx_manifest_uploads{0}; + /// Number of tx-range manifest downloads + uint64_t _cnt_tx_manifest_downloads{0}; ss::metrics::metric_groups _metrics; ss::metrics::metric_groups _public_metrics; diff --git a/src/v/cloud_storage/remote.cc b/src/v/cloud_storage/remote.cc index 29af6e1790cee..3f3c27c4f37c3 100644 --- a/src/v/cloud_storage/remote.cc +++ b/src/v/cloud_storage/remote.cc @@ -216,6 +216,9 @@ ss::future remote::download_manifest( case manifest_type::topic: _probe.topic_manifest_download(); break; + case manifest_type::tx_range: + _probe.txrange_manifest_download(); + break; } co_return download_result::success; } catch (...) { @@ -298,6 +301,9 @@ ss::future remote::upload_manifest( case manifest_type::topic: _probe.topic_manifest_upload(); break; + case manifest_type::tx_range: + _probe.txrange_manifest_upload(); + break; } _probe.register_upload_size(size); co_return upload_result::success; diff --git a/src/v/cloud_storage/remote_partition.cc b/src/v/cloud_storage/remote_partition.cc index efd96d33986b0..b8e6a342a64ae 100644 --- a/src/v/cloud_storage/remote_partition.cc +++ b/src/v/cloud_storage/remote_partition.cc @@ -29,6 +29,7 @@ #include #include +#include #include using namespace std::chrono_literals; @@ -459,6 +460,14 @@ model::offset remote_partition::first_uploaded_offset() { } } +model::offset remote_partition::last_uploaded_offset() { + vassert( + _manifest.size() > 0, + "The manifest for {} is not expected to be empty", + _manifest.get_ntp()); + return _manifest.get_last_offset(); +} + const model::ntp& remote_partition::get_ntp() const { return _manifest.get_ntp(); } @@ -490,6 +499,54 @@ remote_partition::get_term_last_offset(model::term_id term) const { return std::nullopt; } +ss::future> +remote_partition::aborted_transactions(offset_range offsets) { + // Here we have to use kafka offsets to locate the segments and + // 
redpanda offsets to extract aborted transactions metadata because + // tx-manifests contains redpanda offsets. + std::vector result; + auto first_it = _segments.upper_bound(offsets.begin); + if (first_it != _segments.begin()) { + first_it = std::prev(first_it); + } + for (auto it = first_it; it != _segments.end(); it++) { + if (it->first > offsets.end) { + break; + } + auto& st = it->second; + auto tx = co_await ss::visit( + st, + [this, &st, offsets, offset_key = it->first]( + offloaded_segment_state& off_state) { + auto tmp = off_state->materialize(*this, offset_key); + auto res = tmp->segment->aborted_transactions( + offsets.begin_rp, offsets.end_rp); + st = std::move(tmp); + return res; + }, + [offsets](materialized_segment_ptr& m_state) { + return m_state->segment->aborted_transactions( + offsets.begin_rp, offsets.end_rp); + }); + std::copy(tx.begin(), tx.end(), std::back_inserter(result)); + } + // Adjacent segments might return the same transaction record. + // In this case we will have a duplicate. The duplicates will always + // be located next to each other in the sequence. 
+ auto last = std::unique(result.begin(), result.end()); + result.erase(last, result.end()); + vlog( + _ctxlog.debug, + "found {} aborted transactions for {}-{} offset range ({}-{} before " + "offset translaction)", + result.size(), + offsets.begin_rp, + offsets.begin, + offsets.end_rp, + offsets.end); + co_return result; +} + ss::future<> remote_partition::stop() { vlog(_ctxlog.debug, "remote partition stop {} segments", _segments.size()); _stm_timer.cancel(); diff --git a/src/v/cloud_storage/remote_partition.h b/src/v/cloud_storage/remote_partition.h index f622daec685ec..3d14fce357291 100644 --- a/src/v/cloud_storage/remote_partition.h +++ b/src/v/cloud_storage/remote_partition.h @@ -133,6 +133,13 @@ class btree_map_stable_iterator } // namespace details +struct offset_range { + model::offset begin; + model::offset end; + model::offset begin_rp; + model::offset end_rp; +}; + /// Remote partition manintains list of remote segments /// and list of active readers. Only one reader can be /// maintained per segment. The idea here is that the @@ -180,6 +187,9 @@ class remote_partition /// Return first uploaded kafka offset model::offset first_uploaded_offset(); + /// Return last uploaded kafka offset + model::offset last_uploaded_offset(); + /// Get partition NTP const model::ntp& get_ntp() const; @@ -189,6 +199,10 @@ class remote_partition // returns term last kafka offset std::optional get_term_last_offset(model::term_id) const; + // Get list of aborted transactions that overlap with the offset range + ss::future> + aborted_transactions(offset_range offsets); + private: /// Create new remote_segment instances for all new /// items in the manifest. 
diff --git a/src/v/cloud_storage/remote_segment.cc b/src/v/cloud_storage/remote_segment.cc index a6380aa2eb703..7c8ca39d6d419 100644 --- a/src/v/cloud_storage/remote_segment.cc +++ b/src/v/cloud_storage/remote_segment.cc @@ -13,7 +13,9 @@ #include "bytes/iobuf.h" #include "cloud_storage/cache_service.h" #include "cloud_storage/logger.h" +#include "cloud_storage/partition_manifest.h" #include "cloud_storage/remote_segment_index.h" +#include "cloud_storage/tx_range_manifest.h" #include "cloud_storage/types.h" #include "config/configuration.h" #include "model/fundamental.h" @@ -32,10 +34,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -232,7 +236,7 @@ remote_segment::maybe_get_offsets(model::offset kafka_offset) { return pos; } -ss::future<> remote_segment::do_hydrate() { +ss::future<> remote_segment::do_hydrate_segment() { auto callback = [this]( uint64_t size_bytes, ss::input_stream s) -> ss::future { @@ -296,6 +300,106 @@ ss::future<> remote_segment::do_hydrate() { } } +ss::future<> remote_segment::do_hydrate_txrange() { + ss::gate::holder guard(_gate); + retry_chain_node local_rtc( + cache_hydration_timeout, cache_hydration_backoff, &_rtc); + + tx_range_manifest manifest(_path); + + auto res = co_await _api.download_manifest( + _bucket, manifest.get_manifest_path(), manifest, local_rtc); + + if (res == download_result::notfound) { + vlog( + _ctxlog.debug, + "tx_range {}, doesn't exist in the bucket", + manifest.get_manifest_path()); + } else if (res != download_result::success) { + vlog( + _ctxlog.debug, + "Failed to hydrating a tx_range {}, {} waiter will be " + "invoked", + manifest.get_manifest_path(), + _wait_list.size()); + throw download_exception(res, _path); + } + _tx_range = std::move(manifest).get_tx_range(); +} + +ss::future remote_segment::do_materialize_segment() { + if (_data_file) { + co_return true; + } + auto maybe_file = co_await _cache.get(_path); + if (!maybe_file) { + // We could 
got here because the cache check returned + // 'cache_element_status::available' but right after + // that the file was evicted from cache. It's also + // possible (but very unlikely) that we got here after + // successful hydration which was immediately followed + // by eviction. In any case we should just re-hydrate + // the segment. + vlog( + _ctxlog.info, + "Segment {} was deleted from cache and need to be " + "re-hydrated, {} waiter are pending", + _path, + _wait_list.size()); + co_return false; + } + _data_file = maybe_file->body; + if (!_index) { + // Materialize index state if it's not materialized yet. + // If do_hydrate_segment was called _index will be populated + // and this branch won't be triggered. If the segment was + // available on disk then this branch will read it and populate + // the _index. + co_await maybe_materialize_index(); + } + co_return true; +} + +ss::future remote_segment::do_materialize_txrange() { + if (_tx_range) { + co_return true; + } + auto path = generate_remote_tx_path(_path); + if (auto cache_item = co_await _cache.get(path); cache_item.has_value()) { + // The cache item is expected to be present if the this method is + // called. + vlog(_ctxlog.info, "Trying to materialize tx_range '{}'", path); + tx_range_manifest manifest(_path); + try { + ss::file_input_stream_options options{}; + options.buffer_size + = config::shard_local_cfg().storage_read_buffer_size; + options.read_ahead + = config::shard_local_cfg().storage_read_readahead_count; + options.io_priority_class + = priority_manager::local().shadow_indexing_priority(); + auto inp_stream = ss::make_file_input_stream( + cache_item->body, options); + co_await manifest.update(std::move(inp_stream)); + _tx_range = std::move(manifest).get_tx_range(); + } catch (...) { + vlog( + _ctxlog.warn, + "Failed to materialize tx_range '{}'. 
Error: {}", + path, + std::current_exception()); + } + co_await cache_item->body.close(); + } else { + vlog( + _ctxlog.info, + "tx_range '{}' is not available in cache, retrying", + path); + co_return false; + } + co_return true; +} + ss::future<> remote_segment::maybe_materialize_index() { ss::gate::holder guard(_gate); auto path = _path().native() + ".index"; @@ -342,6 +446,59 @@ ss::future<> remote_segment::maybe_materialize_index() { } } +// NOTE: Aborted transactions handled using tx_range manifests. +// The manifests are uploaded alongside the segments with (.tx) +// suffix added to the name. The hydration of tx_range manifest +// is not optional. We can't use the segment without it. The following +// cases are possible: +// - Both segment and tx-range are not hydrated; +// - The segment is hydrated but tx-range isn't +// - The segment is not hydrated but tx-range is +// - Both segment and tx-range are hydrated +// This doesn't include various 'in_progress' combinations which are +// disallowed. +// +// Also, both segment and tx-range can be materialized or not. In case +// of the segment this means that we're holding an opened file handler. +// In case of tx-range this means that we parsed the json and populated +// _tx_range collection. +// +// In order to be able to deal with the complexity this code combines +// the flags and tries to handle all combinations that makes sense. 
+enum class segment_txrange_status { + in_progress, + available, + not_available, + available_not_available, + not_available_available, +}; + +static segment_txrange_status +combine_statuses(cache_element_status segment, cache_element_status tx_range) { + switch (segment) { + case cache_element_status::in_progress: + return segment_txrange_status::in_progress; + case cache_element_status::available: + switch (tx_range) { + case cache_element_status::available: + return segment_txrange_status::available; + case cache_element_status::in_progress: + return segment_txrange_status::in_progress; + case cache_element_status::not_available: + return segment_txrange_status::available_not_available; + } + case cache_element_status::not_available: + switch (tx_range) { + case cache_element_status::available: + return segment_txrange_status::not_available_available; + case cache_element_status::in_progress: + return segment_txrange_status::in_progress; + case cache_element_status::not_available: + return segment_txrange_status::not_available; + } + } +} + ss::future<> remote_segment::run_hydrate_bg() { ss::gate::holder guard(_gate); try { @@ -361,57 +518,62 @@ ss::future<> remote_segment::run_hydrate_bg() { // and retrieve the file out of it or hydrate. // If _data_file is initialized we can use it safely since the // cache can't delete it until we close it. 
- auto status = co_await _cache.is_cached(_path); + auto tx_path = generate_remote_tx_path(_path); + auto segment_status = co_await _cache.is_cached(_path); + auto txrange_status = co_await _cache.is_cached(tx_path); + auto status = combine_statuses(segment_status, txrange_status); switch (status) { - case cache_element_status::in_progress: + case segment_txrange_status::in_progress: vassert( false, - "Hydration of segment {} is already in progress, {} " - "waiters", + "Hydration of segment or tx-manifest {} is already in " + "progress, {} waiters", _path, _wait_list.size()); - case cache_element_status::available: + case segment_txrange_status::available: vlog( _ctxlog.debug, "Hydrated segment {} is already available, {} waiters " - "will " - "be invoked", + "will be invoked", _path, _wait_list.size()); break; - case cache_element_status::not_available: { - vlog(_ctxlog.info, "Hydrating segment {}", _path); + case segment_txrange_status::not_available: + vlog( + _ctxlog.info, + "Hydrating segment and tx-manifest {}", + _path); try { - co_await do_hydrate(); + co_await ss::coroutine::all( + [this] { return do_hydrate_segment(); }, + [this] { return do_hydrate_txrange(); }); } catch (const download_exception&) { err = std::current_exception(); } - } break; + break; + case segment_txrange_status::not_available_available: + vlog(_ctxlog.info, "Hydrating only segment {}", _path); + try { + co_await do_hydrate_segment(); + } catch (const download_exception&) { + err = std::current_exception(); + } + break; + case segment_txrange_status::available_not_available: + vlog(_ctxlog.info, "Hydrating only tx-manifest {}", _path); + try { + co_await do_hydrate_txrange(); + } catch (const download_exception&) { + err = std::current_exception(); + } + break; } if (!err) { - auto maybe_file = co_await _cache.get(_path); - if (!maybe_file) { - // We could got here because the cache check returned - // 'cache_element_status::available' but right after - // that the file was evicted 
from cache. It's also - // possible (but very unlikely) that we got here after - // successful hydration which was immediately followed - // by eviction. In any case we should just re-hydrate - // the segment. The 'wait' on cond-variable won't block - // because the - // '_wait_list' is not empty. - vlog( - _ctxlog.info, - "Segment {} was deleted from cache and need to be " - "re-hydrated, {} waiter are pending", - _path, - _wait_list.size()); + if (co_await do_materialize_segment() == false) { continue; } - _data_file = maybe_file->body; - if (!_index) { - // materialize index state - co_await maybe_materialize_index(); + if (co_await do_materialize_txrange() == false) { + continue; } } } @@ -454,6 +616,35 @@ ss::future<> remote_segment::hydrate() { }); } +ss::future> +remote_segment::aborted_transactions(model::offset from, model::offset to) { + co_await hydrate(); + std::vector result; + if (!_tx_range) { + // We got NoSuchKey when we tried to download the + // tx-manifest. This means that segment doesn't have + // any record batches which belong to aborted transactions. + vlog(_ctxlog.debug, "segment {} no tx-metadata available", _path); + co_return result; + } + for (const auto& it : *_tx_range) { + if (it.last < from) { + continue; + } + if (it.first > to) { + continue; + } + result.push_back(it); + } + vlog( + _ctxlog.debug, + "found {} aborted transactions for {}-{} offset range in this segment", + result.size(), + from, + to); + co_return result; +} + /// Batch consumer that connects to remote_segment_batch_reader. /// It also does offset translation based on incomplete data in /// manifests. 
diff --git a/src/v/cloud_storage/remote_segment.h b/src/v/cloud_storage/remote_segment.h index 3a2e8e2277a06..ecfeca01bec9c 100644 --- a/src/v/cloud_storage/remote_segment.h +++ b/src/v/cloud_storage/remote_segment.h @@ -17,6 +17,7 @@ #include "cloud_storage/remote.h" #include "cloud_storage/remote_segment_index.h" #include "cloud_storage/types.h" +#include "cluster/rm_stm.h" #include "model/fundamental.h" #include "model/record.h" #include "s3/client.h" @@ -118,6 +119,13 @@ class remote_segment final { bool download_in_progress() const noexcept { return !_wait_list.empty(); } + /// Return aborted transactions metadata associated with the segment + /// + /// \param from start redpanda offset + /// \param to end redpanda offset + ss::future> + aborted_transactions(model::offset from, model::offset to); + private: /// get a file offset for the corresponding kafka offset /// if the index is available @@ -133,7 +141,15 @@ class remote_segment final { /// Actually hydrate the segment. The method downloads the segment file /// to the cache dir and updates the segment index. - ss::future<> do_hydrate(); + ss::future<> do_hydrate_segment(); + /// Hydrate tx manifest. Method downloads the manifest file to the cache + /// dir. + ss::future<> do_hydrate_txrange(); + /// Materilize segment. Segment has to be hydrated beforehand. The + /// 'materialization' process opens file handle and creates + /// compressed segment index in memory. 
+ ss::future do_materialize_segment(); + ss::future do_materialize_txrange(); /// Load segment index from file (if available) ss::future<> maybe_materialize_index(); @@ -162,6 +178,9 @@ class remote_segment final { ss::file _data_file; std::optional _index; + + using tx_range_vec = fragmented_vector; + std::optional _tx_range; }; class remote_segment_batch_consumer; diff --git a/src/v/cloud_storage/tests/CMakeLists.txt b/src/v/cloud_storage/tests/CMakeLists.txt index 1396bb7ec58fd..018041ac6f9ec 100644 --- a/src/v/cloud_storage/tests/CMakeLists.txt +++ b/src/v/cloud_storage/tests/CMakeLists.txt @@ -5,6 +5,7 @@ rp_test( directory_walker_test.cc partition_manifest_test.cc topic_manifest_test.cc + tx_range_manifest_test.cc s3_imposter.cc remote_test.cc offset_translation_layer_test.cc diff --git a/src/v/cloud_storage/tests/tx_range_manifest_test.cc b/src/v/cloud_storage/tests/tx_range_manifest_test.cc new file mode 100644 index 0000000000000..85b7006c8d5da --- /dev/null +++ b/src/v/cloud_storage/tests/tx_range_manifest_test.cc @@ -0,0 +1,90 @@ +/* + * Copyright 2022 Redpanda Data, Inc. + * + * Licensed as a Redpanda Enterprise file under the Redpanda Community + * License (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * https://github.com/redpanda-data/redpanda/blob/master/licenses/rcl.md + */ + +#include "bytes/iobuf.h" +#include "bytes/iobuf_parser.h" +#include "cloud_storage/partition_manifest.h" +#include "cloud_storage/tx_range_manifest.h" +#include "cloud_storage/types.h" +#include "cluster/types.h" +#include "model/compression.h" +#include "model/fundamental.h" +#include "model/metadata.h" +#include "model/record.h" +#include "seastarx.h" + +#include +#include + +#include +#include + +#include +#include +#include +#include + +using namespace cloud_storage; + +static remote_segment_path + segment_path("abcdef01/kafka/topic/0_1/0-1-v1.log.1"); +static remote_manifest_path + manifest_path("abcdef01/kafka/topic/0_1/0-1-v1.log.1.tx"); + +using tx_range_t = cluster::rm_stm::tx_range; + +static std::vector ranges = { + tx_range_t{ + .pid = model::producer_identity(1, 2), + .first = model::offset(3), + .last = model::offset(5), + }, + tx_range_t{ + .pid = model::producer_identity(2, 3), + .first = model::offset(4), + .last = model::offset(6), + }}; + +SEASTAR_THREAD_TEST_CASE(manifest_type_tx) { + tx_range_manifest m(segment_path); + BOOST_REQUIRE(m.get_manifest_type() == manifest_type::tx_range); +} + +SEASTAR_THREAD_TEST_CASE(create_tx_manifest) { + tx_range_manifest m(segment_path); + auto path = m.get_manifest_path(); + BOOST_REQUIRE_EQUAL(path, manifest_path); +} + +SEASTAR_THREAD_TEST_CASE(empty_serialization_roundtrip_test) { + tx_range_manifest m(segment_path); + auto [is, size] = m.serialize(); + iobuf buf; + auto os = make_iobuf_ref_output_stream(buf); + ss::copy(is, os).get(); + + auto rstr = make_iobuf_input_stream(std::move(buf)); + tx_range_manifest restored(segment_path); + restored.update(std::move(rstr)).get(); + BOOST_REQUIRE(m == restored); +} + +SEASTAR_THREAD_TEST_CASE(serialization_roundtrip_test) { + tx_range_manifest m(segment_path, ranges); + auto [is, size] = m.serialize(); + iobuf buf; + auto os = 
make_iobuf_ref_output_stream(buf); + ss::copy(is, os).get(); + + auto rstr = make_iobuf_input_stream(std::move(buf)); + tx_range_manifest restored(segment_path); + restored.update(std::move(rstr)).get(); + BOOST_REQUIRE(m == restored); +} diff --git a/src/v/cloud_storage/tx_range_manifest.cc b/src/v/cloud_storage/tx_range_manifest.cc new file mode 100644 index 0000000000000..89eb6e948db7d --- /dev/null +++ b/src/v/cloud_storage/tx_range_manifest.cc @@ -0,0 +1,121 @@ +#include "cloud_storage/tx_range_manifest.h" + +#include "bytes/iobuf.h" +#include "bytes/iobuf_istreambuf.h" +#include "bytes/iobuf_ostreambuf.h" +#include "cloud_storage/partition_manifest.h" +#include "cloud_storage/types.h" +#include "cluster/rm_stm.h" +#include "json/istreamwrapper.h" +#include "model/record.h" +#include "utils/fragmented_vector.h" + +#include +#include +#include +#include +#include + +namespace cloud_storage { + +remote_manifest_path generate_remote_tx_path(const remote_segment_path& path) { + return remote_manifest_path(fmt::format("{}.tx", path().native())); +} + +tx_range_manifest::tx_range_manifest( + remote_segment_path spath, + const std::vector& range) + : _path(std::move(spath)) { + for (const auto& tx : range) { + _ranges.push_back(tx); + } + _ranges.shrink_to_fit(); +} + +tx_range_manifest::tx_range_manifest(remote_segment_path spath) + : _path(std::move(spath)) {} + +ss::future<> tx_range_manifest::update(ss::input_stream is) { + using namespace rapidjson; + iobuf result; + auto os = make_iobuf_ref_output_stream(result); + co_await ss::copy(is, os); + iobuf_istreambuf ibuf(result); + std::istream stream(&ibuf); + Document m; + IStreamWrapper wrapper(stream); + m.ParseStream(wrapper); + update(m); +} + +void tx_range_manifest::update(const rapidjson::Document& doc) { + _ranges = fragmented_vector(); + auto version = doc["version"].GetInt(); + auto compat_version = doc["compat_version"].GetInt(); + if ( + compat_version + > 
static_cast(tx_range_manifest_version::current_version)) { + throw std::runtime_error(fmt::sprintf( + "Can't deserialize tx manifest, supported version {}, manifest " + "version {}, compatible version {}", + static_cast(tx_range_manifest_version::current_version), + version, + compat_version)); + } + if (doc.HasMember("ranges")) { + const auto& arr = doc["ranges"].GetArray(); + for (const auto& it : arr) { + const auto& tx_range = it.GetObject(); + auto id = tx_range["pid.id"].GetInt64(); + auto epoch = tx_range["pid.epoch"].GetInt(); + auto first = model::offset{tx_range["first"].GetInt64()}; + auto last = model::offset{tx_range["last"].GetInt64()}; + model::producer_identity pid(id, static_cast(epoch)); + _ranges.push_back(cluster::rm_stm::tx_range{pid, first, last}); + } + } + _ranges.shrink_to_fit(); +} + +serialized_json_stream tx_range_manifest::serialize() const { + iobuf serialized; + iobuf_ostreambuf obuf(serialized); + std::ostream os(&obuf); + serialize(os); + size_t size_bytes = serialized.size_bytes(); + return { + .stream = make_iobuf_input_stream(std::move(serialized)), + .size_bytes = size_bytes}; +} + +remote_manifest_path tx_range_manifest::get_manifest_path() const { + return generate_remote_tx_path(_path); +} + +void tx_range_manifest::serialize(std::ostream& out) const { + using namespace rapidjson; + OStreamWrapper wrapper(out); + Writer w(wrapper); + w.StartObject(); + w.Key("version"); + w.Int(static_cast(tx_range_manifest_version::current_version)); + w.Key("compat_version"); + w.Int(static_cast(tx_range_manifest_version::compat_version)); + w.Key("ranges"); + w.StartArray(); + for (const auto& tx : _ranges) { + w.StartObject(); + w.Key("pid.id"); + w.Int64(tx.pid.id); + w.Key("pid.epoch"); + w.Int(tx.pid.epoch); + w.Key("first"); + w.Int64(tx.first()); + w.Key("last"); + w.Int64(tx.last()); + w.EndObject(); + } + w.EndArray(); + w.EndObject(); +} +} // namespace cloud_storage diff --git a/src/v/cloud_storage/tx_range_manifest.h 
b/src/v/cloud_storage/tx_range_manifest.h new file mode 100644 index 0000000000000..43e85393ad43c --- /dev/null +++ b/src/v/cloud_storage/tx_range_manifest.h @@ -0,0 +1,73 @@ +/* + * Copyright 2022 Redpanda Data, Inc. + * + * Licensed as a Redpanda Enterprise file under the Redpanda Community + * License (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * https://github.com/redpanda-data/redpanda/blob/master/licenses/rcl.md + */ + +#pragma once + +#include "cloud_storage/base_manifest.h" +#include "cluster/rm_stm.h" +#include "cluster/types.h" +#include "json/document.h" +#include "model/fundamental.h" +#include "model/metadata.h" + +#include + +#include + +namespace cloud_storage { + +/// Transactional metadata path in S3 +remote_manifest_path generate_remote_tx_path(const remote_segment_path& path); + +class tx_range_manifest final : public base_manifest { +public: + /// Create manifest for specific ntp + explicit tx_range_manifest( + remote_segment_path spath, + const std::vector& range); + + /// Create empty manifest that supposed to be updated later + explicit tx_range_manifest(remote_segment_path spath); + + friend bool + operator==(const tx_range_manifest& lhs, const tx_range_manifest& rhs) { + return lhs._path == rhs._path && lhs._ranges == rhs._ranges; + } + + /// Update manifest file from input_stream (remote set) + ss::future<> update(ss::input_stream is) override; + void update(const rapidjson::Document& is); + + /// Serialize manifest object + /// + /// \return asynchronous input_stream with the serialized json + serialized_json_stream serialize() const override; + + /// Manifest object name in S3 + remote_manifest_path get_manifest_path() const override; + + /// Serialize manifest object + /// + /// \param out output stream that should be used to output the json + void serialize(std::ostream& out) const; + + manifest_type get_manifest_type() const override { + return 
manifest_type::tx_range; + }; + + fragmented_vector&& get_tx_range() && { + return std::move(_ranges); + } + +private: + remote_segment_path _path; + fragmented_vector _ranges; +}; +} // namespace cloud_storage diff --git a/src/v/cloud_storage/types.h b/src/v/cloud_storage/types.h index a2d5dd2cf316e..b86b32ccbe80e 100644 --- a/src/v/cloud_storage/types.h +++ b/src/v/cloud_storage/types.h @@ -60,6 +60,12 @@ enum class manifest_version : int32_t { v1 = 1, }; +enum class tx_range_manifest_version : int32_t { + v1 = 1, + current_version = v1, + compat_version = v1, +}; + static constexpr int32_t topic_manifest_version = 1; std::ostream& operator<<(std::ostream& o, const download_result& r); diff --git a/src/v/cluster/cluster_utils.h b/src/v/cluster/cluster_utils.h index 4aa9380e47b7d..371cd2f4527f2 100644 --- a/src/v/cluster/cluster_utils.h +++ b/src/v/cluster/cluster_utils.h @@ -271,4 +271,14 @@ inline std::vector subtract_replica_sets( }); return ret; } + +// check if replica set contains a node +inline bool contains_node( + const std::vector& replicas, model::node_id id) { + return std::find_if( + replicas.begin(), + replicas.end(), + [id](const model::broker_shard& bs) { return bs.node_id == id; }) + != replicas.end(); +} } // namespace cluster diff --git a/src/v/cluster/controller.cc b/src/v/cluster/controller.cc index f3878fdf73173..508429df43abe 100644 --- a/src/v/cluster/controller.cc +++ b/src/v/cluster/controller.cc @@ -92,7 +92,8 @@ ss::future<> controller::wire_up() { return _authorizer.start( []() { return config::shard_local_cfg().superusers.bind(); }); }) - .then([this] { return _tp_state.start(); }); + .then([this] { return _tp_state.start(); }) + .then([this] { _probe.start(); }); } ss::future<> controller::start() { @@ -234,6 +235,7 @@ ss::future<> controller::start() { return stm.wait(stm.bootstrap_last_applied(), model::no_timeout); }); }) + .then([this] { return cluster_creation_hook(); }) .then( [this] { return 
_backend.invoke_on_all(&controller_backend::start); }) .then([this] { @@ -376,6 +378,7 @@ ss::future<> controller::stop() { f = shutdown_input(); } + _probe.stop(); return f.then([this] { auto stop_leader_balancer = _leader_balancer ? _leader_balancer->stop() : ss::now(); @@ -409,4 +412,39 @@ ss::future<> controller::stop() { }); } +/** + * This function provides for writing the controller log immediately + * after it has been created, before anything else has been written + * to it, and before we have started communicating with peers. + */ +ss::future<> controller::cluster_creation_hook() { + if (!config::node().seed_servers().empty()) { + // We are not on the root node + co_return; + } else if ( + _raft0->last_visible_index() > model::offset{} + || _raft0->config().brokers().size() > 1) { + // The controller log has already been written to + co_return; + } + + // Internal RPC does not start until after controller startup + // is complete (we are called during controller startup), so + // it is guaranteed that if we were single node/empty controller + // log at start of this function, we will still be in that state + // here. The wait for leadership is really just a wait for the + // consensus object to finish writing its last_voted_for from + // its self-vote. 
+ while (!_raft0->is_leader()) { + co_await ss::sleep(100ms); + } + + auto err + = co_await _security_frontend.local().maybe_create_bootstrap_user(); + vassert( + err == errc::success, + "Controller write should always succeed in single replica state during " + "creation"); +} + } // namespace cluster diff --git a/src/v/cluster/controller.h b/src/v/cluster/controller.h index e61e81a793325..509c9f89ff486 100644 --- a/src/v/cluster/controller.h +++ b/src/v/cluster/controller.h @@ -122,7 +122,7 @@ class controller { private: friend controller_probe; -private: + ss::future<> cluster_creation_hook(); config_manager::preload_result _config_preload; ss::sharded _as; // instance per core @@ -136,7 +136,6 @@ class controller { ss::sharded _tp_frontend; // instance per core ss::sharded _backend; // instance per core ss::sharded _stm; // single instance - ss::sharded _service; // instance per core ss::sharded _api; // instance per core ss::sharded _members_frontend; // instance per core ss::sharded _members_backend; // single instance diff --git a/src/v/cluster/controller_backend.cc b/src/v/cluster/controller_backend.cc index 3d0bce521fde3..6635c21c38ea9 100644 --- a/src/v/cluster/controller_backend.cc +++ b/src/v/cluster/controller_backend.cc @@ -90,6 +90,31 @@ std::vector create_brokers_set( return brokers; } +std::vector create_brokers_set( + const std::vector& replicas, + const absl::flat_hash_map& + replica_revisions, + cluster::members_table& members) { + std::vector brokers; + brokers.reserve(replicas.size()); + + std::transform( + std::cbegin(replicas), + std::cend(replicas), + std::back_inserter(brokers), + [&members, &replica_revisions](const model::broker_shard& bs) { + auto br = members.get_broker(bs.node_id); + if (!br) { + throw std::logic_error( + fmt::format("Replica node {} is not available", bs.node_id)); + } + return raft::broker_revision{ + .broker = *br->get(), + .rev = replica_revisions.find(bs.node_id)->second}; + }); + return brokers; +} + 
std::optional get_target_shard( model::node_id id, const std::vector& replicas) { auto it = std::find_if( @@ -519,6 +544,29 @@ find_interrupting_operation(deltas_t::iterator current_it, deltas_t& deltas) { } }); } +ss::future revert_configuration_update( + const model::ntp& ntp, + const std::vector& replicas, + model::revision_id rev, + ss::lw_shared_ptr p, + members_table& members, + topic_table& topics) { + auto in_progress_it = topics.in_progress_updates().find(ntp); + // no longer in progress + if (in_progress_it == topics.in_progress_updates().end()) { + co_return errc::success; + } + auto brokers = create_brokers_set( + replicas, in_progress_it->second.replicas_revisions, members); + vlog( + clusterlog.debug, + "reverting already finished reconfiguration of {}, revision: {}. Replica " + "set: {} ", + ntp, + rev, + replicas); + co_return co_await p->update_replica_set(std::move(brokers), rev); +} } // namespace ss::future<> controller_backend::reconcile_ntp(deltas_t& deltas) { @@ -1197,21 +1245,25 @@ ss::future controller_backend::cancel_replica_set_update( replicas, rev, [this, &ntp, rev, replicas](ss::lw_shared_ptr p) { + const auto current_cfg = p->group_configuration(); + // we do not have to request update/cancellation twice + if (current_cfg.revision_id() == rev) { + return ss::make_ready_future( + errc::waiting_for_recovery); + } + const auto raft_cfg_update_finished - = are_configuration_replicas_up_to_date( - p->group_configuration(), replicas); + = current_cfg.type() == raft::configuration_type::simple; // raft already finished its part, we need to move replica back if (raft_cfg_update_finished) { - auto brokers = create_brokers_set( - replicas, _members_table.local()); - vlog( - clusterlog.debug, - "raft reconfiguration finished, moving partition {} " - "configuration back to requested state: {}", + return revert_configuration_update( ntp, - replicas); - return p->update_replica_set(std::move(brokers), rev); + replicas, + rev, + std::move(p), + 
_members_table.local(), + _topics.local()); } else { vlog( clusterlog.debug, @@ -1233,21 +1285,42 @@ ss::future controller_backend::force_abort_replica_set_update( if (!partition) { co_return errc::partition_not_exists; } + const auto current_cfg = partition->group_configuration(); - const auto raft_cfg_update_finished = are_configuration_replicas_up_to_date( - partition->group_configuration(), replicas); - if (raft_cfg_update_finished) { - co_return co_await update_partition_replica_set(ntp, replicas, rev); - } else { - // wait for configuration update, only declare success - // when configuration was actually updated - auto update_ec = check_configuration_update( - _self, partition, replicas, rev); + // wait for configuration update, only declare success + // when configuration was actually updated + auto update_ec = check_configuration_update( + _self, partition, replicas, rev); - if (!update_ec) { - co_return errc::success; - } + if (!update_ec) { + co_return errc::success; + } + + // we do not have to request update/cancellation twice + if (current_cfg.revision_id() == rev) { + co_return errc::waiting_for_recovery; + } + + const auto raft_cfg_update_finished = current_cfg.type() + == raft::configuration_type::simple; + + if (raft_cfg_update_finished) { + co_return co_await apply_configuration_change_on_leader( + ntp, + replicas, + rev, + [this, rev, &replicas, &ntp]( + ss::lw_shared_ptr p) { + return revert_configuration_update( + ntp, + replicas, + rev, + std::move(p), + _members_table.local(), + _topics.local()); + }); + } else { auto ec = co_await partition->force_abort_replica_set_update(rev); if (ec) { diff --git a/src/v/cluster/controller_backend.h b/src/v/cluster/controller_backend.h index cbdb850eac00a..2febe8dd27d5f 100644 --- a/src/v/cluster/controller_backend.h +++ b/src/v/cluster/controller_backend.h @@ -30,8 +30,190 @@ namespace cluster { -/// on every core, sharded - +/** + * + * # Reconciliation + * + * Controller backend is responsible for 
making sure that the cluster state is + * aligned with the topic and partition state gathered in topic_table. + * + * Controller backend lives on each core on every node in the cluster. Each + * instance of controller backend is responsible for dealing with core & node + * local partition replicas (instances of `cluster::partition` object that are + * supposed to be instantiated on given core and given node). Controller backend + * manages partition replica lifecycle. It instantiates/deletes + * `cluster::partition` instances and registers them in shard table. + * + * Controller backend operations are driven by deltas generated in topics table. + * Backend waits for the new deltas using condition variable. Each delta + * represents an operation that must be executed for ntp e.g. create, update + * properties, move, etc. + * + * Each controller backend in the cluster (on each node and each core) processes + * all the deltas and based on the situation it either executes an operation or + * ignores it (command pattern). + * + * Deltas vector for each NTP is processed in separate fiber, in other words + * deltas for different NTPs are executed concurrently but for the same NTP + * sequentially. + * + * Each delta has a revision assigned. The revision is assigned based on + * the raft0 log offset of command that the delta is related to. The same + * delta has the same revision globally. + * + * Deltas are executed in order from oldest revision up to the newest. + * + * + * NTP_1 + * Loop until finished or cancelled + * + * ┌──────────────────┐ + * │ │ + * │ │ + * ┌────────────┐ ┌────────────┐ ┌────────────┐ │ ┌────────────┐ │ + * │ delta │ │ delta │ │ delta │ │ │ delta │ │ + * │ │ │ │ │ ├──► └─►│ ├──┘ + * │ revision: 3│ │ revision: 2│ │ revision: 1│ │ revision: 0│ + * └────────────┘ └────────────┘ └────────────┘ └────────────┘ + * + * . + * . + * . 
+ * NTP_N + * Loop until finished or cancelled + * + * ┌──────────────────┐ + * │ │ + * │ │ + * ┌────────────┐ ┌────────────┐ ┌────────────┐ │ ┌────────────┐ │ + * │ delta │ │ delta │ │ delta │ │ │ delta │ │ + * │ │ │ │ │ ├──► └─►│ ├──┘ + * │ revision: 3│ │ revision: 2│ │ revision: 1│ │ revision: 0│ + * └────────────┘ └────────────┘ └────────────┘ └────────────┘ + * + * # Revisions + * + * As the reconciliation loops are not coordinated we must be able to recognize + * epochs. Consider a situation in which a stream of deltas executed by the + * backend leads to the state which is identical from end user perspective e.g. + * topic with the same name and configuration was deleted and then created back + * again. We must be able to recognize if the instance of partition replica that + * has been created for the topic belongs to the original topic or the one that + * was re-created. In order to introduce differentiation between the two not + * distinguishable states we use revision_id as an epoch. Revision is used + * whenever partition is created or its replicas are moved. This way controller + * backend is able to recognize if partition replicas have already been updated + * or if action is required. + * + * ## Revisions and raft vnode + * + * Whenever a new replica is added to raft configuration it has new revision + * assigned. In raft each raft group participant is described by a tuple of + * model::node_id and model::revision_id. This way every time the node is re + * added to the configuration (consider a situation in which partition with + * single replica is moved back and forth between two nodes e.g. 1 -> 2 -> 1 + * -> 2...) it is recognized as a new node. This fencing mechanism prevents the + * up to date raft group replicas from communicating with one from previous + * epoch. + * + * # Partition movement + * + * Partition movement in Redpanda is based on the Raft protocol mechanism called + * Joint Consensus. 
When requested, Raft implementation is able to move data + * between nodes in a safe and consistent way. However requesting Raft to + * reconfigure a raft group is not enough to complete a partition move. When + * partition move is requested based on the current situation some of the + * controller backends may have to create new partition replica instances while + * others have to delete the ones that are no longer part of raft group. + * Additionally there may be a need to move partition instance between cores on + * the same node. + * + * Every time partition move is requested each reconciliation loop executes an + * operation based on current and requested state and polls for its completion. + * + * Partition movement finish is coordinated using a designated finish command. + * + * Partition movement finish command is replicated from one of the replicas that + * was changed during reconfiguration process. + * + * IMPORTANT: + * Partition replicas are only deleted when executing delta for operation + * finished command. This way when partition replica is deleted it is guaranteed + * to no longer be needed. + * + * Example: + * + * Consider moving partition between a set of nodes: + * + * replicas on nodes (1,2,3) -> replicas on nodes (2,3,4) + * + * (for simplicity we ignore core assignment in this example) + * + * Assumptions: + * - node 1 is a leader for the partition. + * + * Operations that have to be executed on every node: + * + * Node 1: + * - node 1 is a leader, leader is the only one that can replicate data so it + * will be asked for reconfiguration + * - after partition replica is no longer needed on this node it may be removed + * + * Node 2 & 3: + * - node 2 will wait until configuration is up to date with requested. In + * case leadership from node 1 moved it will ask for reconfiguration + * + * Node 4: + * - node 4 will create a new instance of partition replica and wait for the + * configuration to be up to date. 
+ * - after successful reconfiguration node 4 will dispatch finish update command + * + * + * When finish update command is received by node 1 it will remove the + * partition replica instance. + * + * + * ## Interrupting partition movement + * + * Partition movement interruption may only be accepted after topic table + * processed move command but before the finish update command was processed. We + * use topics table as a single source of truth to decide if the update may + * still be canceled or if it has finished. Therefore we must be able to revert + * configuration change even if raft already finished reconfiguration. + * + * Partition move interruption does not mark the reconfiguration process as + * finished i.e. it will still be represented as in progress when queried from + * topic table. Move interruption will only finish when reconfiguration is + * finished in raft and finish move command is issued by the controller backend. + * + * In general the interrupt may happen in the following situations: + * + * 1) before raft reconfiguration was requested + * 2) when raft reconfiguration is in progress + * 3) when raft reconfiguration has already finished but before finish command + * was replicated + * + * In all of the situations we must move back to the raft group configuration + * which was active before the move was scheduled. The set of actions that must + * be taken to finish the interruption is different based on the situation in + * which interruption happened. + * + * For 1) controller backend must simply update raft configuration revision to + * be able to decide if action related with given revision_id has been executed. + * + * For 2) controller backend will request reconfiguration cancellation on a + * leader and will wait until raft configuration is up to date with what was + * observed before the move. Any replicas that were created for the purpose of + * move will be removed when processing finished move command. 
+ * + * For 3) controller backend must request reconfiguration with the same exact + * replica set as before the move was requested. It is important to notice that + * no partition replicas were yet removed as finish command wasn't yet + * processed. Since cancelling partition move does not create new partition + * replica instances (instances of `cluster::partition`) but reuse the existing + * one we must reuse revision id of currently existing replica instances. + * + */ class controller_backend : public ss::peering_sharded_service { public: diff --git a/src/v/cluster/controller_probe.cc b/src/v/cluster/controller_probe.cc index 833d76e47d22c..1c2ad9a4bc1bd 100644 --- a/src/v/cluster/controller_probe.cc +++ b/src/v/cluster/controller_probe.cc @@ -24,24 +24,34 @@ namespace cluster { controller_probe::controller_probe(controller& c) noexcept - : _controller(c) { - _controller._raft_manager.local().register_leadership_notification( - [this]( - raft::group_id group, - model::term_id /*term*/, - std::optional leader_id) { - // We are only interested in notifications regarding the controller - // group. - if (_controller._raft0->group() != group) { - return; - } - - if (leader_id != _controller.self()) { - _public_metrics.reset(); - } else { - setup_metrics(); - } - }); + : _controller(c) + , _leadership_notification_handle{} {} + +void controller_probe::start() { + _leadership_notification_handle + = _controller._raft_manager.local().register_leadership_notification( + [this]( + raft::group_id group, + model::term_id /*term*/, + std::optional leader_id) { + // We are only interested in notifications regarding the controller + // group. 
+ if (!_controller._raft0 || _controller._raft0->group() != group) { + return; + } + + if (leader_id != _controller.self()) { + _public_metrics.reset(); + } else { + setup_metrics(); + } + }); +} + +void controller_probe::stop() { + _public_metrics.reset(); + _controller._raft_manager.local().unregister_leadership_notification( + _leadership_notification_handle); } void controller_probe::setup_metrics() { @@ -77,7 +87,7 @@ void controller_probe::setup_metrics() { "partitions", [this] { const auto& leaders_table - = _controller._partition_leaders.local(); + = _controller.get_partition_leaders().local(); auto partitions_count = 0; leaders_table.for_each_leader( @@ -92,7 +102,7 @@ void controller_probe::setup_metrics() { "unavailable_partitions", [this] { const auto& leaders_table - = _controller._partition_leaders.local(); + = _controller.get_partition_leaders().local(); auto unavailable_partitions_count = 0; leaders_table.for_each_leader([&unavailable_partitions_count]( diff --git a/src/v/cluster/controller_probe.h b/src/v/cluster/controller_probe.h index 781f2de1ff68c..1688fb82496fd 100644 --- a/src/v/cluster/controller_probe.h +++ b/src/v/cluster/controller_probe.h @@ -12,6 +12,7 @@ #pragma once #include "cluster/fwd.h" +#include "cluster/types.h" #include "seastarx.h" #include @@ -22,11 +23,15 @@ class controller_probe { public: explicit controller_probe(cluster::controller&) noexcept; + void start(); + void stop(); + void setup_metrics(); private: cluster::controller& _controller; std::unique_ptr _public_metrics; + cluster::notification_id_type _leadership_notification_handle; }; } // namespace cluster diff --git a/src/v/cluster/feature_table.cc b/src/v/cluster/feature_table.cc index 134f9a2263741..24bf521389c83 100644 --- a/src/v/cluster/feature_table.cc +++ b/src/v/cluster/feature_table.cc @@ -30,6 +30,8 @@ std::string_view to_string_view(feature f) { return "serde_raft_0"; case feature::license: return "license"; + case feature::rm_stm_kafka_cache: + return 
"rm_stm_kafka_cache"; case feature::test_alpha: return "__test_alpha"; } @@ -58,7 +60,7 @@ std::string_view to_string_view(feature_state::state s) { // The version that this redpanda node will report: increment this // on protocol changes to raft0 structures, like adding new services. -static constexpr cluster_version latest_version = cluster_version{4}; +static constexpr cluster_version latest_version = cluster_version{5}; feature_table::feature_table() { // Intentionally undocumented environment variable, only for use diff --git a/src/v/cluster/feature_table.h b/src/v/cluster/feature_table.h index d40e2a7235b5b..88eddf0420923 100644 --- a/src/v/cluster/feature_table.h +++ b/src/v/cluster/feature_table.h @@ -27,6 +27,7 @@ enum class feature : std::uint64_t { mtls_authentication = 0x8, serde_raft_0 = 0x10, license = 0x20, + rm_stm_kafka_cache = 0x40, // Dummy features for testing only test_alpha = uint64_t(1) << 63, @@ -115,6 +116,12 @@ constexpr static std::array feature_schema{ feature::license, feature_spec::available_policy::always, feature_spec::prepare_policy::always}, + feature_spec{ + cluster_version{5}, + "rm_stm_kafka_cache", + feature::rm_stm_kafka_cache, + feature_spec::available_policy::always, + feature_spec::prepare_policy::always}, feature_spec{ cluster_version{2001}, "__test_alpha", diff --git a/src/v/cluster/fwd.h b/src/v/cluster/fwd.h index c1c28a36e85c2..726f9c59df70f 100644 --- a/src/v/cluster/fwd.h +++ b/src/v/cluster/fwd.h @@ -15,7 +15,6 @@ namespace cluster { class controller; class controller_backend; -class controller_service; class controller_stm_shard; class id_allocator_frontend; class rm_partition_frontend; diff --git a/src/v/cluster/members_backend.cc b/src/v/cluster/members_backend.cc index 65c4f57d03ab1..f69505b91d605 100644 --- a/src/v/cluster/members_backend.cc +++ b/src/v/cluster/members_backend.cc @@ -118,9 +118,9 @@ void members_backend::handle_single_update( vlog(clusterlog.debug, "membership update received: {}", update); 
switch (update.type) { case update_t::recommissioned: - // if node was recommissioned simply remove all decommissioning - // updates handle_recommissioned(update); + _updates.emplace_back(update); + _new_updates.signal(); return; case update_t::reallocation_finished: handle_reallocation_finished(update.id); @@ -132,6 +132,8 @@ void members_backend::handle_single_update( return; case update_t::decommissioned: stop_node_addition(update.id); + _decommission_command_revision.emplace( + update.id, model::revision_id(update.offset)); _updates.emplace_back(update); _new_updates.signal(); return; @@ -171,6 +173,9 @@ void members_backend::calculate_reallocations(update_meta& meta) { case members_manager::node_update_type::added: calculate_reallocations_after_node_added(meta); return; + case members_manager::node_update_type::recommissioned: + calculate_reallocations_after_recommissioned(meta); + return; default: return; } @@ -331,11 +336,80 @@ void members_backend::calculate_reallocations_after_node_added( } } +std::vector members_backend::ntps_moving_from_node_older_than( + model::node_id node, model::revision_id revision) const { + std::vector ret; + + for (const auto& [ntp, state] : _topics.local().in_progress_updates()) { + if (state.update_revision < revision) { + continue; + } + if (!contains_node(state.previous_replicas, node)) { + continue; + } + + auto current_assignment = _topics.local().get_partition_assignment(ntp); + if (unlikely(!current_assignment)) { + continue; + } + + if (!contains_node(current_assignment->replicas, node)) { + ret.push_back(ntp); + } + } + return ret; +} + +void members_backend::calculate_reallocations_after_recommissioned( + update_meta& meta) const { + auto it = _decommission_command_revision.find(meta.update.id); + vassert( + it != _decommission_command_revision.end(), + "members backend should hold a revision of nodes being decommissioned, " + "node_id: {}", + meta.update.id); + auto ntps = 
ntps_moving_from_node_older_than(meta.update.id, it->second); + // reallocate all partitions for which any of replicas is placed on + // decommissioned node + meta.partition_reallocations.reserve(ntps.size()); + for (auto& ntp : ntps) { + partition_reallocation reallocation(ntp); + reallocation.state = reallocation_state::request_cancel; + auto current_assignment = _topics.local().get_partition_assignment(ntp); + auto previous_replica_set = _topics.local().get_previous_replica_set( + ntp); + if ( + !current_assignment.has_value() + || !previous_replica_set.has_value()) { + continue; + } + reallocation.current_replica_set = std::move( + current_assignment->replicas); + reallocation.new_replica_set = std::move(*previous_replica_set); + + meta.partition_reallocations.push_back(std::move(reallocation)); + } +} + ss::future<> members_backend::reconcile() { // if nothing to do, wait co_await _new_updates.wait([this] { return !_updates.empty(); }); auto u = co_await _lock.get_units(); - + // remove stored revisions of previous decommissioning nodes, this will only + // happen when update is finished and it is either decommissioning or + // recommissioning of a node + for (const auto& meta : _updates) { + const bool is_decommission + = meta.update.type + == members_manager::node_update_type::decommissioned; + const bool is_recommission + = meta.update.type + == members_manager::node_update_type::recommissioned; + + if (meta.finished && (is_decommission || is_recommission)) { + _decommission_command_revision.erase(meta.update.id); + } + } // remove finished updates std::erase_if( _updates, [](const update_meta& meta) { return meta.finished; }); @@ -450,7 +524,6 @@ ss::future<> members_backend::reconcile() { const auto allocator_empty = _allocator.local().is_empty( meta.update.id); - if ( is_draining && all_reallocations_finished && allocator_empty && !updates_in_progress) { @@ -478,6 +551,15 @@ ss::future<> members_backend::reconcile() { all_reallocations_finished, 
allocator_empty, updates_in_progress); + if (!allocator_empty && all_reallocations_finished) { + // recalculate reallocations + vlog( + clusterlog.info, + "[update: {}] decommissioning in progress. recalculating " + "reallocations", + meta.update); + calculate_reallocations(meta); + } } } } @@ -641,6 +723,11 @@ ss::future<> members_backend::reallocate_replica_set( meta.current_replica_set, meta.new_replica_set, error.message()); + if (error == errc::no_update_in_progress) { + // mark reallocation as finished, reallocations will be + // recalculated if required + meta.state = reallocation_state::finished; + } co_return; } // success, update state and move on diff --git a/src/v/cluster/members_backend.h b/src/v/cluster/members_backend.h index aaae37e8b834b..918ac77423619 100644 --- a/src/v/cluster/members_backend.h +++ b/src/v/cluster/members_backend.h @@ -10,6 +10,7 @@ #include +#include #include #include @@ -98,6 +99,9 @@ class members_backend { void reassign_replicas(partition_assignment&, partition_reallocation&); void calculate_reallocations_after_node_added(update_meta&) const; void calculate_reallocations_after_decommissioned(update_meta&) const; + void calculate_reallocations_after_recommissioned(update_meta&) const; + std::vector ntps_moving_from_node_older_than( + model::node_id, model::revision_id) const; void setup_metrics(); ss::sharded& _topics_frontend; ss::sharded& _topics; @@ -118,6 +122,14 @@ class members_backend { ss::timer<> _retry_timer; ss::condition_variable _new_updates; ss::metrics::metric_groups _metrics; + /** + * store revision of node decommissioning update, decommissioning command + * revision is stored when node is being decommissioned, it is used to + * determine which partition movements were scheduled before the node was + * decommissioned, recommissioning process will not abort those movements. 
+ */ + absl::flat_hash_map + _decommission_command_revision; }; std::ostream& operator<<(std::ostream&, const members_backend::reallocation_state&); diff --git a/src/v/cluster/members_manager.cc b/src/v/cluster/members_manager.cc index ba0d8df250512..ac09f17569bc3 100644 --- a/src/v/cluster/members_manager.cc +++ b/src/v/cluster/members_manager.cc @@ -226,12 +226,15 @@ members_manager::apply_update(model::record_batch b) { [this, update_offset](decommission_node_cmd cmd) mutable { auto id = cmd.key; return dispatch_updates_to_cores(update_offset, cmd) - .then([this, id](std::error_code error) { + .then([this, id, update_offset](std::error_code error) { auto f = ss::now(); if (!error) { _allocator.local().decommission_node(id); f = _update_queue.push_eventually(node_update{ - .id = id, .type = node_update_type::decommissioned}); + .id = id, + .type = node_update_type::decommissioned, + .offset = update_offset, + }); } return f.then([error] { return error; }); }); @@ -239,23 +242,27 @@ members_manager::apply_update(model::record_batch b) { [this, update_offset](recommission_node_cmd cmd) mutable { auto id = cmd.key; return dispatch_updates_to_cores(update_offset, cmd) - .then([this, id](std::error_code error) { + .then([this, id, update_offset](std::error_code error) { auto f = ss::now(); if (!error) { _allocator.local().recommission_node(id); f = _update_queue.push_eventually(node_update{ - .id = id, .type = node_update_type::recommissioned}); + .id = id, + .type = node_update_type::recommissioned, + .offset = update_offset}); } return f.then([error] { return error; }); }); }, - [this](finish_reallocations_cmd cmd) mutable { + [this, update_offset](finish_reallocations_cmd cmd) mutable { // we do not have to dispatch this command to members table since this // command is only used by a backend to signal successfully finished // node reallocations return _update_queue .push_eventually(node_update{ - .id = cmd.key, .type = node_update_type::reallocation_finished}) + 
.id = cmd.key, + .type = node_update_type::reallocation_finished, + .offset = update_offset}) .then([] { return make_error_code(errc::success); }); }, [this, update_offset](maintenance_mode_cmd cmd) { diff --git a/src/v/cluster/members_manager.h b/src/v/cluster/members_manager.h index 37c019dce75a4..6743f709e6864 100644 --- a/src/v/cluster/members_manager.h +++ b/src/v/cluster/members_manager.h @@ -53,6 +53,12 @@ class members_manager { model::node_id id; node_update_type type; model::offset offset; + + bool is_commissioning() const { + return type == members_manager::node_update_type::decommissioned + || type == members_manager::node_update_type::recommissioned; + } + friend std::ostream& operator<<(std::ostream&, const node_update&); }; diff --git a/src/v/cluster/metadata_cache.cc b/src/v/cluster/metadata_cache.cc index 2cadfe9cb1ee3..85f16c74d25f1 100644 --- a/src/v/cluster/metadata_cache.cc +++ b/src/v/cluster/metadata_cache.cc @@ -254,4 +254,20 @@ metadata_cache::get_default_shadow_indexing_mode() const { } return m; } + +topic_properties metadata_cache::get_default_properties() const { + topic_properties tp; + tp.compression = {get_default_compression()}; + tp.cleanup_policy_bitflags = {get_default_cleanup_policy_bitflags()}; + tp.compaction_strategy = {get_default_compaction_strategy()}; + tp.timestamp_type = {get_default_timestamp_type()}; + tp.segment_size = {get_default_segment_size()}; + tp.retention_bytes = tristate({get_default_retention_bytes()}); + tp.retention_duration = tristate( + {get_default_retention_duration()}); + tp.recovery = {false}; + tp.shadow_indexing = {get_default_shadow_indexing_mode()}; + return tp; +} + } // namespace cluster diff --git a/src/v/cluster/metadata_cache.h b/src/v/cluster/metadata_cache.h index 28f04bfeec467..ea343d3f8a401 100644 --- a/src/v/cluster/metadata_cache.h +++ b/src/v/cluster/metadata_cache.h @@ -166,6 +166,7 @@ class metadata_cache { std::optional get_default_retention_duration() const; 
model::shadow_indexing_mode get_default_shadow_indexing_mode() const; + topic_properties get_default_properties() const; private: ss::sharded& _topics_state; diff --git a/src/v/cluster/partition.cc b/src/v/cluster/partition.cc index 8350845f7d958..090616884b616 100644 --- a/src/v/cluster/partition.cc +++ b/src/v/cluster/partition.cc @@ -32,10 +32,12 @@ partition::partition( consensus_ptr r, ss::sharded& tx_gateway_frontend, ss::sharded& cloud_storage_api, - ss::sharded& cloud_storage_cache) + ss::sharded& cloud_storage_cache, + ss::sharded& feature_table) : _raft(r) , _probe(std::make_unique(*this)) , _tx_gateway_frontend(tx_gateway_frontend) + , _feature_table(feature_table) , _is_tx_enabled(config::shard_local_cfg().enable_transactions.value()) , _is_idempotence_enabled( config::shard_local_cfg().enable_idempotence.value()) { @@ -70,7 +72,7 @@ partition::partition( if (has_rm_stm) { _rm_stm = ss::make_shared( - clusterlog, _raft.get(), _tx_gateway_frontend); + clusterlog, _raft.get(), _tx_gateway_frontend, _feature_table); stm_manager->add_stm(_rm_stm); } @@ -102,21 +104,15 @@ partition::partition( } } -ss::future> partition::replicate( +ss::future> partition::replicate( model::record_batch_reader&& r, raft::replicate_options opts) { - return _raft->replicate(std::move(r), opts); -} - -raft::replicate_stages partition::replicate_in_stages( - model::record_batch_reader&& r, raft::replicate_options opts) { - return _raft->replicate_in_stages(std::move(r), opts); -} - -ss::future> partition::replicate( - model::term_id term, - model::record_batch_reader&& r, - raft::replicate_options opts) { - return _raft->replicate(term, std::move(r), opts); + using ret_t = result; + auto res = co_await _raft->replicate(std::move(r), opts); + if (!res) { + co_return ret_t(res.error()); + } + co_return ret_t(kafka_result{ + kafka::offset(_translator->from_log_offset(res.value().last_offset)())}); } ss::shared_ptr partition::rm_stm() { @@ -138,10 +134,11 @@ ss::shared_ptr 
partition::rm_stm() { return _rm_stm; } -raft::replicate_stages partition::replicate_in_stages( +kafka_stages partition::replicate_in_stages( model::batch_identity bid, model::record_batch_reader&& r, raft::replicate_options opts) { + using ret_t = result; if (bid.is_transactional) { if (!_is_tx_enabled) { vlog( @@ -149,7 +146,7 @@ raft::replicate_stages partition::replicate_in_stages( "Can't process a transactional request to {}. Transactional " "processing isn't enabled.", _raft->ntp()); - return raft::replicate_stages(raft::errc::timeout); + return kafka_stages(raft::errc::timeout); } if (!_rm_stm) { @@ -157,7 +154,7 @@ raft::replicate_stages partition::replicate_in_stages( clusterlog.error, "Topic {} doesn't support transactional processing.", _raft->ntp()); - return raft::replicate_stages(raft::errc::timeout); + return kafka_stages(raft::errc::timeout); } } @@ -168,7 +165,7 @@ raft::replicate_stages partition::replicate_in_stages( "Can't process an idempotent request to {}. Idempotency isn't " "enabled.", _raft->ntp()); - return raft::replicate_stages(raft::errc::timeout); + return kafka_stages(raft::errc::timeout); } if (!_rm_stm) { @@ -176,68 +173,27 @@ raft::replicate_stages partition::replicate_in_stages( clusterlog.error, "Topic {} doesn't support idempotency.", _raft->ntp()); - return raft::replicate_stages(raft::errc::timeout); + return kafka_stages(raft::errc::timeout); } } if (_rm_stm) { return _rm_stm->replicate_in_stages(bid, std::move(r), opts); - } else { - return _raft->replicate_in_stages(std::move(r), opts); - } -} - -ss::future> partition::replicate( - model::batch_identity bid, - model::record_batch_reader&& r, - raft::replicate_options opts) { - if (bid.is_transactional) { - if (!_is_tx_enabled) { - vlog( - clusterlog.error, - "Can't process a transactional request to {}. 
Transactional " - "processing isn't enabled.", - _raft->ntp()); - return ss::make_ready_future>( - raft::errc::timeout); - } - - if (!_rm_stm) { - vlog( - clusterlog.error, - "Topic {} doesn't support transactional processing.", - _raft->ntp()); - return ss::make_ready_future>( - raft::errc::timeout); - } } - if (bid.has_idempotent()) { - if (!_is_idempotence_enabled) { - vlog( - clusterlog.error, - "Can't process an idempotent request to {}. Idempotency isn't " - "enabled.", - _raft->ntp()); - return ss::make_ready_future>( - raft::errc::timeout); - } - - if (!_rm_stm) { - vlog( - clusterlog.error, - "Topic {} doesn't support idempotency.", - _raft->ntp()); - return ss::make_ready_future>( - raft::errc::timeout); - } - } - - if (_rm_stm) { - return _rm_stm->replicate(bid, std::move(r), opts); - } else { - return _raft->replicate(std::move(r), opts); - } + auto res = _raft->replicate_in_stages(std::move(r), opts); + auto replicate_finished = res.replicate_finished.then( + [this](result r) { + if (!r) { + return ret_t(r.error()); + } + auto old_offset = r.value().last_offset; + auto new_offset = kafka::offset( + _translator->from_log_offset(old_offset)()); + return ret_t(kafka_result{new_offset}); + }); + return kafka_stages( + std::move(res.request_enqueued), std::move(replicate_finished)); } ss::future<> partition::start() { @@ -245,7 +201,8 @@ ss::future<> partition::start() { _probe.setup_metrics(ntp); - auto f = _raft->start(); + auto f = _raft->start().then( + [this] { _translator = _raft->get_offset_translator_state(); }); if (is_id_allocator_topic(ntp)) { return f.then([this] { return _id_allocator_stm->start(); }); diff --git a/src/v/cluster/partition.h b/src/v/cluster/partition.h index 81107c6d0beb6..cae61e69878d5 100644 --- a/src/v/cluster/partition.h +++ b/src/v/cluster/partition.h @@ -13,6 +13,7 @@ #include "cloud_storage/remote_partition.h" #include "cluster/archival_metadata_stm.h" +#include "cluster/feature_table.h" #include 
"cluster/id_allocator_stm.h" #include "cluster/partition_probe.h" #include "cluster/rm_stm.h" @@ -44,27 +45,17 @@ class partition { consensus_ptr r, ss::sharded&, ss::sharded&, - ss::sharded&); + ss::sharded&, + ss::sharded&); raft::group_id group() const { return _raft->group(); } ss::future<> start(); ss::future<> stop(); - ss::future> + ss::future> replicate(model::record_batch_reader&&, raft::replicate_options); - raft::replicate_stages - replicate_in_stages(model::record_batch_reader&&, raft::replicate_options); - - ss::future> replicate( - model::term_id, model::record_batch_reader&&, raft::replicate_options); - - ss::future> replicate( - model::batch_identity, - model::record_batch_reader&&, - raft::replicate_options); - - raft::replicate_stages replicate_in_stages( + kafka_stages replicate_in_stages( model::batch_identity, model::record_batch_reader&&, raft::replicate_options); @@ -168,6 +159,13 @@ class partition { std::move(brokers), new_revision_id); } + ss::future update_replica_set( + std::vector brokers, + model::revision_id new_revision_id) { + return _raft->replace_configuration( + std::move(brokers), new_revision_id); + } + raft::group_configuration group_configuration() const { return _raft->config(); } @@ -215,11 +213,21 @@ class partition { return _rm_stm->aborted_transactions(from, to); } + ss::future> + aborted_transactions_cloud(cloud_storage::offset_range offsets) { + return _cloud_storage_partition->aborted_transactions(offsets); + } + const ss::shared_ptr& archival_meta_stm() const { return _archival_meta_stm; } + bool is_read_replica_mode_enabled() const { + const auto& cfg = _raft->log_config(); + return cfg.is_read_replica_mode_enabled(); + } + /// Return true if shadow indexing is enabled for the partition bool is_remote_fetch_enabled() const { const auto& cfg = _raft->log_config(); @@ -241,17 +249,28 @@ class partition { model::offset start_cloud_offset() const { vassert( cloud_data_available(), - "Method can only be called if cloud 
data is available"); + "Method can only be called if cloud data is available, ntp: {}", + _raft->ntp()); return _cloud_storage_partition->first_uploaded_offset(); } + /// Last available cloud offset + model::offset last_cloud_offset() const { + vassert( + cloud_data_available(), + "Method can only be called if cloud data is available, ntp: {}", + _raft->ntp()); + return _cloud_storage_partition->last_uploaded_offset(); + } + /// Create a reader that will fetch data from remote storage ss::future make_cloud_reader( storage::log_reader_config config, std::optional deadline = std::nullopt) { vassert( cloud_data_available(), - "Method can only be called if cloud data is available"); + "Method can only be called if cloud data is available, ntp: {}", + _raft->ntp()); return _cloud_storage_partition->make_reader(config, deadline); } @@ -289,11 +308,7 @@ class partition { return _raft->abort_configuration_change(rev); } -private: - friend partition_manager; - friend replicated_partition_probe; - - consensus_ptr raft() { return _raft; } + consensus_ptr raft() const { return _raft; } private: consensus_ptr _raft; @@ -305,9 +320,11 @@ class partition { ss::abort_source _as; partition_probe _probe; ss::sharded& _tx_gateway_frontend; + ss::sharded& _feature_table; bool _is_tx_enabled{false}; bool _is_idempotence_enabled{false}; ss::lw_shared_ptr _cloud_storage_partition; + ss::lw_shared_ptr _translator; friend std::ostream& operator<<(std::ostream& o, const partition& x); }; diff --git a/src/v/cluster/partition_balancer_planner.cc b/src/v/cluster/partition_balancer_planner.cc index a8a2bf078fefa..7bd6e50572219 100644 --- a/src/v/cluster/partition_balancer_planner.cc +++ b/src/v/cluster/partition_balancer_planner.cc @@ -305,7 +305,7 @@ void partition_balancer_planner::get_unavailable_nodes_reassignments( continue; } auto new_allocation_units = get_reallocation( - a, t.second, partition_size.value(), false, rrs); + a, t.second.metadata, partition_size.value(), false, rrs); if 
(new_allocation_units) { result.reassignments.emplace_back(ntp_reassignments{ .ntp = ntp, @@ -384,7 +384,7 @@ void partition_balancer_planner::get_full_node_reassignments( } auto new_allocation_units = get_reallocation( *current_assignments, - topic_metadata, + topic_metadata.metadata, ntp_size_it->first, true, rrs); diff --git a/src/v/cluster/partition_balancer_types.h b/src/v/cluster/partition_balancer_types.h index 7131f77bb1fab..f562ed813bd14 100644 --- a/src/v/cluster/partition_balancer_types.h +++ b/src/v/cluster/partition_balancer_types.h @@ -49,6 +49,7 @@ struct partition_balancer_violations model::node_id id; model::timestamp unavailable_since; + unavailable_node() noexcept = default; unavailable_node(model::node_id id, model::timestamp unavailable_since) : id(id) , unavailable_since(unavailable_since) {} @@ -65,6 +66,7 @@ struct partition_balancer_violations model::node_id id; uint32_t disk_used_percent; + full_node() noexcept = default; full_node(model::node_id id, uint32_t disk_used_percent) : id(id) , disk_used_percent(disk_used_percent) {} diff --git a/src/v/cluster/partition_manager.cc b/src/v/cluster/partition_manager.cc index 406baf98795c8..263b3f1857ef5 100644 --- a/src/v/cluster/partition_manager.cc +++ b/src/v/cluster/partition_manager.cc @@ -48,13 +48,15 @@ partition_manager::partition_manager( ss::sharded& tx_gateway_frontend, ss::sharded& recovery_mgr, ss::sharded& cloud_storage_api, - ss::sharded& cloud_storage_cache) + ss::sharded& cloud_storage_cache, + ss::sharded& feature_table) : _storage(storage.local()) , _raft_manager(raft) , _tx_gateway_frontend(tx_gateway_frontend) , _partition_recovery_mgr(recovery_mgr) , _cloud_storage_api(cloud_storage_api) - , _cloud_storage_cache(cloud_storage_cache) {} + , _cloud_storage_cache(cloud_storage_cache) + , _feature_table(feature_table) {} partition_manager::ntp_table_container partition_manager::get_topic_partition_table( @@ -120,7 +122,11 @@ ss::future partition_manager::manage( group, 
std::move(initial_nodes), log); auto p = ss::make_lw_shared( - c, _tx_gateway_frontend, _cloud_storage_api, _cloud_storage_cache); + c, + _tx_gateway_frontend, + _cloud_storage_api, + _cloud_storage_cache, + _feature_table); _ntp_table.emplace(log.config().ntp(), p); _raft_table.emplace(group, p); diff --git a/src/v/cluster/partition_manager.h b/src/v/cluster/partition_manager.h index fa66f5ea31a09..8f45302e4abc8 100644 --- a/src/v/cluster/partition_manager.h +++ b/src/v/cluster/partition_manager.h @@ -14,6 +14,7 @@ #include "cloud_storage/cache_service.h" #include "cloud_storage/partition_recovery_manager.h" #include "cloud_storage/remote.h" +#include "cluster/feature_table.h" #include "cluster/ntp_callbacks.h" #include "cluster/partition.h" #include "model/metadata.h" @@ -37,7 +38,8 @@ class partition_manager { ss::sharded&, ss::sharded&, ss::sharded&, - ss::sharded&); + ss::sharded&, + ss::sharded&); using manage_cb_t = ss::noncopyable_function)>; @@ -190,6 +192,7 @@ class partition_manager { _partition_recovery_mgr; ss::sharded& _cloud_storage_api; ss::sharded& _cloud_storage_cache; + ss::sharded& _feature_table; ss::gate _gate; bool _block_new_leadership{false}; diff --git a/src/v/cluster/partition_probe.cc b/src/v/cluster/partition_probe.cc index 949409d0fb20d..e4f42ae68a3f0 100644 --- a/src/v/cluster/partition_probe.cc +++ b/src/v/cluster/partition_probe.cc @@ -89,7 +89,7 @@ void replicated_partition_probe::setup_internal_metrics(const model::ntp& ntp) { sm::make_gauge( "leader_id", [this] { - return _partition._raft->get_leader_id().value_or( + return _partition.raft()->get_leader_id().value_or( model::node_id(-1)); }, sm::description("Id of current partition leader"), @@ -98,7 +98,7 @@ void replicated_partition_probe::setup_internal_metrics(const model::ntp& ntp) { sm::make_gauge( "under_replicated_replicas", [this] { - auto metrics = _partition._raft->get_follower_metrics(); + auto metrics = _partition.raft()->get_follower_metrics(); return std::count_if( 
metrics.cbegin(), metrics.cend(), @@ -143,10 +143,10 @@ void replicated_partition_probe::setup_public_metrics(const model::ntp& ntp) { return; } - auto request_label = sm::label("request"); - auto ns_label = sm::label("namespace"); - auto topic_label = sm::label("topic"); - auto partition_label = sm::label("partition"); + auto request_label = ssx::metrics::make_namespaced_label("request"); + auto ns_label = ssx::metrics::make_namespaced_label("namespace"); + auto topic_label = ssx::metrics::make_namespaced_label("topic"); + auto partition_label = ssx::metrics::make_namespaced_label("partition"); const std::vector labels = { ns_label(ntp.ns()), @@ -181,7 +181,7 @@ void replicated_partition_probe::setup_public_metrics(const model::ntp& ntp) { sm::make_gauge( "under_replicated_replicas", [this] { - auto metrics = _partition._raft->get_follower_metrics(); + auto metrics = _partition.raft()->get_follower_metrics(); return std::count_if( metrics.cbegin(), metrics.cend(), @@ -214,7 +214,7 @@ void replicated_partition_probe::setup_public_metrics(const model::ntp& ntp) { .aggregate({sm::shard_label, partition_label}), sm::make_gauge( "replicas", - [this] { return _partition._raft->get_follower_count(); }, + [this] { return _partition.raft()->get_follower_count(); }, sm::description("Number of replicas per topic"), labels) .aggregate({sm::shard_label, partition_label}), diff --git a/src/v/cluster/rm_stm.cc b/src/v/cluster/rm_stm.cc index 4766ad986f21d..baa3dc492167d 100644 --- a/src/v/cluster/rm_stm.cc +++ b/src/v/cluster/rm_stm.cc @@ -187,10 +187,36 @@ struct tx_snapshot_v0 { std::vector seqs; }; +struct seq_cache_entry_v1 { + int32_t seq{-1}; + model::offset offset; +}; + +struct seq_entry_v1 { + model::producer_identity pid; + int32_t seq{-1}; + model::offset last_offset{-1}; + ss::circular_buffer seq_cache; + model::timestamp::type last_write_timestamp; +}; + +struct tx_snapshot_v1 { + static constexpr uint8_t version = 1; + + std::vector fenced; + std::vector ongoing; + 
std::vector prepared; + std::vector aborted; + std::vector abort_indexes; + model::offset offset; + std::vector seqs; +}; + rm_stm::rm_stm( ss::logger& logger, raft::consensus* c, - ss::sharded& tx_gateway_frontend) + ss::sharded& tx_gateway_frontend, + ss::sharded& feature_table) : persisted_stm("tx.snapshot", logger, c) , _oldest_session(model::timestamp::now()) , _sync_timeout(config::shard_local_cfg().rm_sync_timeout_ms.value()) @@ -209,7 +235,8 @@ rm_stm::rm_stm( , _abort_snapshot_mgr( "abort.idx", std::filesystem::path(c->log_config().work_directory()), - ss::default_priority_class()) { + ss::default_priority_class()) + , _feature_table(feature_table) { if (!_is_tx_enabled) { _is_autoabort_enabled = false; } @@ -760,7 +787,7 @@ ss::future rm_stm::do_abort_tx( co_return tx_errc::none; } -raft::replicate_stages rm_stm::replicate_in_stages( +kafka_stages rm_stm::replicate_in_stages( model::batch_identity bid, model::record_batch_reader r, raft::replicate_options opts) { @@ -779,10 +806,10 @@ raft::replicate_stages rm_stm::replicate_in_stages( enqueued->set_value(); } }); - return raft::replicate_stages(std::move(f), std::move(replicate_finished)); + return kafka_stages(std::move(f), std::move(replicate_finished)); } -ss::future> rm_stm::replicate( +ss::future> rm_stm::replicate( model::batch_identity bid, model::record_batch_reader r, raft::replicate_options opts) { @@ -799,7 +826,7 @@ rm_stm::transfer_leadership(std::optional target) { }); } -ss::future> rm_stm::do_replicate( +ss::future> rm_stm::do_replicate( model::batch_identity bid, model::record_batch_reader b, raft::replicate_options opts, @@ -829,6 +856,11 @@ ss::future<> rm_stm::stop() { return raft::state_machine::stop(); } +ss::future<> rm_stm::start() { + _translator = _c->get_offset_translator_state(); + return persisted_stm::start(); +} + rm_stm::transaction_info::status_t rm_stm::get_tx_status(model::producer_identity pid) const { if (_mem_state.preparing.contains(pid)) { @@ -922,7 +954,7 @@ bool 
rm_stm::check_seq(model::batch_identity bid) { return false; } - seq.update(bid.last_seq, model::offset{-1}); + seq.update(bid.last_seq, kafka::offset{-1}); seq.pid = bid.pid; seq.last_write_timestamp = last_write_timestamp; @@ -932,7 +964,7 @@ bool rm_stm::check_seq(model::batch_identity bid) { return true; } -std::optional +std::optional rm_stm::known_seq(model::batch_identity bid) const { auto pid_seq = _log_state.seq_table.find(bid.pid); if (pid_seq == _log_state.seq_table.end()) { @@ -957,7 +989,7 @@ std::optional rm_stm::tail_seq(model::producer_identity pid) const { return pid_seq->second.seq; } -void rm_stm::set_seq(model::batch_identity bid, model::offset last_offset) { +void rm_stm::set_seq(model::batch_identity bid, kafka::offset last_offset) { auto pid_seq = _log_state.seq_table.find(bid.pid); if (pid_seq != _log_state.seq_table.end()) { if (pid_seq->second.seq == bid.last_seq) { @@ -970,14 +1002,14 @@ void rm_stm::reset_seq(model::batch_identity bid) { _log_state.seq_table.erase(bid.pid); auto& seq = _log_state.seq_table[bid.pid]; seq.seq = bid.last_seq; - seq.last_offset = model::offset{-1}; + seq.last_offset = kafka::offset{-1}; seq.pid = bid.pid; seq.last_write_timestamp = model::timestamp::now().value(); _oldest_session = std::min( _oldest_session, model::timestamp(seq.last_write_timestamp)); } -ss::future> +ss::future> rm_stm::replicate_tx(model::batch_identity bid, model::record_batch_reader br) { if (!check_tx_permitted()) { co_return errc::generic_tx_error; @@ -1031,7 +1063,7 @@ rm_stm::replicate_tx(model::batch_identity bid, model::record_batch_reader br) { // this isn't the first attempt in the tx we should try dedupe auto cached_offset = known_seq(bid); if (cached_offset) { - if (cached_offset.value() < model::offset{0}) { + if (cached_offset.value() < kafka::offset{0}) { vlog( clusterlog.warn, "Status of the original attempt is unknown (still is " @@ -1045,8 +1077,7 @@ rm_stm::replicate_tx(model::batch_identity bid, 
model::record_batch_reader br) { // to propagate it to the app layer co_return errc::generic_tx_error; } - co_return raft::replicate_result{ - .last_offset = cached_offset.value()}; + co_return kafka_result{.last_offset = cached_offset.value()}; } if (!check_seq(bid)) { @@ -1082,26 +1113,28 @@ rm_stm::replicate_tx(model::batch_identity bid, model::record_batch_reader br) { expiration_it->second.last_update = clock_type::now(); expiration_it->second.is_expiration_requested = false; - auto replicated = r.value(); + auto old_offset = r.value().last_offset; + auto new_offset = from_log_offset(old_offset); - set_seq(bid, replicated.last_offset); + set_seq(bid, new_offset); - auto last_offset = model::offset(replicated.last_offset()); if (!_mem_state.tx_start.contains(bid.pid)) { - auto base_offset = model::offset( - last_offset() - (bid.record_count - 1)); + auto base_offset = model::offset(old_offset() - (bid.record_count - 1)); _mem_state.tx_start.emplace(bid.pid, base_offset); _mem_state.tx_starts.insert(base_offset); _mem_state.estimated.erase(bid.pid); } - co_return replicated; + + co_return kafka_result{.last_offset = new_offset}; } -ss::future> rm_stm::replicate_seq( +ss::future> rm_stm::replicate_seq( model::batch_identity bid, model::record_batch_reader br, raft::replicate_options opts, ss::lw_shared_ptr> enqueued) { + using ret_t = result; + if (!co_await sync(_sync_timeout)) { // it's ok not to set enqueued on early return because // the safety check in replicate_in_stages sets it automatically @@ -1160,7 +1193,7 @@ ss::future> rm_stm::replicate_seq( // checking among the responded requests auto cached_offset = known_seq(bid); if (cached_offset) { - co_return raft::replicate_result{.last_offset = cached_offset.value()}; + co_return kafka_result{.last_offset = cached_offset.value()}; } // checking if the request is already being processed @@ -1168,8 +1201,8 @@ ss::future> rm_stm::replicate_seq( if (inflight->last_seq == bid.last_seq && inflight->is_processing) 
{ // found an inflight request, parking the current request // until the former is resolved - auto promise = ss::make_lw_shared< - available_promise>>(); + auto promise + = ss::make_lw_shared>>(); inflight->parked.push_back(promise); u.return_all(); co_return co_await promise->get_future(); @@ -1254,20 +1287,26 @@ ss::future> rm_stm::replicate_seq( // we don't need session->lock because we never interleave // access to is_processing and offset with sync point (await) request->is_processing = false; - request->r = r; + if (r) { + auto old_offset = r.value().last_offset; + auto new_offset = from_log_offset(old_offset); + request->r = ret_t(kafka_result{new_offset}); + } else { + request->r = ret_t(r.error()); + } for (auto& pending : request->parked) { - pending->set_value(r); + pending->set_value(request->r); } request->parked.clear(); - if (!r) { + if (!request->r) { // if r was failed at the consensus level (not because has_failed) // it should guarantee that all follow up replication requests fail // too but just in case stepping down to minimize the risk if (_c->is_leader() && _c->term() == synced_term) { co_await _c->step_down(); } - co_return r; + co_return request->r; } // requests get into session->cache in seq order so when we iterate @@ -1299,13 +1338,15 @@ ss::future> rm_stm::replicate_seq( _inflight_requests.erase(bid.pid); } - co_return r; + co_return request->r; } -ss::future> rm_stm::replicate_msg( +ss::future> rm_stm::replicate_msg( model::record_batch_reader br, raft::replicate_options opts, ss::lw_shared_ptr> enqueued) { + using ret_t = result; + if (!co_await sync(_sync_timeout)) { co_return errc::not_leader; } @@ -1313,7 +1354,14 @@ ss::future> rm_stm::replicate_msg( auto ss = _c->replicate_in_stages(_insync_term, std::move(br), opts); co_await std::move(ss.request_enqueued); enqueued->set_value(); - co_return co_await std::move(ss.replicate_finished); + auto r = co_await std::move(ss.replicate_finished); + + if (!r) { + co_return 
ret_t(r.error()); + } + auto old_offset = r.value().last_offset; + auto new_offset = from_log_offset(old_offset); + co_return ret_t(kafka_result{new_offset}); } model::offset rm_stm::last_stable_offset() { @@ -1760,12 +1808,13 @@ ss::future<> rm_stm::apply_control( void rm_stm::apply_data(model::batch_identity bid, model::offset last_offset) { if (bid.has_idempotent()) { auto [seq_it, inserted] = _log_state.seq_table.try_emplace(bid.pid); + auto translated = from_log_offset(last_offset); if (inserted) { seq_it->second.pid = bid.pid; seq_it->second.seq = bid.last_seq; - seq_it->second.last_offset = last_offset; + seq_it->second.last_offset = translated; } else { - seq_it->second.update(bid.last_seq, last_offset); + seq_it->second.update(bid.last_seq, translated); } seq_it->second.last_write_timestamp = bid.first_timestamp.value(); _oldest_session = std::min(_oldest_session, bid.first_timestamp); @@ -1812,19 +1861,50 @@ rm_stm::apply_snapshot(stm_snapshot_header hdr, iobuf&& tx_ss_buf) { iobuf_parser data_parser(std::move(tx_ss_buf)); if (hdr.version == tx_snapshot::version) { data = reflection::adl{}.from(data_parser); + } else if (hdr.version == tx_snapshot_v1::version) { + auto data_v1 = reflection::adl{}.from(data_parser); + data.fenced = std::move(data_v1.fenced); + data.ongoing = std::move(data_v1.ongoing); + data.prepared = std::move(data_v1.prepared); + data.aborted = std::move(data_v1.aborted); + data.abort_indexes = std::move(data_v1.abort_indexes); + data.offset = std::move(data_v1.offset); + for (auto& seq_v1 : data_v1.seqs) { + seq_entry seq; + seq.pid = seq_v1.pid; + seq.seq = seq_v1.seq; + try { + seq.last_offset = from_log_offset(seq_v1.last_offset); + } catch (...) { + // ignoring outside the translation range errors + continue; + } + seq.seq_cache.reserve(seq_v1.seq_cache.size()); + for (auto& item : seq_v1.seq_cache) { + try { + seq.seq_cache.push_back(seq_cache_entry{ + .seq = item.seq, .offset = from_log_offset(item.offset)}); + } catch (...) 
{ + // ignoring outside the translation range errors + continue; + } + } + seq.last_write_timestamp = seq_v1.last_write_timestamp; + data.seqs.push_back(std::move(seq)); + } } else if (hdr.version == tx_snapshot_v0::version) { auto data_v0 = reflection::adl{}.from(data_parser); - data.fenced = data_v0.fenced; - data.ongoing = data_v0.ongoing; - data.prepared = data_v0.prepared; - data.aborted = data_v0.aborted; - data.abort_indexes = data_v0.abort_indexes; - data.offset = data_v0.offset; + data.fenced = std::move(data_v0.fenced); + data.ongoing = std::move(data_v0.ongoing); + data.prepared = std::move(data_v0.prepared); + data.aborted = std::move(data_v0.aborted); + data.abort_indexes = std::move(data_v0.abort_indexes); + data.offset = std::move(data_v0.offset); for (auto seq_v0 : data_v0.seqs) { auto seq = seq_entry{ .pid = seq_v0.pid, .seq = seq_v0.seq, - .last_offset = model::offset{-1}, + .last_offset = kafka::offset{-1}, .last_write_timestamp = seq_v0.last_write_timestamp}; data.seqs.push_back(std::move(seq)); } @@ -1879,6 +1959,32 @@ rm_stm::apply_snapshot(stm_snapshot_header hdr, iobuf&& tx_ss_buf) { _insync_offset = data.offset; } +uint8_t rm_stm::active_snapshot_version() { + if (_feature_table.local().is_active(feature::rm_stm_kafka_cache)) { + return tx_snapshot::version; + } + return tx_snapshot_v1::version; +} + +template +void rm_stm::fill_snapshot_wo_seqs(T& snapshot) { + for (auto const& [k, v] : _log_state.fence_pid_epoch) { + snapshot.fenced.push_back(model::producer_identity{k(), v()}); + } + for (auto& entry : _log_state.ongoing_map) { + snapshot.ongoing.push_back(entry.second); + } + for (auto& entry : _log_state.prepared) { + snapshot.prepared.push_back(entry.second); + } + for (auto& entry : _log_state.aborted) { + snapshot.aborted.push_back(entry); + } + for (auto& entry : _log_state.abort_indexes) { + snapshot.abort_indexes.push_back(entry); + } +} + ss::future rm_stm::take_snapshot() { if (_log_state.aborted.size() > 
_abort_index_segment_size) { std::sort( @@ -1904,33 +2010,51 @@ ss::future rm_stm::take_snapshot() { _log_state.aborted = snapshot.aborted; } - tx_snapshot tx_ss; - - for (auto const& [k, v] : _log_state.fence_pid_epoch) { - tx_ss.fenced.push_back(model::producer_identity{k(), v()}); - } - for (auto& entry : _log_state.ongoing_map) { - tx_ss.ongoing.push_back(entry.second); - } - for (auto& entry : _log_state.prepared) { - tx_ss.prepared.push_back(entry.second); - } - for (auto& entry : _log_state.aborted) { - tx_ss.aborted.push_back(entry); - } - for (auto& entry : _log_state.abort_indexes) { - tx_ss.abort_indexes.push_back(entry); - } - for (const auto& entry : _log_state.seq_table) { - tx_ss.seqs.push_back(entry.second.copy()); - } - tx_ss.offset = _insync_offset; - iobuf tx_ss_buf; - reflection::adl{}.to(tx_ss_buf, std::move(tx_ss)); + auto version = active_snapshot_version(); + if (version == tx_snapshot::version) { + tx_snapshot tx_ss; + fill_snapshot_wo_seqs(tx_ss); + for (const auto& entry : _log_state.seq_table) { + tx_ss.seqs.push_back(entry.second.copy()); + } + tx_ss.offset = _insync_offset; + reflection::adl{}.to(tx_ss_buf, std::move(tx_ss)); + } else if (version == tx_snapshot_v1::version) { + tx_snapshot_v1 tx_ss; + fill_snapshot_wo_seqs(tx_ss); + for (const auto& it : _log_state.seq_table) { + auto& entry = it.second; + seq_entry_v1 seqs; + seqs.pid = entry.pid; + seqs.seq = entry.seq; + try { + seqs.last_offset = to_log_offset(entry.last_offset); + } catch (...) { + // ignoring outside the translation range errors + continue; + } + seqs.last_write_timestamp = entry.last_write_timestamp; + seqs.seq_cache.reserve(seqs.seq_cache.size()); + for (auto& item : entry.seq_cache) { + try { + seqs.seq_cache.push_back(seq_cache_entry_v1{ + .seq = item.seq, .offset = to_log_offset(item.offset)}); + } catch (...) 
{ + // ignoring outside the translation range errors + continue; + } + } + tx_ss.seqs.push_back(std::move(seqs)); + } + tx_ss.offset = _insync_offset; + reflection::adl{}.to(tx_ss_buf, std::move(tx_ss)); + } else { + vassert(false, "unsupported tx_snapshot version {}", version); + } co_return stm_snapshot::create( - tx_snapshot::version, _insync_offset, std::move(tx_ss_buf)); + version, _insync_offset, std::move(tx_ss_buf)); } ss::future<> rm_stm::save_abort_snapshot(abort_snapshot snapshot) { diff --git a/src/v/cluster/rm_stm.h b/src/v/cluster/rm_stm.h index 48f3fa0da9645..5ced34663a920 100644 --- a/src/v/cluster/rm_stm.h +++ b/src/v/cluster/rm_stm.h @@ -11,6 +11,7 @@ #pragma once +#include "cluster/feature_table.h" #include "cluster/persisted_stm.h" #include "cluster/tx_utils.h" #include "cluster/types.h" @@ -21,6 +22,7 @@ #include "raft/logger.h" #include "raft/state_machine.h" #include "raft/types.h" +#include "storage/offset_translator_state.h" #include "storage/snapshot.h" #include "utils/available_promise.h" #include "utils/expiring_promise.h" @@ -56,6 +58,8 @@ class rm_stm final : public persisted_stm { model::producer_identity pid; model::offset first; model::offset last; + + auto operator<=>(const tx_range&) const = default; }; struct abort_index { @@ -74,14 +78,14 @@ class rm_stm final : public persisted_stm { struct seq_cache_entry { int32_t seq{-1}; - model::offset offset; + kafka::offset offset; }; struct seq_entry { static const int seq_cache_size = 5; model::producer_identity pid; int32_t seq{-1}; - model::offset last_offset{-1}; + kafka::offset last_offset{-1}; ss::circular_buffer seq_cache; model::timestamp::type last_write_timestamp; @@ -99,7 +103,7 @@ class rm_stm final : public persisted_stm { return ret; } - void update(int32_t new_seq, model::offset new_offset) { + void update(int32_t new_seq, kafka::offset new_offset) { if (new_seq < seq) { return; } @@ -109,7 +113,7 @@ class rm_stm final : public persisted_stm { return; } - if (seq >= 0 && 
last_offset >= model::offset{0}) { + if (seq >= 0 && last_offset >= kafka::offset{0}) { auto entry = seq_cache_entry{.seq = seq, .offset = last_offset}; seq_cache.push_back(entry); while (seq_cache.size() >= seq_entry::seq_cache_size) { @@ -123,7 +127,7 @@ class rm_stm final : public persisted_stm { }; struct tx_snapshot { - static constexpr uint8_t version = 1; + static constexpr uint8_t version = 2; std::vector fenced; std::vector ongoing; @@ -150,7 +154,8 @@ class rm_stm final : public persisted_stm { explicit rm_stm( ss::logger&, raft::consensus*, - ss::sharded&); + ss::sharded&, + ss::sharded&); ss::future> begin_tx( model::producer_identity, model::tx_seq, std::chrono::milliseconds); @@ -169,12 +174,12 @@ class rm_stm final : public persisted_stm { ss::future> aborted_transactions(model::offset, model::offset); - raft::replicate_stages replicate_in_stages( + kafka_stages replicate_in_stages( model::batch_identity, model::record_batch_reader, raft::replicate_options); - ss::future> replicate( + ss::future> replicate( model::batch_identity, model::record_batch_reader, raft::replicate_options); @@ -184,6 +189,8 @@ class rm_stm final : public persisted_stm { ss::future<> stop() override; + ss::future<> start() override; + void testing_only_disable_auto_abort() { _is_autoabort_enabled = false; } void testing_only_enable_transactions() { _is_tx_enabled = true; } @@ -273,27 +280,27 @@ class rm_stm final : public persisted_stm { ss::future<> save_abort_snapshot(abort_snapshot); bool check_seq(model::batch_identity); - std::optional known_seq(model::batch_identity) const; - void set_seq(model::batch_identity, model::offset); + std::optional known_seq(model::batch_identity) const; + void set_seq(model::batch_identity, kafka::offset); void reset_seq(model::batch_identity); std::optional tail_seq(model::producer_identity) const; - ss::future> do_replicate( + ss::future> do_replicate( model::batch_identity, model::record_batch_reader, raft::replicate_options, 
ss::lw_shared_ptr>); - ss::future> + ss::future> replicate_tx(model::batch_identity, model::record_batch_reader); - ss::future> replicate_seq( + ss::future> replicate_seq( model::batch_identity, model::record_batch_reader, raft::replicate_options, ss::lw_shared_ptr>); - ss::future> replicate_msg( + ss::future> replicate_msg( model::record_batch_reader, raft::replicate_options, ss::lw_shared_ptr>); @@ -406,27 +413,14 @@ class rm_stm final : public persisted_stm { } }; - struct request_id { - model::producer_identity pid; - int32_t seq; - - auto operator<=>(const request_id&) const = default; - - template - friend H AbslHashValue(H h, const request_id& bid) { - return H::combine(std::move(h), bid.pid, bid.seq); - } - }; - // When a request is retried while the first appempt is still // being replicated the retried request is parked until the // original request is replicated. struct inflight_request { int32_t last_seq{-1}; - result r = errc::success; + result r = errc::success; bool is_processing; - std::vector< - ss::lw_shared_ptr>>> + std::vector>>> parked; }; @@ -466,8 +460,7 @@ class rm_stm final : public persisted_stm { tail_seq = -1; } - std::optional> - known_seq(int32_t last_seq) const { + std::optional> known_seq(int32_t last_seq) const { for (auto& seq : cache) { if (seq->last_seq == last_seq && !seq->is_processing) { return seq->r; @@ -487,11 +480,30 @@ class rm_stm final : public persisted_stm { return lock_it->second; } + kafka::offset from_log_offset(model::offset old_offset) { + if (old_offset > model::offset{-1}) { + return kafka::offset(_translator->from_log_offset(old_offset)()); + } + return kafka::offset(old_offset()); + } + + model::offset to_log_offset(kafka::offset new_offset) { + if (new_offset > model::offset{-1}) { + return _translator->to_log_offset(model::offset(new_offset())); + } + return model::offset(new_offset()); + } + transaction_info::status_t get_tx_status(model::producer_identity pid) const; std::optional 
get_expiration_info(model::producer_identity pid) const; + uint8_t active_snapshot_version(); + + template + void fill_snapshot_wo_seqs(T&); + ss::basic_rwlock<> _state_lock; absl::flat_hash_map> _tx_locks; absl::flat_hash_map< @@ -514,6 +526,8 @@ class rm_stm final : public persisted_stm { bool _is_tx_enabled{false}; ss::sharded& _tx_gateway_frontend; storage::snapshot_manager _abort_snapshot_mgr; + ss::lw_shared_ptr _translator; + ss::sharded& _feature_table; }; } // namespace cluster diff --git a/src/v/cluster/security_frontend.cc b/src/v/cluster/security_frontend.cc index 18deff751ace7..32ef272f7d65c 100644 --- a/src/v/cluster/security_frontend.cc +++ b/src/v/cluster/security_frontend.cc @@ -30,6 +30,7 @@ #include "rpc/errc.h" #include "rpc/types.h" #include "security/authorizer.h" +#include "security/scram_algorithm.h" #include @@ -308,4 +309,59 @@ security_frontend::dispatch_delete_acls_to_leader( }); } +/** + * For use during cluster creation, if RP_BOOTSTRAP_USER is set + * then write a user creation message to the controller log. + * + * @returns an error code if controller log write failed. If the + * environment variable is missing or malformed this is + * not considered an error. + * + */ +ss::future security_frontend::maybe_create_bootstrap_user() { + static const ss::sstring bootstrap_user_env_key{"RP_BOOTSTRAP_USER"}; + + auto creds_str_ptr = std::getenv(bootstrap_user_env_key.c_str()); + if (creds_str_ptr == nullptr) { + // Environment variable is not set + co_return errc::success; + } + + ss::sstring creds_str = creds_str_ptr; + auto colon = creds_str.find(":"); + if (colon == ss::sstring::npos || colon == creds_str.size() - 1) { + // Malformed value. Do not log the value, it may be malformed + // but it is still a secret. 
+ vlog( + clusterlog.warn, + "Invalid value of {} (expected \"username:password\")", + bootstrap_user_env_key); + co_return errc::success; + } + + auto username = security::credential_user{creds_str.substr(0, colon)}; + auto password = creds_str.substr(colon + 1); + auto credentials = security::scram_sha256::make_credentials( + password, security::scram_sha256::min_iterations); + + auto err = co_await create_user( + username, credentials, model::timeout_clock::now() + 5s); + + if (err) { + vlog( + clusterlog.warn, + "Failed to apply {}: {}", + bootstrap_user_env_key, + err.message()); + } else { + vlog( + clusterlog.info, + "Created user '{}' via {}", + username, + bootstrap_user_env_key); + } + + co_return err; +} + } // namespace cluster diff --git a/src/v/cluster/security_frontend.h b/src/v/cluster/security_frontend.h index ae4924b39af9e..527143dcb4cce 100644 --- a/src/v/cluster/security_frontend.h +++ b/src/v/cluster/security_frontend.h @@ -54,6 +54,8 @@ class security_frontend final { std::vector, model::timeout_clock::duration); + ss::future maybe_create_bootstrap_user(); + private: ss::future> do_create_acls( std::vector, model::timeout_clock::duration); diff --git a/src/v/cluster/tests/idempotency_tests.cc b/src/v/cluster/tests/idempotency_tests.cc index 1d3187181691b..f79f59eaccd06 100644 --- a/src/v/cluster/tests/idempotency_tests.cc +++ b/src/v/cluster/tests/idempotency_tests.cc @@ -8,6 +8,7 @@ // by the Apache License, Version 2.0 #include "cluster/errc.h" +#include "cluster/feature_table.h" #include "cluster/rm_stm.h" #include "finjector/hbadger.h" #include "model/fundamental.h" @@ -36,7 +37,10 @@ FIXTURE_TEST( start_raft(); ss::sharded tx_gateway_frontend; - cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend); + ss::sharded feature_table; + feature_table.start().get0(); + cluster::rm_stm stm( + logger, _raft.get(), tx_gateway_frontend, feature_table); stm.testing_only_disable_auto_abort(); stm.start().get0(); @@ -81,6 +85,7 @@ FIXTURE_TEST( 
raft::replicate_options(raft::consistency_level::quorum_ack)) .get0(); BOOST_REQUIRE((bool)r2); + feature_table.stop().get0(); } FIXTURE_TEST( @@ -88,7 +93,10 @@ FIXTURE_TEST( start_raft(); ss::sharded tx_gateway_frontend; - cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend); + ss::sharded feature_table; + feature_table.start().get0(); + cluster::rm_stm stm( + logger, _raft.get(), tx_gateway_frontend, feature_table); stm.testing_only_disable_auto_abort(); stm.start().get0(); @@ -135,13 +143,17 @@ FIXTURE_TEST( BOOST_REQUIRE((bool)r2); BOOST_REQUIRE(r1.value().last_offset < r2.value().last_offset); + feature_table.stop().get0(); } FIXTURE_TEST(test_rm_stm_caches_last_5_offsets, mux_state_machine_fixture) { start_raft(); ss::sharded tx_gateway_frontend; - cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend); + ss::sharded feature_table; + feature_table.start().get0(); + cluster::rm_stm stm( + logger, _raft.get(), tx_gateway_frontend, feature_table); stm.testing_only_disable_auto_abort(); stm.start().get0(); @@ -150,7 +162,7 @@ FIXTURE_TEST(test_rm_stm_caches_last_5_offsets, mux_state_machine_fixture) { wait_for_confirmed_leader(); wait_for_meta_initialized(); - std::vector offsets; + std::vector offsets; auto count = 5; @@ -200,13 +212,17 @@ FIXTURE_TEST(test_rm_stm_caches_last_5_offsets, mux_state_machine_fixture) { BOOST_REQUIRE((bool)r1); BOOST_REQUIRE(r1.value().last_offset == offsets[i]); } + feature_table.stop().get0(); } FIXTURE_TEST(test_rm_stm_doesnt_cache_6th_offset, mux_state_machine_fixture) { start_raft(); ss::sharded tx_gateway_frontend; - cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend); + ss::sharded feature_table; + feature_table.start().get0(); + cluster::rm_stm stm( + logger, _raft.get(), tx_gateway_frontend, feature_table); stm.testing_only_disable_auto_abort(); stm.start().get0(); @@ -260,13 +276,17 @@ FIXTURE_TEST(test_rm_stm_doesnt_cache_6th_offset, mux_state_machine_fixture) { r1 == 
failure_type(cluster::errc::sequence_out_of_order)); } + feature_table.stop().get0(); } FIXTURE_TEST(test_rm_stm_prevents_gaps, mux_state_machine_fixture) { start_raft(); ss::sharded tx_gateway_frontend; - cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend); + ss::sharded feature_table; + feature_table.start().get0(); + cluster::rm_stm stm( + logger, _raft.get(), tx_gateway_frontend, feature_table); stm.testing_only_disable_auto_abort(); stm.start().get0(); @@ -312,6 +332,7 @@ FIXTURE_TEST(test_rm_stm_prevents_gaps, mux_state_machine_fixture) { .get0(); BOOST_REQUIRE( r2 == failure_type(cluster::errc::sequence_out_of_order)); + feature_table.stop().get0(); } FIXTURE_TEST( @@ -319,7 +340,10 @@ FIXTURE_TEST( start_raft(); ss::sharded tx_gateway_frontend; - cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend); + ss::sharded feature_table; + feature_table.start().get0(); + cluster::rm_stm stm( + logger, _raft.get(), tx_gateway_frontend, feature_table); stm.testing_only_disable_auto_abort(); stm.start().get0(); @@ -349,13 +373,17 @@ FIXTURE_TEST( .get0(); BOOST_REQUIRE( r == failure_type(cluster::errc::sequence_out_of_order)); + feature_table.stop().get0(); } FIXTURE_TEST(test_rm_stm_passes_immediate_retry, mux_state_machine_fixture) { start_raft(); ss::sharded tx_gateway_frontend; - cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend); + ss::sharded feature_table; + feature_table.start().get0(); + cluster::rm_stm stm( + logger, _raft.get(), tx_gateway_frontend, feature_table); stm.testing_only_disable_auto_abort(); stm.start().get0(); @@ -404,4 +432,5 @@ FIXTURE_TEST(test_rm_stm_passes_immediate_retry, mux_state_machine_fixture) { BOOST_REQUIRE((bool)r1); BOOST_REQUIRE((bool)r2); BOOST_REQUIRE(r1.value().last_offset == r2.value().last_offset); + feature_table.stop().get0(); } diff --git a/src/v/cluster/tests/partition_moving_test.cc b/src/v/cluster/tests/partition_moving_test.cc index a8b68af8fe2bc..b5d8fd9766c4a 100644 --- 
a/src/v/cluster/tests/partition_moving_test.cc +++ b/src/v/cluster/tests/partition_moving_test.cc @@ -318,7 +318,7 @@ class partition_assignment_test_fixture : public cluster_test_fixture { auto rdr = model::make_memory_record_batch_reader( std::move(batches)); // replicate - auto f = pm.get(ntp)->replicate( + auto f = pm.get(ntp)->raft()->replicate( std::move(rdr), raft::replicate_options(raft::consistency_level::quorum_ack)); diff --git a/src/v/cluster/tests/rebalancing_tests_fixture.h b/src/v/cluster/tests/rebalancing_tests_fixture.h index 107a25446b49f..36e18fecf5d5e 100644 --- a/src/v/cluster/tests/rebalancing_tests_fixture.h +++ b/src/v/cluster/tests/rebalancing_tests_fixture.h @@ -163,7 +163,7 @@ class rebalancing_tests_fixture : public cluster_test_fixture { auto rdr = model::make_memory_record_batch_reader( std::move(batches)); // replicate - auto f = pm.get(ntp)->replicate( + auto f = pm.get(ntp)->raft()->replicate( std::move(rdr), raft::replicate_options(raft::consistency_level::quorum_ack)); diff --git a/src/v/cluster/tests/rm_stm_tests.cc b/src/v/cluster/tests/rm_stm_tests.cc index c53da9af13c5c..f40bb497b9d94 100644 --- a/src/v/cluster/tests/rm_stm_tests.cc +++ b/src/v/cluster/tests/rm_stm_tests.cc @@ -8,6 +8,7 @@ // by the Apache License, Version 2.0 #include "cluster/errc.h" +#include "cluster/feature_table.h" #include "cluster/rm_stm.h" #include "finjector/hbadger.h" #include "model/fundamental.h" @@ -65,7 +66,10 @@ FIXTURE_TEST(test_tx_happy_tx, mux_state_machine_fixture) { start_raft(); ss::sharded tx_gateway_frontend; - cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend); + ss::sharded feature_table; + feature_table.start().get0(); + cluster::rm_stm stm( + logger, _raft.get(), tx_gateway_frontend, feature_table); stm.testing_only_disable_auto_abort(); stm.testing_only_enable_transactions(); @@ -129,6 +133,7 @@ FIXTURE_TEST(test_tx_happy_tx, mux_state_machine_fixture) { BOOST_REQUIRE_EQUAL(aborted_txs.size(), 0); 
BOOST_REQUIRE_LT(tx_offset, stm.last_stable_offset()); + feature_table.stop().get0(); } // tests: @@ -138,7 +143,10 @@ FIXTURE_TEST(test_tx_aborted_tx_1, mux_state_machine_fixture) { start_raft(); ss::sharded tx_gateway_frontend; - cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend); + ss::sharded feature_table; + feature_table.start().get0(); + cluster::rm_stm stm( + logger, _raft.get(), tx_gateway_frontend, feature_table); stm.testing_only_disable_auto_abort(); stm.testing_only_enable_transactions(); @@ -204,6 +212,7 @@ FIXTURE_TEST(test_tx_aborted_tx_1, mux_state_machine_fixture) { })); BOOST_REQUIRE_LT(tx_offset, stm.last_stable_offset()); + feature_table.stop().get0(); } // tests: @@ -213,7 +222,10 @@ FIXTURE_TEST(test_tx_aborted_tx_2, mux_state_machine_fixture) { start_raft(); ss::sharded tx_gateway_frontend; - cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend); + ss::sharded feature_table; + feature_table.start().get0(); + cluster::rm_stm stm( + logger, _raft.get(), tx_gateway_frontend, feature_table); stm.testing_only_disable_auto_abort(); stm.testing_only_enable_transactions(); @@ -285,6 +297,7 @@ FIXTURE_TEST(test_tx_aborted_tx_2, mux_state_machine_fixture) { })); BOOST_REQUIRE_LT(tx_offset, stm.last_stable_offset()); + feature_table.stop().get0(); } // transactional writes of an unknown tx are rejected @@ -292,7 +305,10 @@ FIXTURE_TEST(test_tx_unknown_produce, mux_state_machine_fixture) { start_raft(); ss::sharded tx_gateway_frontend; - cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend); + ss::sharded feature_table; + feature_table.start().get0(); + cluster::rm_stm stm( + logger, _raft.get(), tx_gateway_frontend, feature_table); stm.testing_only_disable_auto_abort(); stm.testing_only_enable_transactions(); @@ -322,6 +338,7 @@ FIXTURE_TEST(test_tx_unknown_produce, mux_state_machine_fixture) { raft::replicate_options(raft::consistency_level::quorum_ack)) .get0(); BOOST_REQUIRE(offset_r == invalid_producer_epoch); + 
feature_table.stop().get0(); } // begin fences off old transactions @@ -329,7 +346,10 @@ FIXTURE_TEST(test_tx_begin_fences_produce, mux_state_machine_fixture) { start_raft(); ss::sharded tx_gateway_frontend; - cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend); + ss::sharded feature_table; + feature_table.start().get0(); + cluster::rm_stm stm( + logger, _raft.get(), tx_gateway_frontend, feature_table); stm.testing_only_disable_auto_abort(); stm.testing_only_enable_transactions(); @@ -379,6 +399,7 @@ FIXTURE_TEST(test_tx_begin_fences_produce, mux_state_machine_fixture) { raft::replicate_options(raft::consistency_level::quorum_ack)) .get0(); BOOST_REQUIRE(!(bool)offset_r); + feature_table.stop().get0(); } // transactional writes of an aborted tx are rejected @@ -386,7 +407,10 @@ FIXTURE_TEST(test_tx_post_aborted_produce, mux_state_machine_fixture) { start_raft(); ss::sharded tx_gateway_frontend; - cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend); + ss::sharded feature_table; + feature_table.start().get0(); + cluster::rm_stm stm( + logger, _raft.get(), tx_gateway_frontend, feature_table); stm.testing_only_disable_auto_abort(); stm.testing_only_enable_transactions(); @@ -438,4 +462,5 @@ FIXTURE_TEST(test_tx_post_aborted_produce, mux_state_machine_fixture) { raft::replicate_options(raft::consistency_level::quorum_ack)) .get0(); BOOST_REQUIRE(offset_r == invalid_producer_epoch); + feature_table.stop().get0(); } diff --git a/src/v/cluster/tests/serialization_rt_test.cc b/src/v/cluster/tests/serialization_rt_test.cc index 99d34637f639e..a5f51bbdc05f5 100644 --- a/src/v/cluster/tests/serialization_rt_test.cc +++ b/src/v/cluster/tests/serialization_rt_test.cc @@ -15,11 +15,13 @@ #include "model/compression.h" #include "model/fundamental.h" #include "model/metadata.h" +#include "model/tests/random_batch.h" #include "model/tests/randoms.h" #include "model/timestamp.h" #include "raft/types.h" #include "random/generators.h" #include "reflection/adl.h" 
+#include "reflection/async_adl.h" #include "storage/types.h" #include "test_utils/randoms.h" #include "test_utils/rpc.h" @@ -2088,6 +2090,210 @@ SEASTAR_THREAD_TEST_CASE(serde_reflection_roundtrip) { }; roundtrip_test(data); } + { + raft::heartbeat_request data; + + // heartbeat request uses the first node/target_node for all of the + // heartbeat meatdata entries. so here we arrange for that to be true in + // the input data so that equality works as expected. + const auto node_id = tests::random_named_int(); + const auto target_node_id = tests::random_named_int(); + + for (auto i = 0, mi = random_generators::get_int(1, 20); i < mi; ++i) { + raft::protocol_metadata meta{ + .group = tests::random_named_int(), + .commit_index = tests::random_named_int(), + .term = tests::random_named_int(), + .prev_log_index = tests::random_named_int(), + .prev_log_term = tests::random_named_int(), + .last_visible_index = tests::random_named_int(), + }; + raft::heartbeat_metadata hm{ + .meta = meta, + .node_id = raft:: + vnode{node_id, tests::random_named_int()}, + .target_node_id = raft:: + vnode{target_node_id, tests::random_named_int()}, + }; + data.heartbeats.push_back(hm); + } + + // encoder will sort automatically. so for equality to work as expected + // we use the same sorting for the input as the expected output. + struct sorter_fn { + constexpr bool operator()( + const raft::heartbeat_metadata& lhs, + const raft::heartbeat_metadata& rhs) const { + return lhs.meta.commit_index < rhs.meta.commit_index; + } + }; + + std::sort(data.heartbeats.begin(), data.heartbeats.end(), sorter_fn{}); + + // serde round trip test async version + { + auto serde_in = data; + iobuf serde_out; + serde::write_async(serde_out, std::move(serde_in)).get(); + auto from_serde = serde::from_iobuf( + std::move(serde_out)); + BOOST_REQUIRE(data == from_serde); + } + + // the adl test needs to force async to avoid the automatic reflection + // version of the encoder. 
+ { + auto adl_in = data; + iobuf adl_out; + reflection::async_adl{} + .to(adl_out, std::move(adl_in)) + .get(); + iobuf_parser in(std::move(adl_out)); + auto from_adl = reflection::async_adl{} + .from(in) + .get0(); + + BOOST_REQUIRE(data == from_adl); + } + } + { + raft::heartbeat_reply data; + + // heartbeat reply uses the first node/target_node for all of the + // reply meatdata entries. so here we arrange for that to be true in + // the input data so that equality works as expected. + const auto node_id = tests::random_named_int(); + const auto target_node_id = tests::random_named_int(); + + for (auto i = 0, mi = random_generators::get_int(1, 20); i < mi; ++i) { + raft::append_entries_reply reply{ + .target_node_id = raft:: + vnode{target_node_id, tests::random_named_int()}, + .node_id = raft:: + vnode{node_id, tests::random_named_int()}, + .group = tests::random_named_int(), + .term = tests::random_named_int(), + .last_flushed_log_index + = tests::random_named_int(), + .last_dirty_log_index = tests::random_named_int(), + .last_term_base_offset = tests::random_named_int(), + .result = raft::append_entries_reply::status::group_unavailable, + }; + data.meta.push_back(reply); + } + + // encoder will sort automatically. so for equality to work as expected + // we use the same sorting for the input as the expected output. + struct sorter_fn { + constexpr bool operator()( + const raft::append_entries_reply& lhs, + const raft::append_entries_reply& rhs) const { + return lhs.last_flushed_log_index < rhs.last_flushed_log_index; + } + }; + + std::sort(data.meta.begin(), data.meta.end(), sorter_fn{}); + + serde_roundtrip_test(data); + + // the adl test needs to force async to avoid the automatic reflection + // version of the encoder. 
+ { + auto adl_in = data; + iobuf adl_out; + reflection::async_adl{} + .to(adl_out, std::move(adl_in)) + .get(); + iobuf_parser in(std::move(adl_out)); + auto from_adl + = reflection::async_adl{}.from(in).get0(); + + BOOST_REQUIRE(data == from_adl); + } + } + { + raft::protocol_metadata data{ + .group = tests::random_named_int(), + .commit_index = tests::random_named_int(), + .term = tests::random_named_int(), + .prev_log_index = tests::random_named_int(), + .prev_log_term = tests::random_named_int(), + .last_visible_index = tests::random_named_int(), + }; + roundtrip_test(data); + } + { + const auto gold = model::test::make_random_batches( + model::offset(0), 20); + + // make a copy of the source batches for later comparison because the + // copy moved into the request will get eaten. + ss::circular_buffer batches_in; + for (const auto& batch : gold) { + batches_in.push_back(batch.copy()); + } + + raft::protocol_metadata pmd{ + .group = tests::random_named_int(), + .commit_index = tests::random_named_int(), + .term = tests::random_named_int(), + .prev_log_index = tests::random_named_int(), + .prev_log_term = tests::random_named_int(), + .last_visible_index = tests::random_named_int(), + }; + + raft::append_entries_request data{ + raft::vnode{ + tests::random_named_int(), + tests::random_named_int()}, + raft::vnode{ + tests::random_named_int(), + tests::random_named_int()}, + pmd, + model::make_memory_record_batch_reader(std::move(batches_in)), + raft::append_entries_request::flush_after_append( + tests::random_bool()), + }; + + // append_entries_request -> iobuf + iobuf serde_out; + serde::write_async(serde_out, std::move(data)).get(); + + // iobuf -> append_entries_request + iobuf_parser serde_in(std::move(serde_out)); + auto from_serde + = serde::read_async(serde_in).get0(); + + BOOST_REQUIRE(from_serde.node_id == data.node_id); + BOOST_REQUIRE(from_serde.target_node_id == data.target_node_id); + BOOST_REQUIRE(from_serde.meta == data.meta); + 
BOOST_REQUIRE(from_serde.flush == data.flush); + + auto batches_from_serde = model::consume_reader_to_memory( + std::move(from_serde.batches()), + model::no_timeout) + .get0(); + BOOST_REQUIRE(gold.size() > 0); + BOOST_REQUIRE(batches_from_serde.size() == gold.size()); + for (size_t i = 0; i < gold.size(); i++) { + BOOST_REQUIRE(batches_from_serde[i] == gold[i]); + } + } + { + raft::append_entries_reply data{ + .target_node_id = raft:: + vnode{tests::random_named_int(), tests::random_named_int()}, + .node_id = raft:: + vnode{tests::random_named_int(), tests::random_named_int()}, + .group = tests::random_named_int(), + .term = tests::random_named_int(), + .last_flushed_log_index = tests::random_named_int(), + .last_dirty_log_index = tests::random_named_int(), + .last_term_base_offset = tests::random_named_int(), + .result = raft::append_entries_reply::status::group_unavailable, + }; + roundtrip_test(data); + } } SEASTAR_THREAD_TEST_CASE(cluster_property_kv_exchangable_with_pair) { diff --git a/src/v/cluster/topic_table.cc b/src/v/cluster/topic_table.cc index 9a2f1558bfedc..9ad44ca5efceb 100644 --- a/src/v/cluster/topic_table.cc +++ b/src/v/cluster/topic_table.cc @@ -25,16 +25,16 @@ namespace cluster { template -std::vector> +std::vector> topic_table::transform_topics(Func&& f) const { - std::vector> ret; + std::vector> ret; ret.reserve(_topics.size()); std::transform( std::cbegin(_topics), std::cend(_topics), std::back_inserter(ret), [f = std::forward(f)]( - const std::pair& p) { + const std::pair& p) { return f(p.second); }); return ret; @@ -47,22 +47,30 @@ topic_table::apply(create_topic_cmd cmd, model::offset offset) { return ss::make_ready_future( errc::topic_already_exists); } - // calculate delta - for (auto& pas : cmd.value.assignments) { - auto ntp = model::ntp(cmd.key.ns, cmd.key.tp, pas.id); - _pending_deltas.emplace_back( - std::move(ntp), pas, offset, delta::op_type::add); - } std::optional remote_revision = 
cmd.value.cfg.properties.remote_topic_properties ? std::make_optional( cmd.value.cfg.properties.remote_topic_properties->remote_revision) : std::nullopt; + auto md = topic_metadata_item{ + .metadata = topic_metadata( + std::move(cmd.value), model::revision_id(offset()), remote_revision)}; + // calculate delta + md.replica_revisions.reserve(cmd.value.assignments.size()); + for (auto& pas : md.get_assignments()) { + auto ntp = model::ntp(cmd.key.ns, cmd.key.tp, pas.id); + for (auto& r : pas.replicas) { + md.replica_revisions[pas.id][r.node_id] = model::revision_id( + offset); + } + _pending_deltas.emplace_back( + std::move(ntp), pas, offset, delta::op_type::add); + } - _topics.insert( - {cmd.key, - topic_metadata( - std::move(cmd.value), model::revision_id(offset()), remote_revision)}); + _topics.insert({ + cmd.key, + std::move(md), + }); notify_waiters(); return ss::make_ready_future(errc::success); } @@ -132,6 +140,10 @@ topic_table::apply(create_partition_cmd cmd, model::offset offset) { tp->second.get_assignments().emplace(p_as); // propagate deltas auto ntp = model::ntp(cmd.key.ns, cmd.key.tp, p_as.id); + for (auto& bs : p_as.replicas) { + tp->second.replica_revisions[p_as.id][bs.node_id] + = model::revision_id(offset); + } _pending_deltas.emplace_back( std::move(ntp), std::move(p_as), offset, delta::op_type::add); } @@ -169,16 +181,41 @@ topic_table::apply(move_partition_replicas_cmd cmd, model::offset o) { if (are_replica_sets_equal(current_assignment_it->replicas, cmd.value)) { return ss::make_ready_future(errc::success); } + auto revisions_it = tp->second.replica_revisions.find(cmd.key.tp.partition); + vassert( + revisions_it != tp->second.replica_revisions.end(), + "partition {}, replica revisions map must exists as partition is present", + cmd.key); _updates_in_progress.emplace( cmd.key, in_progress_update{ .previous_replicas = current_assignment_it->replicas, .state = in_progress_state::update_requested, + .update_revision = model::revision_id(o), + // 
snapshot replicas revisions + .replicas_revisions = revisions_it->second, }); auto previous_assignment = *current_assignment_it; // replace partition replica set current_assignment_it->replicas = cmd.value; + /** + * Update partition replica revisions. Assign new revision to added replicas + * and erase replicas which are removed from replica set + */ + auto added_replicas = subtract_replica_sets( + current_assignment_it->replicas, previous_assignment.replicas); + + for (auto& r : added_replicas) { + revisions_it->second[r.node_id] = model::revision_id(o); + } + + auto removed_replicas = subtract_replica_sets( + previous_assignment.replicas, current_assignment_it->replicas); + + for (auto& removed : removed_replicas) { + revisions_it->second.erase(removed.node_id); + } /// Update all non_replicable topics to have the same 'in-progress' state auto found = _topics_hierarchy.find(model::topic_namespace_view(cmd.key)); @@ -190,6 +227,7 @@ topic_table::apply(move_partition_replicas_cmd cmd, model::offset o) { in_progress_update{ .previous_replicas = current_assignment_it->replicas, .state = in_progress_state::update_requested, + .update_revision = model::revision_id(o), }); vassert( success, @@ -350,6 +388,13 @@ topic_table::apply(cancel_moving_partition_replicas_cmd cmd, model::offset o) { auto replicas = current_assignment_it->replicas; // replace replica set with set from in progress operation current_assignment_it->replicas = in_progress_it->second.previous_replicas; + auto revisions_it = tp->second.replica_revisions.find(cmd.key.tp.partition); + vassert( + revisions_it != tp->second.replica_revisions.end(), + "partition {} replica revisions map must exists", + cmd.key); + + revisions_it->second = in_progress_it->second.replicas_revisions; /// Update all non_replicable topics to have the same 'in-progress' state auto found = _topics_hierarchy.find(model::topic_namespace_view(cmd.key)); @@ -553,10 +598,14 @@ topic_table::apply(create_non_replicable_topic_cmd cmd, 
model::offset o) { success, "Duplicate non_replicable_topic detected when it shouldn't exist"); } + auto md = topic_metadata( + std::move(cfg), std::move(p_as), model::revision_id(o()), source.tp); + _topics.insert( {new_non_rep_topic, - topic_metadata( - std::move(cfg), std::move(p_as), model::revision_id(o()), source.tp)}); + topic_metadata_item{ + .metadata = std::move(md), + }}); notify_waiters(); co_return make_error_code(errc::success); } @@ -624,8 +673,9 @@ topic_table::wait_for_changes(ss::abort_source& as) { } std::vector topic_table::all_topics() const { - return transform_topics( - [](const topic_metadata& tp) { return tp.get_configuration().tp_ns; }); + return transform_topics([](const topic_metadata_item& tp) { + return tp.get_configuration().tp_ns; + }); } size_t topic_table::all_topics_count() const { return _topics.size(); } @@ -633,14 +683,14 @@ size_t topic_table::all_topics_count() const { return _topics.size(); } std::optional topic_table::get_topic_metadata(model::topic_namespace_view tp) const { if (auto it = _topics.find(tp); it != _topics.end()) { - return it->second; + return it->second.metadata; } return {}; } std::optional> topic_table::get_topic_metadata_ref(model::topic_namespace_view tp) const { if (auto it = _topics.find(tp); it != _topics.end()) { - return it->second; + return it->second.metadata; } return {}; } @@ -724,6 +774,58 @@ topic_table::get_previous_replica_set(const model::ntp& ntp) const { return std::nullopt; } +std::vector +topic_table::ntps_moving_to_node(model::node_id node) const { + std::vector ret; + + for (const auto& [ntp, state] : _updates_in_progress) { + if (contains_node(state.previous_replicas, node)) { + continue; + } + + auto current_assignment = get_partition_assignment(ntp); + if (unlikely(!current_assignment)) { + continue; + } + + if (contains_node(current_assignment->replicas, node)) { + ret.push_back(ntp); + } + } + return ret; +} + +std::vector +topic_table::ntps_moving_from_node(model::node_id node) 
const { + std::vector ret; + + for (const auto& [ntp, state] : _updates_in_progress) { + if (!contains_node(state.previous_replicas, node)) { + continue; + } + + auto current_assignment = get_partition_assignment(ntp); + if (unlikely(!current_assignment)) { + continue; + } + + if (!contains_node(current_assignment->replicas, node)) { + ret.push_back(ntp); + } + } + return ret; +} + +std::vector topic_table::all_updates_in_progress() const { + std::vector ret; + ret.reserve(_updates_in_progress.size()); + for (const auto& [ntp, _] : _updates_in_progress) { + ret.push_back(ntp); + } + + return ret; +} + std::ostream& operator<<(std::ostream& o, topic_table::in_progress_state update) { switch (update) { @@ -736,4 +838,5 @@ operator<<(std::ostream& o, topic_table::in_progress_state update) { } __builtin_unreachable(); } + } // namespace cluster diff --git a/src/v/cluster/topic_table.h b/src/v/cluster/topic_table.h index 1ea2ac237c786..0e6f7dd9c8692 100644 --- a/src/v/cluster/topic_table.h +++ b/src/v/cluster/topic_table.h @@ -41,17 +41,60 @@ class topic_table { cancel_requested, force_cancel_requested }; + /** + * Replicas revision map is used to track revision of brokers in a replica + * set. 
When a node is added into replica set it gets the revision assigned + */ + using replicas_revision_map + = absl::flat_hash_map; struct in_progress_update { std::vector previous_replicas; in_progress_state state; model::revision_id update_revision; + replicas_revision_map replicas_revisions; + }; + + struct topic_metadata_item { + topic_metadata metadata; + // replicas revisions for each partition + absl::node_hash_map + replica_revisions; + + bool is_topic_replicable() const { + return metadata.is_topic_replicable(); + } + + assignments_set& get_assignments() { + return metadata.get_assignments(); + } + + const assignments_set& get_assignments() const { + return metadata.get_assignments(); + } + model::revision_id get_revision() const { + return metadata.get_revision(); + } + std::optional get_remote_revision() const { + return metadata.get_remote_revision(); + } + const model::topic& get_source_topic() const { + return metadata.get_source_topic(); + } + + const topic_configuration& get_configuration() const { + return metadata.get_configuration(); + } + topic_configuration& get_configuration() { + return metadata.get_configuration(); + } }; + using delta = topic_table_delta; - using underlying_t = absl::flat_hash_map< + using underlying_t = absl::node_hash_map< model::topic_namespace, - topic_metadata, + topic_metadata_item, model::topic_namespace_hash, model::topic_namespace_eq>; using hierarchy_t = absl::node_hash_map< @@ -208,6 +251,23 @@ class topic_table { std::optional> get_previous_replica_set(const model::ntp&) const; + const absl::node_hash_map& + in_progress_updates() const { + return _updates_in_progress; + } + + /** + * Lists all NTPs whose replicas are being moved to a node + */ + std::vector ntps_moving_to_node(model::node_id) const; + + /** + * Lists all NTPs whose replicas are being moved from a node + */ + std::vector ntps_moving_from_node(model::node_id) const; + + std::vector all_updates_in_progress() const; + private: struct waiter { explicit
waiter(uint64_t id) @@ -221,7 +281,7 @@ class topic_table { void notify_waiters(); template - std::vector> + std::vector> transform_topics(Func&&) const; underlying_t _topics; diff --git a/src/v/cluster/topic_updates_dispatcher.cc b/src/v/cluster/topic_updates_dispatcher.cc index 93f752f969283..2ee7fa3754bbf 100644 --- a/src/v/cluster/topic_updates_dispatcher.cc +++ b/src/v/cluster/topic_updates_dispatcher.cc @@ -12,9 +12,13 @@ #include "cluster/cluster_utils.h" #include "cluster/commands.h" #include "cluster/partition_leaders_table.h" +#include "cluster/topic_table.h" +#include "model/fundamental.h" #include "model/metadata.h" #include "raft/types.h" +#include + #include #include #include @@ -37,19 +41,29 @@ topic_updates_dispatcher::apply_update(model::record_batch b) { return ss::visit( std::move(cmd), [this, base_offset](delete_topic_cmd del_cmd) { - // delete case - we need state copy to - auto tp_md = _topic_table.local().get_topic_metadata( - del_cmd.value); + auto tp_ns = del_cmd.key; + auto topic_assignments + = _topic_table.local().get_topic_assignments(del_cmd.value); + in_progress_map in_progress; + + if (topic_assignments) { + in_progress = collect_in_progress( + del_cmd.key, *topic_assignments); + } return dispatch_updates_to_cores(del_cmd, base_offset) - .then([this, tp_md = std::move(tp_md)](std::error_code ec) { - if (ec == errc::success) { - vassert( - tp_md.has_value(), - "Topic had to exist before successful delete"); - deallocate_topic(*tp_md); - } - return ec; - }); + .then( + [this, + topic_assignments = std::move(topic_assignments), + in_progress = std::move(in_progress)](std::error_code ec) { + if (ec == errc::success) { + vassert( + topic_assignments.has_value(), + "Topic had to exist before successful delete"); + deallocate_topic(*topic_assignments, in_progress); + } + + return ec; + }); }, [this, base_offset](create_topic_cmd create_cmd) { return dispatch_updates_to_cores(create_cmd, base_offset) @@ -187,6 +201,22 @@ 
topic_updates_dispatcher::apply_update(model::record_batch b) { }); }); } +topic_updates_dispatcher::in_progress_map +topic_updates_dispatcher::collect_in_progress( + const model::topic_namespace& tp_ns, + const assignments_set& current_assignments) { + in_progress_map in_progress; + in_progress.reserve(current_assignments.size()); + // collect in progress assignments + for (auto& p : current_assignments) { + auto previous = _topic_table.local().get_previous_replica_set( + model::ntp(tp_ns.ns, tp_ns.tp, p.id)); + if (previous) { + in_progress.emplace(p.id, std::move(previous.value())); + } + } + return in_progress; +} ss::future<> topic_updates_dispatcher::update_leaders_with_estimates( std::vector leaders) { @@ -250,10 +280,19 @@ topic_updates_dispatcher::dispatch_updates_to_cores(Cmd cmd, model::offset o) { }); } -void topic_updates_dispatcher::deallocate_topic(const topic_metadata& tp_md) { - // we have to deallocate topics - for (auto& p : tp_md.get_assignments()) { - _partition_allocator.local().deallocate(p.replicas); +void topic_updates_dispatcher::deallocate_topic( + const assignments_set& topic_assignments, + const in_progress_map& in_progress) { + for (auto& p_as : topic_assignments) { + _partition_allocator.local().deallocate(p_as.replicas); + auto it = in_progress.find(p_as.id); + + // we must remove the allocation that would normally + // be removed with update_finished request + if (it != in_progress.end()) { + auto to_delete = subtract_replica_sets(it->second, p_as.replicas); + _partition_allocator.local().remove_allocations(to_delete); + } } } diff --git a/src/v/cluster/topic_updates_dispatcher.h b/src/v/cluster/topic_updates_dispatcher.h index e383e7c59bd40..0c5f3a7cc44fa 100644 --- a/src/v/cluster/topic_updates_dispatcher.h +++ b/src/v/cluster/topic_updates_dispatcher.h @@ -13,6 +13,8 @@ #include "cluster/commands.h" #include "cluster/scheduling/partition_allocator.h" #include "cluster/topic_table.h" +#include "cluster/types.h" +#include 
"model/fundamental.h" #include "model/record.h" #include @@ -71,6 +73,8 @@ class topic_updates_dispatcher { } private: + using in_progress_map = absl:: + node_hash_map>; template ss::future dispatch_updates_to_cores(Cmd, model::offset); @@ -78,7 +82,11 @@ class topic_updates_dispatcher { ss::future<> update_leaders_with_estimates(std::vector leaders); void update_allocations(std::vector); - void deallocate_topic(const topic_metadata&); + + void deallocate_topic(const assignments_set&, const in_progress_map&); + + in_progress_map + collect_in_progress(const model::topic_namespace&, const assignments_set&); ss::sharded& _partition_allocator; ss::sharded& _topic_table; diff --git a/src/v/cluster/types.cc b/src/v/cluster/types.cc index 9769e8e07a190..70888c5609c88 100644 --- a/src/v/cluster/types.cc +++ b/src/v/cluster/types.cc @@ -32,6 +32,16 @@ namespace cluster { +kafka_stages::kafka_stages( + ss::future<> enq, ss::future> offset_future) + : request_enqueued(std::move(enq)) + , replicate_finished(std::move(offset_future)) {} + +kafka_stages::kafka_stages(raft::errc ec) + : request_enqueued(ss::now()) + , replicate_finished( + ss::make_ready_future>(make_error_code(ec))){}; + bool topic_properties::is_compacted() const { if (!cleanup_policy_bitflags) { return false; @@ -45,7 +55,8 @@ bool topic_properties::has_overrides() const { return cleanup_policy_bitflags || compaction_strategy || segment_size || retention_bytes.has_value() || retention_bytes.is_disabled() || retention_duration.has_value() || retention_duration.is_disabled() - || recovery.has_value() || shadow_indexing.has_value(); + || recovery.has_value() || shadow_indexing.has_value() + || read_replica.has_value(); } storage::ntp_config::default_overrides @@ -59,6 +70,7 @@ topic_properties::get_ntp_cfg_overrides() const { ret.shadow_indexing_mode = shadow_indexing ? 
*shadow_indexing : model::shadow_indexing_mode::disabled; + ret.read_replica = read_replica; return ret; } @@ -91,7 +103,8 @@ storage::ntp_config topic_configuration::make_ntp_config( properties.recovery ? *properties.recovery : false), .shadow_indexing_mode = properties.shadow_indexing ? *properties.shadow_indexing - : model::shadow_indexing_mode::disabled}); + : model::shadow_indexing_mode::disabled, + .read_replica = properties.read_replica}); } return { model::ntp(tp_ns.ns, tp_ns.tp, p_id), diff --git a/src/v/cluster/types.h b/src/v/cluster/types.h index a5abd73d555fd..b101b19a88bff 100644 --- a/src/v/cluster/types.h +++ b/src/v/cluster/types.h @@ -175,6 +175,20 @@ inline std::error_code make_error_code(tx_errc e) noexcept { return std::error_code(static_cast(e), tx_error_category()); } +struct kafka_result { + kafka::offset last_offset; +}; +struct kafka_stages { + kafka_stages(ss::future<>, ss::future>); + explicit kafka_stages(raft::errc); + // after this future is ready, request is enqueued in raft and it will not + // be reordered + ss::future<> request_enqueued; + // after this future is ready, request was successfully replicated with + // requested consistency level + ss::future> replicate_finished; +}; + struct try_abort_request : serde::envelope> { model::partition_id tm; diff --git a/src/v/coding-style.md b/src/v/coding-style.md index 3572cc04d43dc..87d766a02e38d 100644 --- a/src/v/coding-style.md +++ b/src/v/coding-style.md @@ -110,7 +110,7 @@ void a_function() { An exception is namespaces -- the body is _not_ indented, to prevent files that are almost 100% whitespace left margin.
-When making a change, if you need to insert an indentation level, you can temporarily break the rules by insering a half-indent, so that the patch is easily reviewable: +When making a change, if you need to insert an indentation level, you can temporarily break the rules by inserting a half-indent, so that the patch is easily reviewable: ```c++ void a_function() { diff --git a/src/v/config/CMakeLists.txt b/src/v/config/CMakeLists.txt index 350ee35e36f9c..7579ca7d37ff8 100644 --- a/src/v/config/CMakeLists.txt +++ b/src/v/config/CMakeLists.txt @@ -1,6 +1,7 @@ v_cc_library( NAME config SRCS + broker_authn_endpoint.cc configuration.cc node_config.cc base_property.cc diff --git a/src/v/config/broker_authn_endpoint.cc b/src/v/config/broker_authn_endpoint.cc new file mode 100644 index 0000000000000..d3bcb055c0c8d --- /dev/null +++ b/src/v/config/broker_authn_endpoint.cc @@ -0,0 +1,104 @@ +// Copyright 2022 Redpanda Data, Inc. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.md +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0 + +#include "config/broker_authn_endpoint.h" + +#include "kafka/client/exceptions.h" +#include "model/metadata.h" +#include "utils/string_switch.h" + +namespace config { + +std::string_view to_string_view(broker_authn_method m) { + switch (m) { + case broker_authn_method::none: + return "none"; + case broker_authn_method::sasl: + return "sasl"; + case broker_authn_method::mtls_identity: + return "mtls_identity"; + } +} + +template<> +std::optional +from_string_view(std::string_view sv) { + return string_switch(sv) + .match("none", broker_authn_method::none) + .match("sasl", broker_authn_method::sasl) + .match("mtls_identity", broker_authn_method::mtls_identity) + .default_match(broker_authn_method::none); +} + +std::ostream& operator<<(std::ostream& os, const 
broker_authn_endpoint& ep) { + fmt::print(os, "{{{}:{}:{}}}", ep.name, ep.address, ep.authn_method); + return os; +} + +} // namespace config + +namespace YAML { + +Node convert::encode(const type& rhs) { + Node node; + node["name"] = rhs.name; + node["address"] = rhs.address.host(); + node["port"] = rhs.address.port(); + if (rhs.authn_method) { + node["authentication_method"] = ss::sstring( + to_string_view(*rhs.authn_method)); + } + return node; +} + +bool convert::decode( + const Node& node, type& rhs) { + for (auto s : {"address", "port"}) { + if (!node[s]) { + return false; + } + } + ss::sstring name; + if (node["name"]) { + name = node["name"].as(); + } + auto address = node["address"].as(); + auto port = node["port"].as(); + auto addr = net::unresolved_address(std::move(address), port); + std::optional method{}; + if (auto n = node["authentication_method"]; bool(n)) { + method = config::from_string_view( + n.as()); + } + rhs = config::broker_authn_endpoint{ + .name = std::move(name), + .address = std::move(addr), + .authn_method = method}; + return true; +} + +} // namespace YAML + +void json::rjson_serialize( + json::Writer& w, + const config::broker_authn_endpoint& ep) { + w.StartObject(); + w.Key("name"); + w.String(ep.name); + w.Key("address"); + w.String(ep.address.host()); + w.Key("port"); + w.Uint(ep.address.port()); + if (ep.authn_method) { + w.Key("authentication_method"); + auto method = to_string_view(*ep.authn_method); + w.String(method.begin(), method.length()); + } + w.EndObject(); +} diff --git a/src/v/config/broker_authn_endpoint.h b/src/v/config/broker_authn_endpoint.h new file mode 100644 index 0000000000000..ac1ef2953b5b3 --- /dev/null +++ b/src/v/config/broker_authn_endpoint.h @@ -0,0 +1,85 @@ +// Copyright 2021 Redpanda Data, Inc. 
+// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.md +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0 + +#pragma once + +#include "config/convert.h" +#include "config/property.h" +#include "json/_include_first.h" +#include "json/stringbuffer.h" +#include "json/writer.h" +#include "net/unresolved_address.h" + +#include + +#include + +#include +#include +#include + +namespace config { + +template +std::enable_if_t, std::optional> + from_string_view(std::string_view); + +enum class broker_authn_method { + none = 0, + sasl, + mtls_identity, +}; + +std::string_view to_string_view(broker_authn_method m); + +template<> +std::optional +from_string_view(std::string_view sv); + +struct broker_authn_endpoint { + ss::sstring name; + net::unresolved_address address; + std::optional authn_method; + + friend bool + operator==(const broker_authn_endpoint&, const broker_authn_endpoint&) + = default; + + friend std::ostream& + operator<<(std::ostream& os, const broker_authn_endpoint& ep); +}; + +namespace detail { + +template<> +consteval std::string_view property_type_name() { + return "config::broker_auth_endpoint"; +} + +} // namespace detail + +} // namespace config + +namespace YAML { + +template<> +struct convert { + using type = config::broker_authn_endpoint; + static Node encode(const type& rhs); + static bool decode(const Node& node, type& rhs); +}; + +} // namespace YAML + +namespace json { + +void rjson_serialize( + json::Writer& w, const config::broker_authn_endpoint& ep); + +} diff --git a/src/v/config/configuration.cc b/src/v/config/configuration.cc index bce4f0f6891bc..297e6abac797d 100644 --- a/src/v/config/configuration.cc +++ b/src/v/config/configuration.cc @@ -13,11 +13,13 @@ #include "config/node_config.h" #include "config/validators.h" #include "model/metadata.h" 
+#include "security/mtls.h" #include "storage/chunk_cache.h" #include "storage/segment_appender.h" #include "units.h" #include +#include namespace config { using namespace std::chrono_literals; @@ -474,7 +476,7 @@ configuration::configuration() "group_topic_partitions", "Number of partitions in the internal group membership topic", {.needs_restart = needs_restart::no, .visibility = visibility::tunable}, - 1) + 16) , default_topic_replication( *this, "default_topic_replications", @@ -780,9 +782,27 @@ configuration::configuration() , enable_sasl( *this, "enable_sasl", - "Enable SASL authentication for Kafka connections.", + "Enable SASL authentication for Kafka connections, authorization is " + "required. see also `kafka_enable_authorization`", {.needs_restart = needs_restart::no, .visibility = visibility::user}, false) + , kafka_enable_authorization( + *this, + "kafka_enable_authorization", + "Enable authorization for Kafka connections. Values:" + "- `nil`: Ignored. Authorization is enabled with `enable_sasl: true`" + "; `true`: authorization is required" + "; `false`: authorization is disabled" + ". 
See also: `enable_sasl` and `kafka_api[].authentication_method`", + {.needs_restart = needs_restart::no, .visibility = visibility::user}, + std::nullopt) + , kafka_mtls_principal_mapping_rules( + *this, + "kafka_mtls_principal_mapping_rules", + "Principal Mapping Rules for mTLS Authentication on the Kafka API", + {.needs_restart = needs_restart::no, .visibility = visibility::user}, + std::nullopt, + security::tls::validate_rules) , controller_backend_housekeeping_interval_ms( *this, "controller_backend_housekeeping_interval_ms", @@ -1029,6 +1049,13 @@ configuration::configuration() "remote storage (sec)", {.visibility = visibility::tunable}, std::nullopt) + , cloud_storage_readreplica_manifest_sync_timeout_ms( + *this, + "cloud_storage_readreplica_manifest_sync_timeout_ms", + "Timeout to check if new data is available for partition in S3 for read " + "replica", + {.needs_restart = needs_restart::no, .visibility = visibility::tunable}, + 30s) , cloud_storage_upload_ctrl_update_interval_ms( *this, "cloud_storage_upload_ctrl_update_interval_ms", diff --git a/src/v/config/configuration.h b/src/v/config/configuration.h index 8cb40c1c28d2c..c5bc0548aa001 100644 --- a/src/v/config/configuration.h +++ b/src/v/config/configuration.h @@ -171,6 +171,9 @@ struct configuration final : public config_store { property id_allocator_log_capacity; property id_allocator_batch_size; property enable_sasl; + property> kafka_enable_authorization; + property>> + kafka_mtls_principal_mapping_rules; property controller_backend_housekeeping_interval_ms; property node_management_operation_timeout_ms; @@ -218,6 +221,8 @@ struct configuration final : public config_store { cloud_storage_max_connection_idle_time_ms; property> cloud_storage_segment_max_upload_interval_sec; + property + cloud_storage_readreplica_manifest_sync_timeout_ms; // Archival upload controller property diff --git a/src/v/config/node_config.cc b/src/v/config/node_config.cc index ba673ebbdd335..0a8a468432896 100644 --- 
a/src/v/config/node_config.cc +++ b/src/v/config/node_config.cc @@ -63,7 +63,9 @@ node_config::node_config() noexcept "kafka_api", "Address and port of an interface to listen for Kafka API requests", {.visibility = visibility::user}, - {model::broker_endpoint(net::unresolved_address("127.0.0.1", 9092))}) + {config::broker_authn_endpoint{ + .address = net::unresolved_address("127.0.0.1", 9092), + .authn_method = std::nullopt}}) , kafka_api_tls( *this, "kafka_api_tls", diff --git a/src/v/config/node_config.h b/src/v/config/node_config.h index 6d802fa90f02b..c85eaf2fc36b1 100644 --- a/src/v/config/node_config.h +++ b/src/v/config/node_config.h @@ -9,6 +9,7 @@ #pragma once +#include "config/broker_authn_endpoint.h" #include "config/broker_endpoint.h" #include "config/convert.h" #include "config/data_directory_path.h" @@ -16,6 +17,9 @@ #include "config/seed_server.h" #include "config_store.h" +#include +#include + namespace config { struct node_config final : public config_store { @@ -31,7 +35,7 @@ struct node_config final : public config_store { property rpc_server_tls; // Kafka RPC listener - one_or_many_property kafka_api; + one_or_many_property kafka_api; one_or_many_property kafka_api_tls; // Admin API listener @@ -55,9 +59,19 @@ struct node_config final : public config_store { return data_directory().path / "pid.lock"; } - const std::vector& advertised_kafka_api() const { + std::vector advertised_kafka_api() const { if (_advertised_kafka_api().empty()) { - return kafka_api(); + std::vector eps; + auto api = kafka_api(); + eps.reserve(api.size()); + std::transform( + std::make_move_iterator(api.begin()), + std::make_move_iterator(api.end()), + std::back_inserter(eps), + [](auto ep) { + return model::broker_endpoint{ep.name, ep.address}; + }); + return eps; } return _advertised_kafka_api(); } diff --git a/src/v/config/rjson_serialization.cc b/src/v/config/rjson_serialization.cc index 59fee261d2d46..f1c2f6a012d3e 100644 --- a/src/v/config/rjson_serialization.cc +++ 
b/src/v/config/rjson_serialization.cc @@ -60,11 +60,6 @@ void rjson_serialize_impl( w.Key("truststore_file"); w.String((*(v.get_truststore_file())).c_str()); } - - if (v.get_principal_mapping_rules()) { - w.Key("principal_mapping_rules"); - w.String(*v.get_principal_mapping_rules()); - } } void rjson_serialize( diff --git a/src/v/config/tls_config.h b/src/v/config/tls_config.h index 3389a327f9182..fb461da7d1e90 100644 --- a/src/v/config/tls_config.h +++ b/src/v/config/tls_config.h @@ -23,7 +23,6 @@ #include #include -#include #include #include @@ -57,13 +56,11 @@ class tls_config { bool enabled, std::optional key_cert, std::optional truststore, - bool require_client_auth, - std::optional principal_mapping_rules) + bool require_client_auth) : _enabled(enabled) , _key_cert(std::move(key_cert)) , _truststore_file(std::move(truststore)) - , _require_client_auth(require_client_auth) - , _principal_mapping_rules(std::move(principal_mapping_rules)) {} + , _require_client_auth(require_client_auth) {} bool is_enabled() const { return _enabled; } @@ -77,10 +74,6 @@ class tls_config { bool get_require_client_auth() const { return _require_client_auth; } - const std::optional& get_principal_mapping_rules() const { - return _principal_mapping_rules; - } - ss::future> get_credentials_builder() const& { if (_enabled) { @@ -125,19 +118,6 @@ class tls_config { return "Trust store is required when client authentication is " "enabled"; } - if (c.get_principal_mapping_rules()) { - if (!c.get_require_client_auth()) { - return "Client authentication is required when principal " - "mapping rules are set"; - } - // Validate regex of the mapping rules - try { - security::tls::detail::parse_rules( - c.get_principal_mapping_rules()); - } catch (const std::runtime_error& e) { - return e.what(); - } - } return std::nullopt; } @@ -150,12 +130,9 @@ class tls_config { << "enabled: " << c.is_enabled() << " " << "key/cert files: " << c.get_key_cert_files() << " " << "ca file: " << 
c.get_truststore_file() << " " - << "client_auth_required: " << c.get_require_client_auth(); - if (c.get_principal_mapping_rules()) { - o << " principal_mapping_rules: " - << c.get_principal_mapping_rules(); - } - return o << " }"; + << "client_auth_required: " << c.get_require_client_auth() << "" + << " }"; + return o; } private: @@ -163,7 +140,6 @@ class tls_config { std::optional _key_cert; std::optional _truststore_file; bool _require_client_auth{false}; - std::optional _principal_mapping_rules; }; } // namespace config @@ -203,11 +179,6 @@ struct convert { node["truststore_file"] = *rhs.get_truststore_file(); } - if (rhs.get_principal_mapping_rules()) { - node["principal_mapping_rules"] - = *rhs.get_principal_mapping_rules(); - } - return node; } @@ -228,8 +199,7 @@ struct convert { } auto enabled = node["enabled"] && node["enabled"].as(); if (!enabled) { - rhs = config::tls_config( - false, std::nullopt, std::nullopt, false, std::nullopt); + rhs = config::tls_config(false, std::nullopt, std::nullopt, false); } else { auto key_cert = node["key_file"] @@ -237,17 +207,12 @@ struct convert { to_absolute(node["key_file"].as()), to_absolute(node["cert_file"].as())}) : std::nullopt; - auto principal_mapping_rules - = node["principal_mapping_rules"] - ? 
node["principal_mapping_rules"].as() - : std::optional(); rhs = config::tls_config( enabled, key_cert, to_absolute(read_optional(node, "truststore_file")), node["require_client_auth"] - && node["require_client_auth"].as(), - principal_mapping_rules); + && node["require_client_auth"].as()); } return true; } diff --git a/src/v/coproc/tests/fixtures/fiber_mock_fixture.cc b/src/v/coproc/tests/fixtures/fiber_mock_fixture.cc index 528c78f414175..c2bc47e6651b7 100644 --- a/src/v/coproc/tests/fixtures/fiber_mock_fixture.cc +++ b/src/v/coproc/tests/fixtures/fiber_mock_fixture.cc @@ -178,7 +178,7 @@ ss::future> fiber_mock_fixture::make_source( auto batch = make_random_batch(params.records_per_input); co_await tests::cooperative_spin_wait_with_timeout( 2s, [partition]() { return partition->is_elected_leader(); }); - auto r = co_await partition->replicate( + auto r = co_await partition->raft()->replicate( std::move(batch), raft::replicate_options(raft::consistency_level::leader_ack)); vassert(!r.has_error(), "Write error: {}", r.error()); diff --git a/src/v/coproc/types.h b/src/v/coproc/types.h index 3951d7a2b4e9b..6bf8a426d8f00 100644 --- a/src/v/coproc/types.h +++ b/src/v/coproc/types.h @@ -47,6 +47,7 @@ enum class topic_ingestion_policy : int8_t { earliest = 0, stored, latest }; /// \brief type to use for registration/deregistration of a topic struct enable_copros_request { + using rpc_serde_exempt = std::true_type; struct data { script_id id; iobuf source_code; @@ -57,6 +58,7 @@ struct enable_copros_request { /// \brief registration acks per copro, responses are organized in the /// same order as the list of topics in the 'topics' array struct enable_copros_reply { + using rpc_serde_exempt = std::true_type; using topic_policy = std::pair; struct script_metadata { script_id id; @@ -76,12 +78,14 @@ using state_size_t = named_type; /// \brief deregistration request, remove all topics registered to a coprocessor /// with id 'script_id'. 
struct disable_copros_request { + using rpc_serde_exempt = std::true_type; std::vector ids; }; /// \brief deregistration acks per topic, responses are organized in the /// same order as the list of topics in the 'ids' array struct disable_copros_reply { + using rpc_serde_exempt = std::true_type; using ack = std::pair; std::vector acks; }; @@ -89,6 +93,7 @@ struct disable_copros_reply { /// \brief Request that co-processors with the given script ids, process batches /// from the reader whose source topic is the given ntp struct process_batch_request { + using rpc_serde_exempt = std::true_type; struct data { std::vector ids; model::ntp ntp; @@ -100,6 +105,7 @@ struct process_batch_request { /// \brief Response from the above request, acks from script ids that have /// processed the record and produce new batches on a new materialized ntp struct process_batch_reply { + using rpc_serde_exempt = std::true_type; struct data { script_id id; model::ntp source; diff --git a/src/v/kafka/CMakeLists.txt b/src/v/kafka/CMakeLists.txt index c29967aec406b..067fc72b9cd00 100644 --- a/src/v/kafka/CMakeLists.txt +++ b/src/v/kafka/CMakeLists.txt @@ -34,6 +34,7 @@ set(handlers_srcs server/handlers/delete_acls.cc server/handlers/create_partitions.cc server/handlers/offset_for_leader_epoch.cc + server/handlers/handler_interface.cc server/handlers/topics/types.cc server/handlers/topics/topic_utils.cc ) @@ -67,6 +68,7 @@ v_cc_library( Seastar::seastar v::bytes v::rpc + v::coproc v::cluster v::kafka_protocol v::security diff --git a/src/v/kafka/group_probe.h b/src/v/kafka/group_probe.h index 371e4b31b4bea..26535deb0f3f0 100644 --- a/src/v/kafka/group_probe.h +++ b/src/v/kafka/group_probe.h @@ -62,9 +62,9 @@ class group_offset_probe { return; } - auto group_label = sm::label("group"); - auto topic_label = sm::label("topic"); - auto partition_label = sm::label("partition"); + auto group_label = ssx::metrics::make_namespaced_label("group"); + auto topic_label = 
ssx::metrics::make_namespaced_label("topic"); + auto partition_label = ssx::metrics::make_namespaced_label("partition"); std::vector labels{ group_label(group_id()), topic_label(tp.topic()), @@ -110,7 +110,7 @@ class group_probe { return; } - auto group_label = sm::label("group"); + auto group_label = ssx::metrics::make_namespaced_label("group"); std::vector labels{group_label(group_id())}; diff --git a/src/v/kafka/latency_probe.h b/src/v/kafka/latency_probe.h index f38b3fb34b6ed..3946475e6cfbc 100644 --- a/src/v/kafka/latency_probe.h +++ b/src/v/kafka/latency_probe.h @@ -60,7 +60,7 @@ class latency_probe { sm::make_histogram( "request_latency_seconds", sm::description("Internal latency of kafka produce requests"), - {sm::label("request")("produce")}, + {ssx::metrics::make_namespaced_label("request")("produce")}, [this] { return ssx::metrics::report_default_histogram( _produce_latency); @@ -69,7 +69,7 @@ class latency_probe { sm::make_histogram( "request_latency_seconds", sm::description("Internal latency of kafka consume requests"), - {sm::label("request")("consume")}, + {ssx::metrics::make_namespaced_label("request")("consume")}, [this] { return ssx::metrics::report_default_histogram(_fetch_latency); }) diff --git a/src/v/kafka/protocol/schemata/generator.py b/src/v/kafka/protocol/schemata/generator.py index 9b6b9cc89166f..12481a57b75a3 100644 --- a/src/v/kafka/protocol/schemata/generator.py +++ b/src/v/kafka/protocol/schemata/generator.py @@ -160,6 +160,14 @@ }, }, }, + "CreateTopicsResponseData": { + "Topics": { + "Configs": { + "ConfigSource": ("kafka::describe_configs_source", "int8"), + }, + "TopicConfigErrorCode": ("kafka::error_code", "int16"), + }, + }, "FindCoordinatorRequestData": { "KeyType": ("kafka::coordinator_type", "int8"), }, @@ -1144,26 +1152,20 @@ class response; {%- endmacro %} {% macro conditional_tag_encode(tdef, vec) %} -{%- if tdef.is_array %} {%- if tdef.nullable() %} -{%- call tag_version_guard(tdef) %} if ({{ tdef.name }}) { {{ vec 
}}.push_back({{ tdef.tag() }}); } -{%- endcall %} -{%- else %} -{%- call tag_version_guard(tdef) %} +{%- elif tdef.is_array %} if (!{{ tdef.name }}.empty()) { {{ vec }}.push_back({{ tdef.tag() }}); } -{%- endcall %} -{%- endif %} {%- elif tdef.default_value() != "" %} -{%- call tag_version_guard(tdef) %} if ({{ tdef.name }} != {{ tdef.default_value() }}) { {{ vec }}.push_back({{ tdef.tag() }}); } -{%- endcall %} +{%- else %} +{{ vec }}.push_back({{ tdef.tag() }}); {%- endif %} {%- endmacro %} @@ -1171,7 +1173,9 @@ class response; /// Tags encoding section std::vector to_encode; {%- for tdef in tag_definitions -%} +{%- call tag_version_guard(tdef) %} {{- conditional_tag_encode(tdef, "to_encode") }} +{%- endcall %} {%- endfor %} writer.write_unsigned_varint(to_encode.size()); for(size_t tag : to_encode) { diff --git a/src/v/kafka/server/connection_context.cc b/src/v/kafka/server/connection_context.cc index 4847f601013bd..d01dfdf39012d 100644 --- a/src/v/kafka/server/connection_context.cc +++ b/src/v/kafka/server/connection_context.cc @@ -13,16 +13,19 @@ #include "bytes/iobuf.h" #include "config/configuration.h" #include "kafka/protocol/sasl_authenticate.h" +#include "kafka/server/handlers/handler_interface.h" #include "kafka/server/protocol.h" #include "kafka/server/protocol_utils.h" #include "kafka/server/quota_manager.h" #include "kafka/server/request_context.h" +#include "kafka/server/response.h" #include "security/exceptions.h" #include "units.h" #include "vlog.h" #include #include +#include #include #include @@ -68,10 +71,7 @@ ss::future<> connection_context::process_one_request() { _rs.probe().header_corrupted(); return ss::make_ready_future<>(); } - return handle_mtls_auth() - .then([this, h = std::move(h.value()), s]() mutable { - return dispatch_method_once(std::move(h), s); - }) + return dispatch_method_once(std::move(h.value()), s) .handle_exception_type([this](const std::bad_alloc&) { // In general, dispatch_method_once does not throw, // but bad_allocs 
are an exception. Log it cleanly @@ -87,56 +87,6 @@ ss::future<> connection_context::process_one_request() { }); } -/* - * handle mtls authentication. this should only happen once when the connection - * is setup. even though this is called in the normal request handling path, - * this property should hold becuase: - * - * 1. is a noop if a mtls principal has been extracted - * 2. all code paths that don't set the principal throw and drop the connection - * - * NOTE: handle_mtls_auth is called after reading header off the wire. this is - * odd because we would expect that tls negotation etc... all happens before we - * here to the application layer. however, it appears that the way seastar works - * that we need to read some data off the wire to drive this process within the - * internal connection handling. - */ -ss::future<> connection_context::handle_mtls_auth() { - if (!_use_mtls || _mtls_principal.has_value()) { - return ss::now(); - } - return ss::with_timeout( - model::timeout_clock::now() + 5s, - _rs.conn->get_distinguished_name()) - .then([this](std::optional dn) { - if (!dn.has_value()) { - throw security::exception( - security::errc::invalid_credentials, - "failed to fetch distinguished name"); - } - /* - * for now it probably is fine to store the mapping per connection. - * but it seems like we could also share this across all connections - * with the same tls configuration. - */ - _mtls_principal = _rs.conn->get_principal_mapping()->apply( - dn->subject); - if (!_mtls_principal) { - throw security::exception( - security::errc::invalid_credentials, - fmt::format( - "failed to extract principal from distinguished name: {}", - dn->subject)); - } - - vlog( - _authlog.debug, - "got principal: {}, from distinguished name: {}", - *_mtls_principal, - dn->subject); - }); -} - /* * The SASL authentication flow for a client using version 0 of SASL handshake * doesn't use an envelope request for tokens. 
This method intercepts the @@ -182,8 +132,9 @@ ss::future<> connection_context::handle_auth_v0(const size_t size) { }, std::move(request_buf), 0s); + auto sres = session_resources{}; auto resp = co_await kafka::process_request( - std::move(ctx), _proto.smp_group()) + std::move(ctx), _proto.smp_group(), sres) .response; auto data = std::move(*resp).release(); response.decode(std::move(data), version); @@ -213,8 +164,7 @@ bool connection_context::is_finished_parsing() const { return _rs.conn->input().eof() || _rs.abort_requested(); } -ss::future -connection_context::throttle_request( +ss::future connection_context::throttle_request( const request_header& hdr, size_t request_size) { // update the throughput tracker for this client using the // size of the current request and return any computed delay @@ -236,8 +186,9 @@ connection_context::throttle_request( } auto track = track_latency(hdr.key); return fut - .then( - [this, request_size] { return reserve_request_units(request_size); }) + .then([this, key = hdr.key, request_size] { + return reserve_request_units(key, request_size); + }) .then([this, delay, track, tracker = std::move(tracker)]( ss::semaphore_units<> units) mutable { return server().get_request_unit().then( @@ -262,15 +213,21 @@ connection_context::throttle_request( } ss::future> -connection_context::reserve_request_units(size_t size) { - // Allow for extra copies and bookkeeping - auto mem_estimate = size * 2 + 8000; // NOLINT - if (mem_estimate >= (size_t)std::numeric_limits::max()) { +connection_context::reserve_request_units(api_key key, size_t size) { + // Defer to the handler for the request type for the memory estimate, but + // if the request isn't found, use the default estimate (although in that + // case the request is likely for an API we don't support or malformed, so + // it is likely to fail shortly anyway). + auto handler = handler_for_key(key); + auto mem_estimate = handler ? 
(*handler)->memory_estimate(size, *this) + : default_memory_estimate(size); + if (unlikely(mem_estimate >= (size_t)std::numeric_limits::max())) { // TODO: Create error response using the specific API? throw std::runtime_error(fmt::format( - "request too large > 1GB (size: {}; estimate: {})", + "request too large > 1GB (size: {}, estimate: {}, API: {})", size, - mem_estimate)); + mem_estimate, + handler ? (*handler)->name() : "")); } auto fut = ss::get_units(_rs.memory(), mem_estimate); if (_rs.memory().waiters()) { @@ -282,11 +239,15 @@ connection_context::reserve_request_units(size_t size) { ss::future<> connection_context::dispatch_method_once(request_header hdr, size_t size) { return throttle_request(hdr, size).then([this, hdr = std::move(hdr), size]( - session_resources sres) mutable { + session_resources + sres_in) mutable { if (_rs.abort_requested()) { // protect against shutdown behavior return ss::make_ready_future<>(); } + + auto sres = ss::make_lw_shared(std::move(sres_in)); + auto remaining = size - request_header_size - hdr.client_id_buffer.size() - hdr.tags_size_bytes; return read_iobuf_exactly(_rs.conn->input(), remaining) @@ -298,7 +259,7 @@ connection_context::dispatch_method_once(request_header hdr, size_t size) { } auto self = shared_from_this(); auto rctx = request_context( - self, std::move(hdr), std::move(buf), sres.backpressure_delay); + self, std::move(hdr), std::move(buf), sres->backpressure_delay); /* * we process requests in order since all subsequent requests * are dependent on authentication having completed. @@ -323,7 +284,7 @@ connection_context::dispatch_method_once(request_header hdr, size_t size) { const sequence_id seq = _seq_idx; _seq_idx = _seq_idx + sequence_id(1); auto res = kafka::process_request( - std::move(rctx), _proto.smp_group()); + std::move(rctx), _proto.smp_group(), *sres); /** * first stage processed in a foreground. 
*/ @@ -333,7 +294,7 @@ connection_context::dispatch_method_once(request_header hdr, size_t size) { seq, correlation, self, - s = std::move(sres)](ss::future<> d) mutable { + sres = std::move(sres)](ss::future<> d) mutable { /* * if the dispatch/first stage failed, then we need to * need to consume the second stage since it might be @@ -362,13 +323,22 @@ connection_context::dispatch_method_once(request_header hdr, size_t size) { ssx::background = ssx::spawn_with_gate_then( _rs.conn_gate(), - [this, f = std::move(f), seq, correlation]() mutable { - return f.then([this, seq, correlation]( - response_ptr r) mutable { - r->set_correlation(correlation); - _responses.insert({seq, std::move(r)}); - return process_next_response(); - }); + [this, + f = std::move(f), + sres = std::move(sres), + seq, + correlation]() mutable { + return f.then( + [this, + sres = std::move(sres), + seq, + correlation](response_ptr r) mutable { + r->set_correlation(correlation); + response_and_resources randr{ + std::move(r), std::move(sres)}; + _responses.insert({seq, std::move(randr)}); + return maybe_process_responses(); + }); }) .handle_exception([self](std::exception_ptr e) { // ssx::spawn_with_gate already caught @@ -397,8 +367,7 @@ connection_context::dispatch_method_once(request_header hdr, size_t size) { self->_rs.probe().service_error(); self->_rs.conn->shutdown_input(); - }) - .finally([s = std::move(s), self] {}); + }); return d; }) .handle_exception([self](std::exception_ptr e) { @@ -410,7 +379,20 @@ connection_context::dispatch_method_once(request_header hdr, size_t size) { }); } -ss::future<> connection_context::process_next_response() { +/** + * This method processes as many responses as possible, in request order. Since + * we process the second stage asynchronously within a given connection, responses + * may become ready out of order, but Kafka clients expect responses exactly in + * request order. 
+ * + * The _responses queue handles that: responses are enqueued there in completion + * order, but only sent to the client in request order. So this method, called + * after every response is ready, may end up sending zero, one or more responses, + * depending on the completion order. + * + * @return ss::future<> + */ +ss::future<> connection_context::maybe_process_responses() { return ss::repeat([this]() mutable { auto it = _responses.find(_next_response); if (it == _responses.end()) { @@ -420,20 +402,25 @@ ss::future<> connection_context::process_next_response() { // found one; increment counter _next_response = _next_response + sequence_id(1); - auto r = std::move(it->second); + auto resp_and_res = std::move(it->second); + _responses.erase(it); - if (r->is_noop()) { + if (resp_and_res.response->is_noop()) { return ss::make_ready_future( ss::stop_iteration::no); } - auto msg = response_as_scattered(std::move(r)); + auto msg = response_as_scattered(std::move(resp_and_res.response)); try { - return _rs.conn->write(std::move(msg)).then([] { - return ss::make_ready_future( - ss::stop_iteration::no); - }); + return _rs.conn->write(std::move(msg)) + .then([] { + return ss::make_ready_future( + ss::stop_iteration::no); + }) + // release the resources only once it has been written to the + // connection. + .finally([resources = std::move(resp_and_res.resources)] {}); } catch (...) 
{ vlog( klog.debug, diff --git a/src/v/kafka/server/connection_context.h b/src/v/kafka/server/connection_context.h index 4276f8f9da652..d46a5b67f8c56 100644 --- a/src/v/kafka/server/connection_context.h +++ b/src/v/kafka/server/connection_context.h @@ -11,9 +11,11 @@ #pragma once #include "kafka/server/protocol.h" #include "kafka/server/response.h" +#include "kafka/types.h" #include "net/server.h" #include "seastarx.h" #include "security/acl.h" +#include "security/mtls.h" #include "security/sasl_authentication.h" #include "utils/hdr_hist.h" #include "utils/named_type.h" @@ -37,6 +39,41 @@ using authz_quiet = ss::bool_class; struct request_header; class request_context; +// used to track number of pending requests +class request_tracker { +public: + explicit request_tracker(net::server_probe& probe) noexcept + : _probe(probe) { + _probe.request_received(); + } + request_tracker(const request_tracker&) = delete; + request_tracker(request_tracker&&) = delete; + request_tracker& operator=(const request_tracker&) = delete; + request_tracker& operator=(request_tracker&&) = delete; + + ~request_tracker() noexcept { _probe.request_completed(); } + +private: + net::server_probe& _probe; +}; + +// Used to hold resources associated with a given request until +// the response has been sent, as well as to track some statistics +// about the request. +// +// The resources in particular should not be destroyed until +// the request is complete (e.g., all the information written to +// the socket so that no userspace buffers remain). 
+struct session_resources { + using pointer = ss::lw_shared_ptr; + + ss::lowres_clock::duration backpressure_delay; + ss::semaphore_units<> memlocks; + ss::semaphore_units<> queue_units; + std::unique_ptr method_latency; + std::unique_ptr tracker; +}; + class connection_context final : public ss::enable_lw_shared_from_this { public: @@ -45,7 +82,7 @@ class connection_context final net::server::resources&& r, security::sasl_server sasl, bool enable_authorizer, - bool use_mtls) noexcept + std::optional mtls_state) noexcept : _proto(p) , _rs(std::move(r)) , _sasl(std::move(sasl)) @@ -53,7 +90,7 @@ class connection_context final , _client_addr(_rs.conn ? _rs.conn->addr.addr() : ss::net::inet_address{}) , _enable_authorizer(enable_authorizer) , _authlog(_client_addr, client_port()) - , _use_mtls(use_mtls) {} + , _mtls_state(std::move(mtls_state)) {} ~connection_context() noexcept = default; connection_context(const connection_context&) = delete; @@ -68,20 +105,17 @@ class connection_context final template bool authorized( security::acl_operation operation, const T& name, authz_quiet quiet) { - // mtls configured? - if (_use_mtls) { - if (_mtls_principal.has_value()) { - return authorized_user( - _mtls_principal.value(), operation, name, quiet); - } - return false; - } - // sasl configured? + // authorization disabled? if (!_enable_authorizer) { return true; } - auto user = sasl().principal(); - return authorized_user(std::move(user), operation, name, quiet); + // mtls configured? 
+ if (_mtls_state) { + return authorized_user( + _mtls_state->principal(), operation, name, quiet); + } + // use sasl + return authorized_user(sasl().principal(), operation, name, quiet); } template @@ -131,49 +165,51 @@ class connection_context final } private: - // used to track number of pending requests - class request_tracker { - public: - explicit request_tracker(net::server_probe& probe) noexcept - : _probe(probe) { - _probe.request_received(); - } - request_tracker(const request_tracker&) = delete; - request_tracker(request_tracker&&) = delete; - request_tracker& operator=(const request_tracker&) = delete; - request_tracker& operator=(request_tracker&&) = delete; - - ~request_tracker() noexcept { _probe.request_completed(); } - - private: - net::server_probe& _probe; - }; - // used to pass around some internal state - struct session_resources { - ss::lowres_clock::duration backpressure_delay; - ss::semaphore_units<> memlocks; - ss::semaphore_units<> queue_units; - std::unique_ptr method_latency; - std::unique_ptr tracker; - }; - - /// called by throttle_request - ss::future> reserve_request_units(size_t size); - - /// apply correct backpressure sequence + // Reserve units from the memory semaphore in proportion + // to the number of bytes the request processing is expected to + // take. + ss::future> + reserve_request_units(api_key key, size_t size); + + // Apply backpressure sequence, where the request processing may be + // delayed for various reasons, including throttling but also because + // too few server resources are available to accommodate the request + // currently. + // When the returned future resolves, the throttling period is over and + // the associated resources have been obtained and are tracked by the + // contained session_resources object. 
ss::future throttle_request(const request_header&, size_t sz); - ss::future<> handle_mtls_auth(); ss::future<> dispatch_method_once(request_header, size_t sz); - ss::future<> process_next_response(); + + /** + * Process zero or more ready responses in request order. + * + * The future<> returned by this method resolves when all ready *and* + * in-order responses have been processed, which is not the same as all + * ready responses. In particular, responses which are ready may not be + * processed if there are earlier (lower sequence number) responses which + * are not yet ready: they will be processed by a future invocation. + * + * @return ss::future<> a future which resolves as described above. + */ + ss::future<> maybe_process_responses(); ss::future<> do_process(request_context); ss::future<> handle_auth_v0(size_t); private: + /** + * Bundles together a response and its associated resources. + */ + struct response_and_resources { + response_ptr response; + session_resources::pointer resources; + }; + using sequence_id = named_type; - using map_t = absl::flat_hash_map; + using map_t = absl::flat_hash_map; class ctx_log { public: @@ -234,8 +270,7 @@ class connection_context final const ss::net::inet_address _client_addr; const bool _enable_authorizer; ctx_log _authlog; - bool _use_mtls{false}; - std::optional _mtls_principal; + std::optional _mtls_state; }; } // namespace kafka diff --git a/src/v/kafka/server/flex_versions.cc b/src/v/kafka/server/flex_versions.cc index 1a3565b41a715..b0e6bd2cbea21 100644 --- a/src/v/kafka/server/flex_versions.cc +++ b/src/v/kafka/server/flex_versions.cc @@ -17,13 +17,6 @@ namespace kafka { /// requests will map to a value of api_key(-2) static constexpr api_version invalid_api = api_version(-2); -template -static constexpr size_t max_api_key(type_list) { - /// Black magic here is an overload of std::max() that takes an - /// std::initializer_list - return std::max({RequestTypes::api::key()...}); -} - template static constexpr auto 
get_flexible_request_min_versions_list(type_list r) { diff --git a/src/v/kafka/server/fwd.h b/src/v/kafka/server/fwd.h index a893be15ed167..e034d9f650847 100644 --- a/src/v/kafka/server/fwd.h +++ b/src/v/kafka/server/fwd.h @@ -13,13 +13,15 @@ namespace kafka { +// sorted +class connection_context; class coordinator_ntp_mapper; class fetch_session_cache; class group_manager; class group_router; +class quota_manager; +class request_context; class rm_group_frontend; class rm_group_proxy_impl; -class request_context; -class quota_manager; } // namespace kafka diff --git a/src/v/kafka/server/group.cc b/src/v/kafka/server/group.cc index 1441b02bfb60f..b7d7c2262a81b 100644 --- a/src/v/kafka/server/group.cc +++ b/src/v/kafka/server/group.cc @@ -1689,7 +1689,7 @@ group::commit_tx(cluster::commit_group_tx_request r) { auto reader = model::make_memory_record_batch_reader(std::move(batch)); - auto e = co_await _partition->replicate( + auto e = co_await _partition->raft()->replicate( _term, std::move(reader), raft::replicate_options(raft::consistency_level::quorum_ack)); @@ -1772,7 +1772,7 @@ group::begin_tx(cluster::begin_group_tx_request r) { r.pid, std::move(fence)); auto reader = model::make_memory_record_batch_reader(std::move(batch)); - auto e = co_await _partition->replicate( + auto e = co_await _partition->raft()->replicate( _term, std::move(reader), raft::replicate_options(raft::consistency_level::quorum_ack)); @@ -1887,7 +1887,7 @@ group::prepare_tx(cluster::prepare_group_tx_request r) { std::move(tx_entry)); auto reader = model::make_memory_record_batch_reader(std::move(batch)); - auto e = co_await _partition->replicate( + auto e = co_await _partition->raft()->replicate( _term, std::move(reader), raft::replicate_options(raft::consistency_level::quorum_ack)); @@ -1983,7 +1983,7 @@ group::abort_tx(cluster::abort_group_tx_request r) { std::move(tx)); auto reader = model::make_memory_record_batch_reader(std::move(batch)); - auto e = co_await _partition->replicate( + auto 
e = co_await _partition->raft()->replicate( _term, std::move(reader), raft::replicate_options(raft::consistency_level::quorum_ack)); @@ -2103,7 +2103,8 @@ group::offset_commit_stages group::store_offsets(offset_commit_request&& r) { auto batch = std::move(builder).build(); auto reader = model::make_memory_record_batch_reader(std::move(batch)); - auto replicate_stages = _partition->replicate_in_stages( + auto replicate_stages = _partition->raft()->replicate_in_stages( + _term, std::move(reader), raft::replicate_options(raft::consistency_level::quorum_ack)); @@ -2492,7 +2493,8 @@ ss::future group::remove() { auto reader = model::make_memory_record_batch_reader(std::move(batch)); try { - auto result = co_await _partition->replicate( + auto result = co_await _partition->raft()->replicate( + _term, std::move(reader), raft::replicate_options(raft::consistency_level::quorum_ack)); if (result) { @@ -2572,7 +2574,8 @@ group::remove_topic_partitions(const std::vector& tps) { auto reader = model::make_memory_record_batch_reader(std::move(batch)); try { - auto result = co_await _partition->replicate( + auto result = co_await _partition->raft()->replicate( + _term, std::move(reader), raft::replicate_options(raft::consistency_level::quorum_ack)); if (result) { @@ -2599,7 +2602,8 @@ group::remove_topic_partitions(const std::vector& tps) { ss::future> group::store_group(model::record_batch batch) { - return _partition->replicate( + return _partition->raft()->replicate( + _term, model::make_memory_record_batch_reader(std::move(batch)), raft::replicate_options(raft::consistency_level::quorum_ack)); } diff --git a/src/v/kafka/server/group_metadata_migration.cc b/src/v/kafka/server/group_metadata_migration.cc index c7a27abe203b1..a98aba8bbb5f8 100644 --- a/src/v/kafka/server/group_metadata_migration.cc +++ b/src/v/kafka/server/group_metadata_migration.cc @@ -332,6 +332,7 @@ ss::future replicate( [ntp = std::move(ntp), f_reader = std::move(f_reader)](cluster::partition_manager& pm) 
mutable { return pm.get(ntp) + ->raft() ->replicate( std::move(f_reader), raft::replicate_options(raft::consistency_level::quorum_ack)) diff --git a/src/v/kafka/server/handlers/add_offsets_to_txn.h b/src/v/kafka/server/handlers/add_offsets_to_txn.h index e4b1669b970a7..fbc9fc1324e24 100644 --- a/src/v/kafka/server/handlers/add_offsets_to_txn.h +++ b/src/v/kafka/server/handlers/add_offsets_to_txn.h @@ -14,6 +14,7 @@ namespace kafka { -using add_offsets_to_txn_handler = handler; +using add_offsets_to_txn_handler + = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/add_partitions_to_txn.h b/src/v/kafka/server/handlers/add_partitions_to_txn.h index 5b0f2523b4f36..aee85586deee2 100644 --- a/src/v/kafka/server/handlers/add_partitions_to_txn.h +++ b/src/v/kafka/server/handlers/add_partitions_to_txn.h @@ -14,6 +14,7 @@ namespace kafka { -using add_partitions_to_txn_handler = handler; +using add_partitions_to_txn_handler + = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/alter_configs.h b/src/v/kafka/server/handlers/alter_configs.h index 7edcf2f987ad4..d61eb8c3f472d 100644 --- a/src/v/kafka/server/handlers/alter_configs.h +++ b/src/v/kafka/server/handlers/alter_configs.h @@ -14,6 +14,6 @@ namespace kafka { -using alter_configs_handler = handler; +using alter_configs_handler = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/api_versions.h b/src/v/kafka/server/handlers/api_versions.h index c131036d3f2a0..7ab8921561ea8 100644 --- a/src/v/kafka/server/handlers/api_versions.h +++ b/src/v/kafka/server/handlers/api_versions.h @@ -14,7 +14,8 @@ namespace kafka { -struct api_versions_handler : public handler { +struct api_versions_handler + : public single_stage_handler { static constexpr api_version min_flexible = api_version(3); static ss::future diff --git a/src/v/kafka/server/handlers/create_acls.h b/src/v/kafka/server/handlers/create_acls.h index e9719121d6dfc..d9a6161b71a1a 100644 --- 
a/src/v/kafka/server/handlers/create_acls.h +++ b/src/v/kafka/server/handlers/create_acls.h @@ -14,6 +14,6 @@ namespace kafka { -using create_acls_handler = handler; +using create_acls_handler = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/create_partitions.h b/src/v/kafka/server/handlers/create_partitions.h index 4102398e8d8bd..16b1dcc9de27c 100644 --- a/src/v/kafka/server/handlers/create_partitions.h +++ b/src/v/kafka/server/handlers/create_partitions.h @@ -14,6 +14,7 @@ namespace kafka { -using create_partitions_handler = handler; +using create_partitions_handler + = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/create_topics.cc b/src/v/kafka/server/handlers/create_topics.cc index ffc33cb3106a2..e088df7371a57 100644 --- a/src/v/kafka/server/handlers/create_topics.cc +++ b/src/v/kafka/server/handlers/create_topics.cc @@ -33,19 +33,18 @@ namespace kafka { -static constexpr std::array supported_configs{ - {"compression.type", - "cleanup.policy", - "message.timestamp.type", - "segment.bytes", - "compaction.strategy", - "retention.bytes", - "retention.ms", - "redpanda.remote.recovery", - "redpanda.remote.write", - "redpanda.remote.read", - "redpanda.remote.readreplica", - "redpanda.remote.readreplica.bucket"}}; +static constexpr std::array supported_configs{ + topic_property_compression, + topic_property_cleanup_policy, + topic_property_timestamp_type, + topic_property_segment_size, + topic_property_compaction_strategy, + topic_property_retention_bytes, + topic_property_retention_duration, + topic_property_recovery, + topic_property_remote_write, + topic_property_remote_read, + topic_property_read_replica}; bool is_supported(std::string_view name) { return std::any_of( @@ -65,9 +64,49 @@ using validators = make_validator_types< compaction_strategy_validator, timestamp_type_validator, cleanup_policy_validator, - remote_read_and_write_are_not_supported_for_read_replica, - s3_bucket_is_required_for_read_replica, - 
s3_bucket_is_supported_only_for_read_replica>; + remote_read_and_write_are_not_supported_for_read_replica>; + +static std::vector +properties_to_result_configs(config_map_t config_map) { + std::vector configs; + configs.reserve(config_map.size()); + std::transform( + config_map.begin(), + config_map.end(), + std::back_inserter(configs), + [](auto& cfg) { + return creatable_topic_configs{ + .name = cfg.first, + .value = {std::move(cfg.second)}, + .config_source = kafka::describe_configs_source::default_config, + }; + }); + return configs; +} + +static void +append_topic_configs(request_context& ctx, create_topics_response& response) { + for (auto& ct_result : response.data.topics) { + if (ct_result.error_code != kafka::error_code::none) { + ct_result.topic_config_error_code = ct_result.error_code; + continue; + } + auto cfg = ctx.metadata_cache().get_topic_cfg( + model::topic_namespace_view{model::kafka_namespace, ct_result.name}); + if (cfg) { + auto config_map = from_cluster_type(cfg->properties); + ct_result.configs = { + properties_to_result_configs(std::move(config_map))}; + ct_result.topic_config_error_code = kafka::error_code::none; + } else { + // Topic was successfully created but the metadata lookup did not + // succeed; this could mean the topic was deleted just after + // creation + ct_result.topic_config_error_code + = kafka::error_code::unknown_server_error; + } + } +} template<> ss::future create_topics_handler::handle( @@ -156,8 +195,15 @@ ss::future create_topics_handler::handle( begin, valid_range_end, std::back_inserter(response.data.topics), - [](const creatable_topic& t) { - return generate_successfull_result(t); + [&ctx](const creatable_topic& t) { + auto result = generate_successfull_result(t); + if (ctx.header().version >= api_version(5)) { + auto default_properties + = ctx.metadata_cache().get_default_properties(); + result.configs = {properties_to_result_configs( + from_cluster_type(default_properties))}; + } + return result; }); return 
ctx.respond(std::move(response)); } @@ -190,6 +236,9 @@ ss::future create_topics_handler::handle( std::vector c_res) mutable { // Append controller results to validation errors append_cluster_results(c_res, response.data.topics); + if (ctx.header().version >= api_version(5)) { + append_topic_configs(ctx, response); + } return ctx.respond(response); }); }); diff --git a/src/v/kafka/server/handlers/create_topics.h b/src/v/kafka/server/handlers/create_topics.h index 2f2774856173b..2ab493358dd02 100644 --- a/src/v/kafka/server/handlers/create_topics.h +++ b/src/v/kafka/server/handlers/create_topics.h @@ -14,6 +14,6 @@ namespace kafka { -using create_topics_handler = handler; +using create_topics_handler = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/delete_acls.h b/src/v/kafka/server/handlers/delete_acls.h index d19ab798cf467..8e45cc5679fa6 100644 --- a/src/v/kafka/server/handlers/delete_acls.h +++ b/src/v/kafka/server/handlers/delete_acls.h @@ -14,6 +14,6 @@ namespace kafka { -using delete_acls_handler = handler; +using delete_acls_handler = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/delete_groups.h b/src/v/kafka/server/handlers/delete_groups.h index 85d01eb8c29bd..d9858140b83a1 100644 --- a/src/v/kafka/server/handlers/delete_groups.h +++ b/src/v/kafka/server/handlers/delete_groups.h @@ -14,6 +14,6 @@ namespace kafka { -using delete_groups_handler = handler; +using delete_groups_handler = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/delete_topics.h b/src/v/kafka/server/handlers/delete_topics.h index 7dd8d66d2f157..e9b6606cbe004 100644 --- a/src/v/kafka/server/handlers/delete_topics.h +++ b/src/v/kafka/server/handlers/delete_topics.h @@ -14,6 +14,6 @@ namespace kafka { -using delete_topics_handler = handler; +using delete_topics_handler = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/describe_acls.h b/src/v/kafka/server/handlers/describe_acls.h index 3377ac8a28582..996c6fa230aad 100644 --- 
a/src/v/kafka/server/handlers/describe_acls.h +++ b/src/v/kafka/server/handlers/describe_acls.h @@ -14,6 +14,6 @@ namespace kafka { -using describe_acls_handler = handler; +using describe_acls_handler = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/describe_configs.cc b/src/v/kafka/server/handlers/describe_configs.cc index ff25d103c4456..f713130653989 100644 --- a/src/v/kafka/server/handlers/describe_configs.cc +++ b/src/v/kafka/server/handlers/describe_configs.cc @@ -265,6 +265,24 @@ kafka_endpoint_format(const std::vector& endpoints) { return ssx::sformat("{}", fmt::join(uris, ",")); } +static ss::sstring kafka_authn_endpoint_format( + const std::vector& endpoints) { + std::vector uris; + uris.reserve(endpoints.size()); + std::transform( + endpoints.cbegin(), + endpoints.cend(), + std::back_inserter(uris), + [](const config::broker_authn_endpoint& ep) { + return ssx::sformat( + "{}://{}:{}", + (ep.name.empty() ? "plain" : ep.name), + ep.address.host(), + ep.address.port()); + }); + return ssx::sformat("{}", fmt::join(uris, ",")); +} + static void report_broker_config( const describe_configs_resource& resource, describe_configs_result& result, @@ -299,7 +317,7 @@ static void report_broker_config( "listeners", config::node().kafka_api, include_synonyms, - &kafka_endpoint_format); + &kafka_authn_endpoint_format); add_broker_config_if_requested( resource, diff --git a/src/v/kafka/server/handlers/describe_configs.h b/src/v/kafka/server/handlers/describe_configs.h index 27f7235dc762a..97199e628b559 100644 --- a/src/v/kafka/server/handlers/describe_configs.h +++ b/src/v/kafka/server/handlers/describe_configs.h @@ -14,6 +14,7 @@ namespace kafka { -using describe_configs_handler = handler; +using describe_configs_handler + = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/describe_groups.h b/src/v/kafka/server/handlers/describe_groups.h index 6f804548b8937..b62004ae9fa60 100644 --- a/src/v/kafka/server/handlers/describe_groups.h +++ 
b/src/v/kafka/server/handlers/describe_groups.h @@ -14,6 +14,6 @@ namespace kafka { -using describe_groups_handler = handler; +using describe_groups_handler = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/describe_log_dirs.h b/src/v/kafka/server/handlers/describe_log_dirs.h index 13d11c440ad73..1731e88621a92 100644 --- a/src/v/kafka/server/handlers/describe_log_dirs.h +++ b/src/v/kafka/server/handlers/describe_log_dirs.h @@ -14,6 +14,7 @@ namespace kafka { -using describe_log_dirs_handler = handler; +using describe_log_dirs_handler + = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/end_txn.h b/src/v/kafka/server/handlers/end_txn.h index 72362cb00fed0..cd80b0d41c255 100644 --- a/src/v/kafka/server/handlers/end_txn.h +++ b/src/v/kafka/server/handlers/end_txn.h @@ -14,6 +14,6 @@ namespace kafka { -using end_txn_handler = handler; +using end_txn_handler = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/fetch.h b/src/v/kafka/server/handlers/fetch.h index d43b4e9b0b33b..8f44d89f1451d 100644 --- a/src/v/kafka/server/handlers/fetch.h +++ b/src/v/kafka/server/handlers/fetch.h @@ -17,7 +17,7 @@ namespace kafka { -using fetch_handler = handler; +using fetch_handler = single_stage_handler; /* * Fetch operation context diff --git a/src/v/kafka/server/handlers/find_coordinator.cc b/src/v/kafka/server/handlers/find_coordinator.cc index e10100e028f1f..3147c07f1eae1 100644 --- a/src/v/kafka/server/handlers/find_coordinator.cc +++ b/src/v/kafka/server/handlers/find_coordinator.cc @@ -72,6 +72,21 @@ ss::future find_coordinator_handler::handle( find_coordinator_request request; request.decode(ctx.reader(), ctx.header().version); + if (request.data.key_type == coordinator_type::group) { + if (!ctx.authorized( + security::acl_operation::describe, group_id(request.data.key))) { + return ctx.respond(find_coordinator_response( + error_code::group_authorization_failed)); + } + } else if (request.data.key_type == 
coordinator_type::transaction) { + if (!ctx.authorized( + security::acl_operation::describe, + transactional_id(request.data.key))) { + return ctx.respond(find_coordinator_response( + error_code::transactional_id_authorization_failed)); + } + } + if (request.data.key_type == coordinator_type::transaction) { if (!ctx.are_transactions_enabled()) { return ctx.respond( @@ -98,21 +113,6 @@ ss::future find_coordinator_handler::handle( find_coordinator_response(error_code::unsupported_version)); } - if (request.data.key_type == coordinator_type::group) { - if (!ctx.authorized( - security::acl_operation::describe, group_id(request.data.key))) { - return ctx.respond(find_coordinator_response( - error_code::group_authorization_failed)); - } - } else if (request.data.key_type == coordinator_type::transaction) { - if (!ctx.authorized( - security::acl_operation::describe, - transactional_id(request.data.key))) { - return ctx.respond(find_coordinator_response( - error_code::transactional_id_authorization_failed)); - } - } - return ss::do_with( std::move(ctx), [request = std::move(request)](request_context& ctx) mutable { diff --git a/src/v/kafka/server/handlers/find_coordinator.h b/src/v/kafka/server/handlers/find_coordinator.h index 8e3d83bfe4d67..1f5ff07fb97f4 100644 --- a/src/v/kafka/server/handlers/find_coordinator.h +++ b/src/v/kafka/server/handlers/find_coordinator.h @@ -14,6 +14,7 @@ namespace kafka { -using find_coordinator_handler = handler; +using find_coordinator_handler + = single_stage_handler; } // namespace kafka diff --git a/src/v/kafka/server/handlers/handler.h b/src/v/kafka/server/handlers/handler.h index a4b87a7545863..6e294af073c50 100644 --- a/src/v/kafka/server/handlers/handler.h +++ b/src/v/kafka/server/handlers/handler.h @@ -10,6 +10,7 @@ */ #pragma once #include "kafka/protocol/types.h" +#include "kafka/server/fwd.h" #include "kafka/server/request_context.h" #include "kafka/server/response.h" #include "kafka/types.h" @@ -18,18 +19,80 @@ namespace kafka { 
+using memory_estimate_fn = size_t(size_t, connection_context&); + +constexpr size_t +default_estimate_adaptor(size_t request_size, connection_context&) { + return default_memory_estimate(request_size); +} + +/** + * Handlers are generally specializations of this template, via one of the + * two aliases (single_stage_handler or two_phase_handler) declared below, though it is + * not strictly necessary (only conforming to one of the two KafkaApi* + * concepts is needed). + * + * The benefit of this template is that it takes care of most of the + * handler boilerplate. + */ template< typename RequestApi, api_version::type MinSupported, - api_version::type MaxSupported> -struct handler { + api_version::type MaxSupported, + typename HandleRetType, + memory_estimate_fn MemEstimator> +struct handler_template { using api = RequestApi; static constexpr api_version min_supported = api_version(MinSupported); static constexpr api_version max_supported = api_version(MaxSupported); - static ss::future - handle(request_context, ss::smp_service_group); + + static HandleRetType handle(request_context, ss::smp_service_group); + + /** + * See handler_interface::memory_estimate for a description of this + * function. + */ + static size_t + memory_estimate(size_t request_size, connection_context& conn_ctx) { + return MemEstimator(request_size, conn_ctx); + } }; +/** + * A single-stage handler implements the entire request handling in the initial + * stage which occurs before any subsequent request is processed. 
+ */ +template< + typename RequestApi, + api_version::type MinSupported, + api_version::type MaxSupported, + memory_estimate_fn MemEstimator = default_estimate_adaptor> +using single_stage_handler = handler_template< + RequestApi, + MinSupported, + MaxSupported, + ss::future, + MemEstimator>; + +/** + * A two-stage handler has an initial stage which happens before any other + * request can start processing (as in a single-stage handler) but then also has + * a second stage which is processed in the background allowing other requests + * on the same connection to start their handler. Responses are still sent in + * order, but processing is out-of-order. + */ +template< + typename RequestApi, + api_version::type MinSupported, + api_version::type MaxSupported, + memory_estimate_fn MemEstimator = default_estimate_adaptor> +using two_phase_handler = handler_template< + RequestApi, + MinSupported, + MaxSupported, + process_result_stages, + MemEstimator>; + template concept KafkaApiHandler = KafkaApi && requires( T h, request_context&& ctx, ss::smp_service_group g) { @@ -45,4 +108,7 @@ concept KafkaApiTwoPhaseHandler = KafkaApi && requires( { T::handle(std::move(ctx), g) } -> std::same_as; }; +template +concept KafkaApiHandlerAny = KafkaApiHandler || KafkaApiTwoPhaseHandler; + } // namespace kafka diff --git a/src/v/kafka/server/handlers/handler_interface.cc b/src/v/kafka/server/handlers/handler_interface.cc new file mode 100644 index 0000000000000..44593b0f96e42 --- /dev/null +++ b/src/v/kafka/server/handlers/handler_interface.cc @@ -0,0 +1,141 @@ +/* + * Copyright 2022 Redpanda Data, Inc. 
+ * + * Use of this software is governed by the Business Source License + * included in the file licenses/BSL.md + * + * As of the Change Date specified in that file, in accordance with + * the Business Source License, use of this software will be governed + * by the Apache License, Version 2.0 + */ +#include "kafka/server/handlers/handler_interface.h" + +#include "kafka/server/handlers/handlers.h" +#include "kafka/server/handlers/produce.h" +#include "kafka/server/response.h" +#include "kafka/types.h" + +#include + +namespace kafka { + +/** + * @brief Packages together basic information common to every handler. + */ +struct handler_info { + handler_info( + api_key key, + const char* name, + api_version min_api, + api_version max_api, + memory_estimate_fn* mem_estimate) noexcept + : _key(key) + , _name(name) + , _min_api(min_api) + , _max_api(max_api) + , _mem_estimate(mem_estimate) {} + + api_key _key; + const char* _name; + api_version _min_api, _max_api; + memory_estimate_fn* _mem_estimate; +}; + +/** + * @brief Creates a type-erased handler implementation given info and a handle + * method. + * + * There are only two variants of this handler, for one and two pass + * implementations. + * This keeps the generated code duplication to a minimum, compared to + * templating this on the handler type. 
+ * + * @tparam is_two_pass true if the handler is two-pass + */ +template +struct handler_base final : public handler_interface { + using single_pass_handler + = ss::future(request_context, ss::smp_service_group); + using two_pass_handler + = process_result_stages(request_context, ss::smp_service_group); + using fn_type + = std::conditional_t; + + handler_base(const handler_info& info, fn_type* handle_fn) noexcept + : _info(info) + , _handle_fn(handle_fn) {} + + api_version min_supported() const override { return _info._min_api; } + api_version max_supported() const override { return _info._max_api; } + + api_key key() const override { return _info._key; } + const char* name() const override { return _info._name; } + + size_t memory_estimate( + size_t request_size, connection_context& conn_ctx) const override { + return _info._mem_estimate(request_size, conn_ctx); + } + /** + * Only handle varies with one or two pass, since one pass handlers + * must pass through single_stage() to convert them to two-pass. + */ + process_result_stages + handle(request_context&& rc, ss::smp_service_group g) const override { + if constexpr (is_two_pass) { + return _handle_fn(std::move(rc), g); + } else { + return process_result_stages::single_stage( + _handle_fn(std::move(rc), g)); + } + } + +private: + handler_info _info; + fn_type* _handle_fn; +}; + +/** + * @brief Instance holder for the handler_base. + * + * Given a handler type H, exposes a static instance of the associated handler + * base object. + * + * @tparam H the handler type. 
+ */ +template +struct handler_holder { + static const inline handler_base> instance{ + handler_info{ + H::api::key, + H::api::name, + H::min_supported, + H::max_supported, + H::memory_estimate}, + H::handle}; +}; + +template +constexpr auto make_lut(type_list) { + constexpr int max_index = std::max({Ts::api::key...}); + static_assert(max_index < sizeof...(Ts) * 10, "LUT is too sparse"); + + std::array lut{}; + ((lut[Ts::api::key] = &handler_holder::instance), ...); + + return lut; +} + +std::optional handler_for_key(kafka::api_key key) noexcept { + static constexpr auto lut = make_lut(request_types{}); + if (key >= (short)0 && key < (short)lut.size()) { + // We have already checked the bounds above so it is safe to use [] + // instead of at() + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index) + if (auto handler = lut[key]) { + return handler; + } + } + return std::nullopt; +} + +} // namespace kafka diff --git a/src/v/kafka/server/handlers/handler_interface.h b/src/v/kafka/server/handlers/handler_interface.h new file mode 100644 index 0000000000000..dc4857e9f9933 --- /dev/null +++ b/src/v/kafka/server/handlers/handler_interface.h @@ -0,0 +1,111 @@ +/* + * Copyright 2022 Redpanda Data, Inc. + * + * Use of this software is governed by the Business Source License + * included in the file licenses/BSL.md + * + * As of the Change Date specified in that file, in accordance with + * the Business Source License, use of this software will be governed + * by the Apache License, Version 2.0 + */ +#pragma once +#include "kafka/server/fwd.h" +#include "kafka/server/response.h" +#include "kafka/types.h" + +namespace kafka { +/** + * @brief Runtime polymorphic handler type. + * + * Allows access to all kafka request handling implementations through a + * type erased interface. This avoids the need to bring every handler + * type into scope and make everything that touches the handler a template + * function on the handler type. 
+ * + */ +struct handler_interface { + /** + * @brief The minimum supported API version, inclusive. + */ + virtual api_version min_supported() const = 0; + + /** + * @brief The maximum supported API version, inclusive. + */ + virtual api_version max_supported() const = 0; + + /** + * @brief The name of the API method. + */ + virtual const char* name() const = 0; + + /** + * @brief The API key associated with the method. + */ + virtual api_key key() const = 0; + + /** + * @brief Estimates the memory used to process the request. + * + * Returns an estimate of the memory needed to process a request. This is + * used to block the request until sufficient memory is available using the + * "memory units" semaphore. Ideally this should be a conservative estimate + * (i.e., a possible overestimate in cases where the memory use may vary + * significantly) as the result of a too-small estimate may be an + * out-of-memory condition, while a too-large estimate will "merely" reduce + * performance. + * + * Unfortunately, this estimate happens early in the decoding process, after + * only the request size and header have been read, so handlers don't have + * as much information as they may like to make this decision. The + * connection_context for the associated connection is passed to give access + * to global state which may be useful in making the estimate. + */ + virtual size_t memory_estimate( + size_t request_size, connection_context& conn_ctx) const = 0; + + /** + * @brief Handles the request. + * + * Invokes the request handler with the given request context + * (which will be moved from) and smp_service_group. + * + * The result stages object contains futures for both the initial + * dispatch phase, and the final response. For API methods which + * are implemented in a single phase, the same type is returned, but + * the response future will complete as soon as the dispatch one does. + * + * @return process_result_stages representing the future completion of + * the handler. 
+ */ + virtual process_result_stages + handle(request_context&&, ss::smp_service_group) const = 0; + + virtual ~handler_interface() = default; +}; + +/** + * @brief Pointer to a handler. + * + * Most code will use handler objects, which are simply pointers + * to handlers, generally const objects with static storage duration + * obtained from handler_for_key. + */ +using handler = const handler_interface*; + +/** + * @brief Return a handler for the given key, if any. + * + * Returns a pointer to a constant singleton handler for the given + * key, or an empty optional if no such handler exists. The contained + * handler is guaranteed to be non-null if the optional has a value. + * + * This method looks up the handler in a table populated by all handlers + * in kafka::request_types. + * + * @param key the API key for the handler + * @return std::optional the handler, if any + */ +std::optional handler_for_key(api_key key) noexcept; + +} // namespace kafka diff --git a/src/v/kafka/server/handlers/handlers.h b/src/v/kafka/server/handlers/handlers.h index 80cb2d22ebfbf..d98c4d0885f01 100644 --- a/src/v/kafka/server/handlers/handlers.h +++ b/src/v/kafka/server/handlers/handlers.h @@ -87,4 +87,11 @@ using request_types = make_request_types< end_txn_handler, create_partitions_handler, offset_for_leader_epoch_handler>; + +template +static constexpr size_t max_api_key(type_list) { + /// Black magic here is an overload of std::max() that takes an + /// std::initializer_list + return std::max({RequestTypes::api::key()...}); +} } // namespace kafka diff --git a/src/v/kafka/server/handlers/heartbeat.h b/src/v/kafka/server/handlers/heartbeat.h index 27a4c22b1cbd9..437279760a549 100644 --- a/src/v/kafka/server/handlers/heartbeat.h +++ b/src/v/kafka/server/handlers/heartbeat.h @@ -14,6 +14,6 @@ namespace kafka { -using heartbeat_handler = handler; +using heartbeat_handler = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/incremental_alter_configs.h 
b/src/v/kafka/server/handlers/incremental_alter_configs.h index 8e902f5da6b36..9dbfde6be92e6 100644 --- a/src/v/kafka/server/handlers/incremental_alter_configs.h +++ b/src/v/kafka/server/handlers/incremental_alter_configs.h @@ -15,6 +15,6 @@ namespace kafka { using incremental_alter_configs_handler - = handler; + = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/init_producer_id.h b/src/v/kafka/server/handlers/init_producer_id.h index 5068f4325684b..be4c7d0a080a7 100644 --- a/src/v/kafka/server/handlers/init_producer_id.h +++ b/src/v/kafka/server/handlers/init_producer_id.h @@ -14,6 +14,7 @@ namespace kafka { -using init_producer_id_handler = handler; +using init_producer_id_handler + = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/join_group.cc b/src/v/kafka/server/handlers/join_group.cc index f7bc53b80a4e4..57fe29ca93623 100644 --- a/src/v/kafka/server/handlers/join_group.cc +++ b/src/v/kafka/server/handlers/join_group.cc @@ -35,6 +35,7 @@ static void decode_request(request_context& ctx, join_group_request& req) { fmt::format("{}", ctx.connection()->client_host())); } +template<> process_result_stages join_group_handler::handle( request_context ctx, [[maybe_unused]] ss::smp_service_group g) { join_group_request request; diff --git a/src/v/kafka/server/handlers/join_group.h b/src/v/kafka/server/handlers/join_group.h index 1d3ec508350a7..1830badc2f277 100644 --- a/src/v/kafka/server/handlers/join_group.h +++ b/src/v/kafka/server/handlers/join_group.h @@ -14,10 +14,6 @@ namespace kafka { -struct join_group_handler { - using api = join_group_api; - static constexpr api_version min_supported = api_version(0); - static constexpr api_version max_supported = api_version(5); - static process_result_stages handle(request_context, ss::smp_service_group); -}; +using join_group_handler = two_phase_handler; + } // namespace kafka diff --git a/src/v/kafka/server/handlers/leave_group.h b/src/v/kafka/server/handlers/leave_group.h index 
a959c6dc4ddd3..61adf1450dec7 100644 --- a/src/v/kafka/server/handlers/leave_group.h +++ b/src/v/kafka/server/handlers/leave_group.h @@ -14,6 +14,6 @@ namespace kafka { -using leave_group_handler = handler; +using leave_group_handler = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/list_groups.h b/src/v/kafka/server/handlers/list_groups.h index efe1657ae0827..b345f794a0e99 100644 --- a/src/v/kafka/server/handlers/list_groups.h +++ b/src/v/kafka/server/handlers/list_groups.h @@ -14,6 +14,6 @@ namespace kafka { -using list_groups_handler = handler; +using list_groups_handler = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/list_offsets.h b/src/v/kafka/server/handlers/list_offsets.h index 896d0344b42aa..bb88af1b1a7e0 100644 --- a/src/v/kafka/server/handlers/list_offsets.h +++ b/src/v/kafka/server/handlers/list_offsets.h @@ -14,6 +14,6 @@ namespace kafka { -using list_offsets_handler = handler; +using list_offsets_handler = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/metadata.cc b/src/v/kafka/server/handlers/metadata.cc index a372d5b6a4fec..2667b11176f56 100644 --- a/src/v/kafka/server/handlers/metadata.cc +++ b/src/v/kafka/server/handlers/metadata.cc @@ -14,10 +14,13 @@ #include "cluster/types.h" #include "config/configuration.h" #include "config/node_config.h" +#include "kafka/protocol/schemata/metadata_response.h" #include "kafka/server/errors.h" +#include "kafka/server/fwd.h" #include "kafka/server/handlers/details/leader_epoch.h" #include "kafka/server/handlers/details/security.h" #include "kafka/server/handlers/topics/topic_utils.h" +#include "kafka/server/response.h" #include "kafka/types.h" #include "likely.h" #include "model/metadata.h" @@ -229,7 +232,7 @@ get_topic_metadata(request_context& ctx, metadata_request& request) { authz_quiet{true})) { continue; } - res.push_back(make_topic_response(ctx, request, md)); + res.push_back(make_topic_response(ctx, request, md.metadata)); } return 
ss::make_ready_future>( @@ -422,4 +425,80 @@ ss::future metadata_handler::handle( co_return co_await ctx.respond(std::move(reply)); } +size_t +metadata_memory_estimator(size_t request_size, connection_context& conn_ctx) { + // We cannot make a precise estimate of the size of a metadata response by + // examining only the size of the request (nor even by examining the entire + // request) since the response depends on the number of partitions in the + // cluster. Instead, we return a conservative estimate based on the current + // number of topics & partitions in the cluster. + + // Essentially we need to estimate the size taken by a "maximum size" + // metadata_response_data response. The maximum size is when metadata for + // all topics is returned, which is also a common case in practice. This + // involves calculating the size for each topic's portion of the response, + // since the size varies both based on the number of partitions and the + // replica count. + + // We start with a base estimate of 10K and then proceed to ignore + // everything other than the topic/partition part of the response, since + // that's what takes space in large responses and we assume the remaining + // part of the response (the broker list being the second largest part) will + // fit in this 10000-byte slush fund. + size_t size_estimate = 10000; + + auto& md_cache = conn_ctx.server().metadata_cache(); + + // The size will vary with the number of brokers, though this effect is + // probably small if there are large numbers of partitions + + // This covers the variable part of the broker response, i.e., the broker + // hostname + rack. We just hope these are less than this amount, because we + // don't want to execute the relatively complex logic to guess the listener + // just for the size estimate. 
+ constexpr size_t extra_bytes_per_broker = 200; + size_estimate + += md_cache.all_brokers().size() + * (sizeof(metadata_response_broker) + extra_bytes_per_broker); + + for (auto& [tp_ns, topic_metadata] : md_cache.all_topics_metadata()) { + // metadata_response_topic + size_estimate += sizeof(kafka::metadata_response_topic); + size_estimate += tp_ns.tp().size(); + + using partition = kafka::metadata_response_partition; + + // Base number of bytes needed to represent each partition, ignoring the + // variable part attributable to the replica count, we just take as the + // size of the partition response structure. + constexpr size_t bytes_per_partition = sizeof(partition); + + // Then, we need the number of additional bytes per replica, per + // partition, associated with storing the replica list in + // metadata_response_partition::replica_nodes/isr_nodes, which we take to + // be the size of the elements in those lists (4 bytes each). + constexpr size_t bytes_per_replica = sizeof(partition::replica_nodes[0]) + + sizeof(partition::isr_nodes[0]); + + // The actual partition and replica count for this topic. + int32_t pcount = topic_metadata.get_configuration().partition_count; + int32_t rcount = topic_metadata.get_configuration().replication_factor; + + size_estimate += pcount + * (bytes_per_partition + bytes_per_replica * rcount); + } + + // Finally, we double the estimate, because the highwater mark for memory + // use comes when the in-memory structures (metadata_response_data and + // subobjects) exist on the heap and they are encoded into the response, + // which will also exist on the heap. The calculation above handles the + // first size, and the encoded response ends up being very similar in size, + // so we double the estimate to account for both. 
+ size_estimate *= 2; + + // We still add on the default_estimate to handle the size of the request + // itself and miscellaneous other processing (this is a small adjustment, + // generally ~8000 bytes). 
+ return default_memory_estimate(request_size) + size_estimate; +} } // namespace kafka diff --git a/src/v/kafka/server/handlers/metadata.h b/src/v/kafka/server/handlers/metadata.h index 89445b193fd0f..bd0e78bb70039 100644 --- a/src/v/kafka/server/handlers/metadata.h +++ b/src/v/kafka/server/handlers/metadata.h @@ -14,6 +14,17 @@ namespace kafka { -using metadata_handler = handler; +/** + * Estimate the size of a metadata request. + * + * Metadata requests are generally very small (a request for *all* metadata + * about a cluster is less than 30 bytes) but the response may be very large, so + * the default estimator is unsuitable. See the implementation for further + * notes. + */ +memory_estimate_fn metadata_memory_estimator; + +using metadata_handler + = single_stage_handler; -} +} // namespace kafka diff --git a/src/v/kafka/server/handlers/offset_commit.cc b/src/v/kafka/server/handlers/offset_commit.cc index ddaa68616aaba..7c1c4d10976a0 100644 --- a/src/v/kafka/server/handlers/offset_commit.cc +++ b/src/v/kafka/server/handlers/offset_commit.cc @@ -53,6 +53,7 @@ struct offset_commit_ctx { , ssg(ssg) {} }; +template<> process_result_stages offset_commit_handler::handle(request_context ctx, ss::smp_service_group ssg) { offset_commit_request request; diff --git a/src/v/kafka/server/handlers/offset_commit.h b/src/v/kafka/server/handlers/offset_commit.h index 2355c580aeac2..5a0512d4d043c 100644 --- a/src/v/kafka/server/handlers/offset_commit.h +++ b/src/v/kafka/server/handlers/offset_commit.h @@ -11,16 +11,13 @@ #pragma once #include "kafka/protocol/offset_commit.h" #include "kafka/server/handlers/handler.h" +#include "kafka/server/response.h" namespace kafka { // in version 0 kafka stores offsets in zookeeper. if we ever need to // support version 0 then we need to do some code review to see if this has // any implications on semantics. 
-struct offset_commit_handler { - using api = offset_commit_api; - static constexpr api_version min_supported = api_version(1); - static constexpr api_version max_supported = api_version(7); - static process_result_stages handle(request_context, ss::smp_service_group); -}; +using offset_commit_handler = two_phase_handler; + } // namespace kafka diff --git a/src/v/kafka/server/handlers/offset_fetch.h b/src/v/kafka/server/handlers/offset_fetch.h index 5cb87438b02ba..64ff13891db5e 100644 --- a/src/v/kafka/server/handlers/offset_fetch.h +++ b/src/v/kafka/server/handlers/offset_fetch.h @@ -17,6 +17,6 @@ namespace kafka { // in version 0 kafka stores offsets in zookeeper. if we ever need to // support version 0 then we need to do some code review to see if this has // any implications on semantics. -using offset_fetch_handler = handler; +using offset_fetch_handler = single_stage_handler; } // namespace kafka diff --git a/src/v/kafka/server/handlers/offset_for_leader_epoch.h b/src/v/kafka/server/handlers/offset_for_leader_epoch.h index e191aabf2a4c1..0c0e047b10a64 100644 --- a/src/v/kafka/server/handlers/offset_for_leader_epoch.h +++ b/src/v/kafka/server/handlers/offset_for_leader_epoch.h @@ -15,5 +15,5 @@ namespace kafka { using offset_for_leader_epoch_handler - = handler; + = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/produce.cc b/src/v/kafka/server/handlers/produce.cc index 687b127c6c0b4..29b2dfcde8a0d 100644 --- a/src/v/kafka/server/handlers/produce.cc +++ b/src/v/kafka/server/handlers/produce.cc @@ -43,6 +43,8 @@ namespace kafka { +static constexpr auto despam_interval = std::chrono::minutes(5); + produce_response produce_request::make_error_response(error_code error) const { produce_response response; @@ -139,7 +141,7 @@ static error_code map_produce_error_code(std::error_code ec) { case raft::errc::shutting_down: return error_code::request_timed_out; default: - return error_code::unknown_server_error; + return error_code::request_timed_out; } 
} @@ -157,11 +159,11 @@ static error_code map_produce_error_code(std::error_code ec) { case cluster::errc::invalid_request: return error_code::invalid_request; default: - return error_code::unknown_server_error; + return error_code::request_timed_out; } } - return error_code::unknown_server_error; + return error_code::request_timed_out; } /* @@ -198,7 +200,7 @@ static partition_produce_stages partition_append( p.error_code = map_produce_error_code(r.error()); } } catch (...) { - p.error_code = error_code::unknown_server_error; + p.error_code = error_code::request_timed_out; } return p; }), @@ -464,6 +466,7 @@ static std::vector produce_topics(produce_ctx& octx) { return topics; } +template<> process_result_stages produce_handler::handle(request_context ctx, ss::smp_service_group ssg) { produce_request request; diff --git a/src/v/kafka/server/handlers/produce.h b/src/v/kafka/server/handlers/produce.h index ae7673858d77b..617b7cb1c98f7 100644 --- a/src/v/kafka/server/handlers/produce.h +++ b/src/v/kafka/server/handlers/produce.h @@ -14,12 +14,6 @@ namespace kafka { -struct produce_handler { - using api = produce_api; - static constexpr api_version min_supported = api_version(0); - static constexpr api_version max_supported = api_version(7); - static process_result_stages handle(request_context, ss::smp_service_group); - static constexpr auto despam_interval = std::chrono::minutes(5); -}; +using produce_handler = two_phase_handler; } // namespace kafka diff --git a/src/v/kafka/server/handlers/sasl_authenticate.h b/src/v/kafka/server/handlers/sasl_authenticate.h index d86e3152223c4..5165e094db17a 100644 --- a/src/v/kafka/server/handlers/sasl_authenticate.h +++ b/src/v/kafka/server/handlers/sasl_authenticate.h @@ -14,6 +14,7 @@ namespace kafka { -using sasl_authenticate_handler = handler; +using sasl_authenticate_handler + = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/sasl_handshake.h b/src/v/kafka/server/handlers/sasl_handshake.h index 
d5a9343f939bc..7d5e5a3867f2a 100644 --- a/src/v/kafka/server/handlers/sasl_handshake.h +++ b/src/v/kafka/server/handlers/sasl_handshake.h @@ -14,6 +14,6 @@ namespace kafka { -using sasl_handshake_handler = handler; +using sasl_handshake_handler = single_stage_handler; } diff --git a/src/v/kafka/server/handlers/sync_group.cc b/src/v/kafka/server/handlers/sync_group.cc index fcbfe4a1c7437..0ddee9864020e 100644 --- a/src/v/kafka/server/handlers/sync_group.cc +++ b/src/v/kafka/server/handlers/sync_group.cc @@ -21,6 +21,7 @@ namespace kafka { +template<> process_result_stages sync_group_handler::handle( request_context ctx, [[maybe_unused]] ss::smp_service_group g) { sync_group_request request; diff --git a/src/v/kafka/server/handlers/sync_group.h b/src/v/kafka/server/handlers/sync_group.h index 711cca63bae92..b23ceb79578aa 100644 --- a/src/v/kafka/server/handlers/sync_group.h +++ b/src/v/kafka/server/handlers/sync_group.h @@ -14,11 +14,6 @@ namespace kafka { -struct sync_group_handler { - using api = sync_group_api; - static constexpr api_version min_supported = api_version(0); - static constexpr api_version max_supported = api_version(3); - static process_result_stages handle(request_context, ss::smp_service_group); -}; +using sync_group_handler = two_phase_handler; } // namespace kafka diff --git a/src/v/kafka/server/handlers/topics/topic_utils.h b/src/v/kafka/server/handlers/topics/topic_utils.h index 957ecb0cc3816..2023a28e3d310 100644 --- a/src/v/kafka/server/handlers/topics/topic_utils.h +++ b/src/v/kafka/server/handlers/topics/topic_utils.h @@ -45,7 +45,10 @@ template requires TopicRequestItem creatable_topic_result generate_error(T item, error_code code, const ss::sstring& msg) { return creatable_topic_result{ - .name = item.name, .error_code = code, .error_message = msg}; + .name = item.name, + .error_code = code, + .error_message = msg, + .topic_config_error_code = code}; } /// Generates successfull creatable_topic_result for single topic request item diff 
--git a/src/v/kafka/server/handlers/topics/types.cc b/src/v/kafka/server/handlers/topics/types.cc index 97362b777d705..80594c920795c 100644 --- a/src/v/kafka/server/handlers/topics/types.cc +++ b/src/v/kafka/server/handlers/topics/types.cc @@ -33,7 +33,12 @@ namespace kafka { -config_map_t config_map(const std::vector& config) { +template +concept CreatableTopicCfg = std::is_same_v || std:: + is_same_v; + +template +config_map_t make_config_map(const std::vector& config) { config_map_t ret; ret.reserve(config.size()); for (const auto& c : config) { @@ -44,6 +49,14 @@ config_map_t config_map(const std::vector& config) { return ret; } +config_map_t config_map(const std::vector& config) { + return make_config_map(config); +} + +config_map_t config_map(const std::vector& config) { + return make_config_map(config); +} + // Either parse configuration or return nullopt template static std::optional @@ -139,10 +152,13 @@ to_cluster_type(const creatable_topic& t) { cfg.properties.recovery = get_bool_value( config_entries, topic_property_recovery); cfg.properties.shadow_indexing = get_shadow_indexing_mode(config_entries); - cfg.properties.read_replica = get_bool_value( - config_entries, topic_property_read_replica); cfg.properties.read_replica_bucket = get_string_value( - config_entries, topic_property_read_replica_bucket); + config_entries, topic_property_read_replica); + if (cfg.properties.read_replica_bucket.has_value()) { + cfg.properties.read_replica = true; + } + /// Final topic_property not decoded here is \ref remote_topic_properties, + /// is more of an implementation detail no need to ever show user auto ret = cluster::custom_assignable_topic_configuration(cfg); /** @@ -164,4 +180,80 @@ to_cluster_type(const creatable_topic& t) { return ret; } +template +static ss::sstring from_config_type(const T& v) { + if constexpr (std::is_enum_v) { + return ss::to_sstring(static_cast>(v)); + } else if constexpr (std::is_same_v) { + return v ? 
"true" : "false"; + } else if constexpr (std::is_same_v) { + return ss::to_sstring( + std::chrono::duration_cast(v).count()); + } else { + return ss::to_sstring(v); + } +} + +config_map_t from_cluster_type(const cluster::topic_properties& properties) { + config_map_t config_entries; + if (properties.compression) { + config_entries[topic_property_compression] = from_config_type( + *properties.compression); + } + if (properties.cleanup_policy_bitflags) { + config_entries[topic_property_cleanup_policy] = from_config_type( + *properties.cleanup_policy_bitflags); + } + if (properties.compaction_strategy) { + config_entries[topic_property_compaction_strategy] = from_config_type( + *properties.compaction_strategy); + } + if (properties.timestamp_type) { + config_entries[topic_property_timestamp_type] = from_config_type( + *properties.timestamp_type); + } + if (properties.segment_size) { + config_entries[topic_property_segment_size] = from_config_type( + *properties.segment_size); + } + if (properties.retention_bytes.has_value()) { + config_entries[topic_property_retention_bytes] = from_config_type( + properties.retention_bytes.value()); + } + if (properties.retention_duration.has_value()) { + config_entries[topic_property_retention_duration] = from_config_type( + *properties.retention_duration); + } + if (properties.recovery) { + config_entries[topic_property_recovery] = from_config_type( + *properties.recovery); + } + if (properties.shadow_indexing) { + config_entries[topic_property_remote_write] = "false"; + config_entries[topic_property_remote_read] = "false"; + + switch (*properties.shadow_indexing) { + case model::shadow_indexing_mode::archival: + config_entries[topic_property_remote_write] = "true"; + break; + case model::shadow_indexing_mode::fetch: + config_entries[topic_property_remote_read] = "true"; + break; + case model::shadow_indexing_mode::full: + config_entries[topic_property_remote_write] = "true"; + config_entries[topic_property_remote_read] = "true"; + 
break; + default: + break; + } + } + if (properties.read_replica_bucket) { + config_entries[topic_property_read_replica] = from_config_type( + *properties.read_replica_bucket); + } + /// Final topic_property not encoded here is \ref remote_topic_properties, + /// is more of an implementation detail no need to ever show user + return config_entries; +} + } // namespace kafka diff --git a/src/v/kafka/server/handlers/topics/types.h b/src/v/kafka/server/handlers/topics/types.h index fa96d73862237..0c57d1bd458a7 100644 --- a/src/v/kafka/server/handlers/topics/types.h +++ b/src/v/kafka/server/handlers/topics/types.h @@ -53,8 +53,6 @@ static constexpr std::string_view topic_property_remote_read = "redpanda.remote.read"; static constexpr std::string_view topic_property_read_replica = "redpanda.remote.readreplica"; -static constexpr std::string_view topic_property_read_replica_bucket - = "redpanda.remote.readreplica.bucket"; // Data-policy property static constexpr std::string_view topic_property_data_policy_function_name @@ -109,8 +107,10 @@ from_cluster_topic_result(const cluster::topic_result& err) { } config_map_t config_map(const std::vector& config); +config_map_t config_map(const std::vector& config); cluster::custom_assignable_topic_configuration to_cluster_type(const creatable_topic& t); +config_map_t from_cluster_type(const cluster::topic_properties&); } // namespace kafka diff --git a/src/v/kafka/server/handlers/topics/validators.h b/src/v/kafka/server/handlers/topics/validators.h index cb346517beff0..12a92e67a89bb 100644 --- a/src/v/kafka/server/handlers/topics/validators.h +++ b/src/v/kafka/server/handlers/topics/validators.h @@ -163,47 +163,6 @@ struct remote_read_and_write_are_not_supported_for_read_replica { } }; -struct s3_bucket_is_required_for_read_replica { - static constexpr error_code ec = error_code::invalid_config; - static constexpr const char* error_message - = "s3 bucket should be provided for read replica topic"; - - static bool is_valid(const 
creatable_topic& c) { - auto config_entries = config_map(c.configs); - auto end = config_entries.end(); - bool is_read_replica - = (config_entries.find(topic_property_read_replica) != end); - bool s3_bucket_provided - = (config_entries.find(topic_property_read_replica_bucket) != end); - - if (is_read_replica && !s3_bucket_provided) { - return false; - } - return true; - } -}; - -struct s3_bucket_is_supported_only_for_read_replica { - static constexpr error_code ec = error_code::invalid_config; - static constexpr const char* error_message - = "s3 bucket is supported only when redpanda.remote.readreplica is " - "enabled"; - - static bool is_valid(const creatable_topic& c) { - auto config_entries = config_map(c.configs); - auto end = config_entries.end(); - bool is_read_replica - = (config_entries.find(topic_property_read_replica) != end); - bool s3_bucket_provided - = (config_entries.find(topic_property_read_replica_bucket) != end); - - if (!is_read_replica && s3_bucket_provided) { - return false; - } - return true; - } -}; - struct compression_type_validator_details { using validated_type = model::compression; diff --git a/src/v/kafka/server/handlers/txn_offset_commit.h b/src/v/kafka/server/handlers/txn_offset_commit.h index dcb1fc5786182..c7cebe25954fe 100644 --- a/src/v/kafka/server/handlers/txn_offset_commit.h +++ b/src/v/kafka/server/handlers/txn_offset_commit.h @@ -14,6 +14,7 @@ namespace kafka { -using txn_offset_commit_handler = handler; +using txn_offset_commit_handler + = single_stage_handler; } diff --git a/src/v/kafka/server/protocol.cc b/src/v/kafka/server/protocol.cc index f8167d884537f..2ed5678734380 100644 --- a/src/v/kafka/server/protocol.cc +++ b/src/v/kafka/server/protocol.cc @@ -10,13 +10,18 @@ #include "protocol.h" #include "cluster/topics_frontend.h" +#include "config/broker_authn_endpoint.h" #include "config/configuration.h" +#include "config/node_config.h" #include "kafka/server/connection_context.h" #include 
"kafka/server/coordinator_ntp_mapper.h" #include "kafka/server/group_router.h" #include "kafka/server/logger.h" #include "kafka/server/request_context.h" #include "kafka/server/response.h" +#include "net/connection.h" +#include "security/errc.h" +#include "security/exceptions.h" #include "security/mtls.h" #include "security/scram_algorithm.h" #include "utils/utf8.h" @@ -80,7 +85,9 @@ protocol::protocol( , _controller_api(controller_api) , _tx_gateway_frontend(tx_gateway_frontend) , _coproc_partition_manager(coproc_partition_manager) - , _data_policy_table(data_policy_table) { + , _data_policy_table(data_policy_table) + , _mtls_principal_mapper( + config::shard_local_cfg().kafka_mtls_principal_mapping_rules.bind()) { if (qdc_config) { _qdc_mon.emplace(*qdc_config); } @@ -92,35 +99,95 @@ coordinator_ntp_mapper& protocol::coordinator_mapper() { return _group_router.local().coordinator_mapper().local(); } +config::broker_authn_method get_authn_method(const net::connection& conn) { + // If authn_method is set on the endpoint + // Use it + // Else if kafka_enable_authorization is not set + // Use sasl if enable_sasl + // Else if has mtls mapping rules + // Use mtls_identity + // Else + // Disable AuthN + + std::optional authn_method; + auto n = conn.name(); + const auto& kafka_api = config::node().kafka_api.value(); + auto ep_it = std::find_if( + kafka_api.begin(), + kafka_api.end(), + [&n](const config::broker_authn_endpoint& ep) { return ep.name == n; }); + if (ep_it != kafka_api.end()) { + authn_method = ep_it->authn_method; + } + if (authn_method.has_value()) { + return *authn_method; + } + const auto& config = config::shard_local_cfg(); + // if kafka_enable_authorization is not set, use sasl iff enable_sasl + if ( + !config.kafka_enable_authorization().has_value() + && config.enable_sasl()) { + return config::broker_authn_method::sasl; + } + return config::broker_authn_method::none; +} + +ss::future get_mtls_principal_state( + const security::tls::principal_mapper& 
pm, net::connection& conn) { + using namespace std::chrono_literals; + return ss::with_timeout( + model::timeout_clock::now() + 5s, conn.get_distinguished_name()) + .then([&pm](std::optional dn) { + ss::sstring anonymous_principal; + if (!dn.has_value()) { + vlog(klog.info, "failed to fetch distinguished name"); + return security::tls::mtls_state{anonymous_principal}; + } + auto principal = pm.apply(dn->subject); + if (!principal) { + vlog( + klog.info, + "failed to extract principal from distinguished name: {}", + dn->subject); + return security::tls::mtls_state{anonymous_principal}; + } + + vlog( + klog.debug, + "got principal: {}, from distinguished name: {}", + *principal, + dn->subject); + return security::tls::mtls_state{*principal}; + }); +} + ss::future<> protocol::apply(net::server::resources rs) { + const bool authz_enabled + = config::shard_local_cfg().kafka_enable_authorization().value_or( + config::shard_local_cfg().enable_sasl()); + const auto authn_method = get_authn_method(*rs.conn); + /* * if sasl authentication is not enabled then initialize the sasl state to * complete. this will cause auth to be skipped during request processing. - * - * TODO: temporarily acl authorization is enabled/disabled based on sasl - * being enabled/disabled. it may be useful to configure them separately, - * but this will come when identity management is introduced. */ security::sasl_server sasl( - config::shard_local_cfg().enable_sasl() + authn_method == config::broker_authn_method::sasl ? 
security::sasl_server::sasl_state::initial : security::sasl_server::sasl_state::complete); - const auto enable_mtls_authentication - = rs.conn->get_principal_mapping().has_value() - && feature_table().local().is_active( - cluster::feature::mtls_authentication); + std::optional mtls_state; + if (authn_method == config::broker_authn_method::mtls_identity) { + mtls_state = co_await get_mtls_principal_state( + _mtls_principal_mapper, *rs.conn); + } auto ctx = ss::make_lw_shared( - *this, - std::move(rs), - std::move(sasl), - config::shard_local_cfg().enable_sasl(), - enable_mtls_authentication); - - return ss::do_until( - [ctx] { return ctx->is_finished_parsing(); }, - [ctx] { return ctx->process_one_request(); }) + *this, std::move(rs), std::move(sasl), authz_enabled, mtls_state); + + co_return co_await ss::do_until( + [ctx] { return ctx->is_finished_parsing(); }, + [ctx] { return ctx->process_one_request(); }) .handle_exception([ctx](std::exception_ptr eptr) { auto disconnected = net::is_disconnect_exception(eptr); if (config::shard_local_cfg().enable_sasl()) { diff --git a/src/v/kafka/server/protocol.h b/src/v/kafka/server/protocol.h index 84ba9aced12df..b65b2f3011782 100644 --- a/src/v/kafka/server/protocol.h +++ b/src/v/kafka/server/protocol.h @@ -21,6 +21,7 @@ #include "net/server.h" #include "security/authorizer.h" #include "security/credential_store.h" +#include "security/mtls.h" #include "utils/ema.h" #include "v8_engine/data_policy_table.h" @@ -159,6 +160,7 @@ class protocol final : public net::server::protocol { ss::sharded& _data_policy_table; std::optional _qdc_mon; kafka::fetch_metadata_cache _fetch_metadata_cache; + security::tls::principal_mapper _mtls_principal_mapper; latency_probe _probe; }; diff --git a/src/v/kafka/server/replicated_partition.cc b/src/v/kafka/server/replicated_partition.cc index faf03c03d452a..4bce1aa8766e4 100644 --- a/src/v/kafka/server/replicated_partition.cc +++ b/src/v/kafka/server/replicated_partition.cc @@ -22,6 +22,7 @@ 
#include "storage/types.h" #include +#include #include @@ -38,6 +39,15 @@ replicated_partition::replicated_partition( ss::future replicated_partition::make_reader( storage::log_reader_config cfg, std::optional deadline) { + if ( + _partition->is_read_replica_mode_enabled() + && _partition->cloud_data_available()) { + // No need to translate the offsets in this case since all fetch + // requests in read replica are served via remote_partition which + // does its own translation. + co_return co_await _partition->make_cloud_reader(cfg); + } + auto local_kafka_start_offset = _translator->from_log_offset( _partition->start_offset()); if ( @@ -106,12 +116,9 @@ ss::future replicated_partition::make_reader( } ss::future> -replicated_partition::aborted_transactions( - model::offset base, - model::offset last, +replicated_partition::aborted_transactions_local( + cloud_storage::offset_range offsets, ss::lw_shared_ptr ot_state) { - vassert(ot_state, "ntp {}: offset translator state must be present", ntp()); - // Note: here we expect that local _partition contains aborted transaction // ids for both local and remote offset ranges. This is true as long as // rm_stm state has not been reset (for example when there is a partition @@ -119,14 +126,13 @@ replicated_partition::aborted_transactions( // eviction point). See // https://github.com/redpanda-data/redpanda/issues/3001 - auto base_rp = ot_state->to_log_offset(base); - auto last_rp = ot_state->to_log_offset(last); - auto source = co_await _partition->aborted_transactions(base_rp, last_rp); + auto source = co_await _partition->aborted_transactions( + offsets.begin_rp, offsets.end_rp); // We trim beginning of aborted ranges to `trim_at` because we don't have // offset translation info for earlier offsets. model::offset trim_at; - if (base_rp >= _partition->start_offset()) { + if (offsets.begin_rp >= _partition->start_offset()) { // Local fetch. Trim to start of the log - it is safe because clients // can't read earlier offsets. 
trim_at = _partition->start_offset(); @@ -135,7 +141,7 @@ replicated_partition::aborted_transactions( // incorrect because clients can still see earlier offsets but will work // if they won't use aborted ranges from this request to filter batches // belonging to earlier offsets. - trim_at = base_rp; + trim_at = offsets.begin_rp; } std::vector target; @@ -150,6 +156,85 @@ replicated_partition::aborted_transactions( co_return target; } +ss::future> +replicated_partition::aborted_transactions_remote( + cloud_storage::offset_range offsets, + ss::lw_shared_ptr ot_state) { + auto source = co_await _partition->aborted_transactions_cloud(offsets); + std::vector target; + target.reserve(source.size()); + for (const auto& range : source) { + target.push_back(cluster::rm_stm::tx_range{ + .pid = range.pid, + .first = ot_state->from_log_offset( + std::max(offsets.begin_rp, range.first)), + .last = ot_state->from_log_offset(range.last)}); + } + co_return target; +} + +ss::future> +replicated_partition::aborted_transactions( + model::offset base, + model::offset last, + ss::lw_shared_ptr ot_state) { + // We can extract information about aborted transactions from local raft log + // or from the S3 bucket. The decision is made using the following logic: + // - if the record batches were produced by shadow indexing (downloaded from + // S3) + // then we should use the same source for transactions metadata. It's + // guaranteed that in this case we will find the corresponding manifest + // (it's downloaded alongside the segment to SI cache). This also means + // that we will have the manifests hydrated on disk (since we just + // downloaded corresponding segments from S3 to produce batches). + // - if the source of data is local raft log then we should use abroted + // transactions + // snapshot. + // + // Sometimes the snapshot will have data for the offset range even if the + // source is S3 bucket. 
In this case we won't be using this data because + // it's not guaranteed that it has the data for the entire offset range and + // we won't be able to tell the difference by looking at the results (for + // instance, the offset range is 0-100, but the snapshot has data starting + // from offset 50, it will return data for range 50-100 and we won't be able + // to tell if it didn't have data for 0-50 or there wasn't any transactions + // in that range). + vassert(ot_state, "ntp {}: offset translator state must be present", ntp()); + auto base_rp = ot_state->to_log_offset(base); + auto last_rp = ot_state->to_log_offset(last); + cloud_storage::offset_range offsets = { + .begin = base, + .end = last, + .begin_rp = base_rp, + .end_rp = last_rp, + }; + if (_partition->is_read_replica_mode_enabled()) { + // Always use SI for read replicas + co_return co_await aborted_transactions_remote(offsets, ot_state); + } + if ( + _partition->cloud_data_available() + && offsets.begin_rp < _partition->start_offset()) { + // The fetch request was satisfied using shadow indexing. + auto tx_remote = co_await aborted_transactions_remote( + offsets, ot_state); + if (!tx_remote.empty()) { + // NOTE: we don't have a way to upload tx-manifests to the cloud + // for segments which was uploaded by old redpanda version because + // we can't guarantee that the local snapshot still has the data. + // This means that 'aborted_transaction_remote' might return empty + // result in case if the segment was uploaded by previous version of + // redpanda. In this case we will try to fetch the aborted + // transactions metadata from local snapshot. This approach provide + // the same guarantees that we have in v22.1 for data produced by + // v22.1 and earlier. But for new data we will guarantee that the + // metadata is always available in S3. 
+ co_return tx_remote; + } + } + co_return co_await aborted_transactions_local(offsets, ot_state); +} + ss::future> replicated_partition::timequery(storage::timequery_config cfg) { return _partition->timequery(cfg).then( @@ -165,11 +250,11 @@ ss::future> replicated_partition::replicate( model::record_batch_reader rdr, raft::replicate_options opts) { using ret_t = result; return _partition->replicate(std::move(rdr), opts) - .then([this](result r) { + .then([](result r) { if (!r) { return ret_t(r.error()); } - return ret_t(_translator->from_log_offset(r.value().last_offset)); + return ret_t(model::offset(r.value().last_offset())); }); } @@ -179,15 +264,18 @@ raft::replicate_stages replicated_partition::replicate( raft::replicate_options opts) { using ret_t = result; auto res = _partition->replicate_in_stages(batch_id, std::move(rdr), opts); - res.replicate_finished = res.replicate_finished.then( - [this](result r) { + + raft::replicate_stages out(raft::errc::success); + out.request_enqueued = std::move(res.request_enqueued); + out.replicate_finished = res.replicate_finished.then( + [](result r) { if (!r) { return ret_t(r.error()); } - return ret_t(raft::replicate_result{ - _translator->from_log_offset(r.value().last_offset)}); + return ret_t( + raft::replicate_result{model::offset(r.value().last_offset())}); }); - return res; + return out; } std::optional replicated_partition::get_leader_epoch_last_offset( diff --git a/src/v/kafka/server/replicated_partition.h b/src/v/kafka/server/replicated_partition.h index 5c4379762f680..e442021f1c2e3 100644 --- a/src/v/kafka/server/replicated_partition.h +++ b/src/v/kafka/server/replicated_partition.h @@ -37,6 +37,13 @@ class replicated_partition final : public kafka::partition_proxy::impl { const model::ntp& ntp() const final { return _partition->ntp(); } model::offset start_offset() const final { + if ( + _partition->is_read_replica_mode_enabled() + && _partition->cloud_data_available()) { + // Always assume remote read in this 
case. + return _partition->start_cloud_offset(); + } + auto local_kafka_start_offset = _translator->from_log_offset( _partition->start_offset()); if ( @@ -49,10 +56,25 @@ class replicated_partition final : public kafka::partition_proxy::impl { } model::offset high_watermark() const final { + if (_partition->is_read_replica_mode_enabled()) { + if (_partition->cloud_data_available()) { + return model::next_offset(_partition->last_cloud_offset()); + } else { + return model::offset(0); + } + } return _translator->from_log_offset(_partition->high_watermark()); } model::offset last_stable_offset() const final { + if (_partition->is_read_replica_mode_enabled()) { + if (_partition->cloud_data_available()) { + // There is no difference between HWM and LO in this mode + return model::next_offset(_partition->last_cloud_offset()); + } else { + return model::offset(0); + } + } return _translator->from_log_offset(_partition->last_stable_offset()); } @@ -103,6 +125,16 @@ class replicated_partition final : public kafka::partition_proxy::impl { model::offset, model::timeout_clock::time_point) final; private: + ss::future> + aborted_transactions_local( + cloud_storage::offset_range, + ss::lw_shared_ptr); + + ss::future> + aborted_transactions_remote( + cloud_storage::offset_range offsets, + ss::lw_shared_ptr ot_state); + ss::lw_shared_ptr _partition; ss::lw_shared_ptr _translator; }; diff --git a/src/v/kafka/server/request_context.h b/src/v/kafka/server/request_context.h index 1f7b1f35e7668..ddee4747a43f3 100644 --- a/src/v/kafka/server/request_context.h +++ b/src/v/kafka/server/request_context.h @@ -219,7 +219,8 @@ class request_context { }; // Executes the API call identified by the specified request_context. 
-process_result_stages process_request(request_context&&, ss::smp_service_group); +process_result_stages process_request( + request_context&&, ss::smp_service_group, const session_resources&); bool track_latency(api_key); diff --git a/src/v/kafka/server/requests.cc b/src/v/kafka/server/requests.cc index 8aec87556a3ed..950ff2ca2d628 100644 --- a/src/v/kafka/server/requests.cc +++ b/src/v/kafka/server/requests.cc @@ -7,8 +7,14 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0 -#include "kafka/server/handlers/handlers.h" -#include "kafka/server/handlers/produce.h" +#include "kafka/protocol/schemata/api_versions_request.h" +#include "kafka/protocol/schemata/fetch_request.h" +#include "kafka/protocol/schemata/produce_request.h" +#include "kafka/server/connection_context.h" +#include "kafka/server/handlers/api_versions.h" +#include "kafka/server/handlers/handler_interface.h" +#include "kafka/server/handlers/sasl_authenticate.h" +#include "kafka/server/handlers/sasl_handshake.h" #include "kafka/server/request_context.h" #include "kafka/types.h" #include "utils/to_string.h" @@ -33,53 +39,6 @@ struct process_dispatch { // clang-format on } }; -/** - * api_versions request processed in one stage however this template - * specialization exists so that the return value of the request can be examined - * by the connection layer. 
- */ -template<> -struct process_dispatch { - static process_result_stages - process(request_context&& ctx, ss::smp_service_group g) { - return process_result_stages::single_stage( - api_versions_handler::handle(std::move(ctx), g)); - } -}; - -/** - * Requests processed in two stages - */ -template<> -struct process_dispatch { - static process_result_stages - process(request_context&& ctx, ss::smp_service_group g) { - return produce_handler::handle(std::move(ctx), g); - } -}; - -template<> -struct process_dispatch { - static process_result_stages - process(request_context&& ctx, ss::smp_service_group g) { - return offset_commit_handler::handle(std::move(ctx), g); - } -}; -template<> -struct process_dispatch { - static process_result_stages - process(request_context&& ctx, ss::smp_service_group g) { - return join_group_handler::handle(std::move(ctx), g); - } -}; -template<> -struct process_dispatch { - static process_result_stages - process(request_context&& ctx, ss::smp_service_group g) { - return sync_group_handler::handle(std::move(ctx), g); - } -}; - class kafka_api_version_not_supported_exception : public std::runtime_error { public: explicit kafka_api_version_not_supported_exception(const std::string& m) @@ -121,6 +80,39 @@ requires(KafkaApiHandler || KafkaApiTwoPhaseHandler) return process_dispatch::process(std::move(ctx), g); } +process_result_stages process_generic( + handler handler, + request_context&& ctx, + ss::smp_service_group g, + const session_resources& sres) { + vlog( + klog.trace, + "[{}:{}] processing name:{}, key:{}, version:{} for {}, mem_units: {}", + ctx.connection()->client_host(), + ctx.connection()->client_port(), + handler->name(), + ctx.header().key, + ctx.header().version, + ctx.header().client_id.value_or(std::string_view("unset-client-id")), + sres.memlocks.count()); + + // We do a version check for most API requests, but for api_version + // requests we skip them. 
We do not apply them for api_versions, + // because the client does not yet know what + // versions this server supports. The api versions request is used by a + // client to query this information. + if (ctx.header().key != api_versions_api::key && + (ctx.header().version < handler->min_supported() || + ctx.header().version > handler->max_supported())) { + throw std::runtime_error(fmt::format( + "Unsupported version {} for {} API", + ctx.header().version, + handler->name())); + } + + return handler->handle(std::move(ctx), g); +} + class kafka_authentication_exception : public std::runtime_error { public: explicit kafka_authentication_exception(const std::string& m) @@ -161,7 +153,7 @@ handle_auth_handshake(request_context&& ctx, ss::smp_service_group g) { static ss::future handle_auth_initial(request_context&& ctx, ss::smp_service_group g) { switch (ctx.header().key) { - case api_versions_handler::api::key: { + case api_versions_api::key: { auto r = api_versions_handler::handle_raw(ctx); if (r.data.error_code == error_code::none) { ctx.sasl().set_state(security::sasl_server::sasl_state::handshake); @@ -247,16 +239,18 @@ handle_auth(request_context&& ctx, ss::smp_service_group g) { // only track latency for push and fetch requests bool track_latency(api_key key) { switch (key) { - case fetch_handler::api::key: - case produce_handler::api::key: + case fetch_api::key: + case produce_api::key: return true; default: return false; } } -process_result_stages -process_request(request_context&& ctx, ss::smp_service_group g) { +process_result_stages process_request( + request_context&& ctx, + ss::smp_service_group g, + const session_resources& sres) { /* * requests are handled as normal when auth is disabled. otherwise no * request is handled until the auth process has completed. 
@@ -274,47 +268,14 @@ process_request(request_context&& ctx, ss::smp_service_group g) { })); } - switch (ctx.header().key) { - case api_versions_handler::api::key: - return do_process(std::move(ctx), g); - case metadata_handler::api::key: - return do_process(std::move(ctx), g); - case list_groups_handler::api::key: - return do_process(std::move(ctx), g); - case find_coordinator_handler::api::key: - return do_process(std::move(ctx), g); - case offset_fetch_handler::api::key: - return do_process(std::move(ctx), g); - case produce_handler::api::key: - return do_process(std::move(ctx), g); - case list_offsets_handler::api::key: - return do_process(std::move(ctx), g); - case offset_commit_handler::api::key: - return do_process(std::move(ctx), g); - case fetch_handler::api::key: - return do_process(std::move(ctx), g); - case join_group_handler::api::key: - return do_process(std::move(ctx), g); - case heartbeat_handler::api::key: - return do_process(std::move(ctx), g); - case leave_group_handler::api::key: - return do_process(std::move(ctx), g); - case sync_group_handler::api::key: - return do_process(std::move(ctx), g); - case create_topics_handler::api::key: - return do_process(std::move(ctx), g); - case describe_configs_handler::api::key: - return do_process(std::move(ctx), g); - case alter_configs_handler::api::key: - return do_process(std::move(ctx), g); - case delete_topics_handler::api::key: - return do_process(std::move(ctx), g); - case describe_groups_handler::api::key: - return do_process(std::move(ctx), g); - case sasl_handshake_handler::api::key: + auto& key = ctx.header().key; + + if (key == sasl_handshake_handler::api::key) { return process_result_stages::single_stage(ctx.respond( sasl_handshake_response(error_code::illegal_sasl_state, {}))); - case sasl_authenticate_handler::api::key: { + } + + if (key == sasl_authenticate_handler::api::key) { sasl_authenticate_response_data data{ .error_code = error_code::illegal_sasl_state, .error_message = 
"Authentication process already completed", @@ -322,33 +283,11 @@ process_request(request_context&& ctx, ss::smp_service_group g) { return process_result_stages::single_stage( ctx.respond(sasl_authenticate_response(std::move(data)))); } - case init_producer_id_handler::api::key: - return do_process(std::move(ctx), g); - case incremental_alter_configs_handler::api::key: - return do_process(std::move(ctx), g); - case delete_groups_handler::api::key: - return do_process(std::move(ctx), g); - case describe_acls_handler::api::key: - return do_process(std::move(ctx), g); - case describe_log_dirs_handler::api::key: - return do_process(std::move(ctx), g); - case create_acls_handler::api::key: - return do_process(std::move(ctx), g); - case delete_acls_handler::api::key: - return do_process(std::move(ctx), g); - case add_partitions_to_txn_handler::api::key: - return do_process(std::move(ctx), g); - case txn_offset_commit_handler::api::key: - return do_process(std::move(ctx), g); - case add_offsets_to_txn_handler::api::key: - return do_process(std::move(ctx), g); - case end_txn_handler::api::key: - return do_process(std::move(ctx), g); - case create_partitions_handler::api::key: - return do_process(std::move(ctx), g); - case offset_for_leader_epoch_handler::api::key: - return do_process(std::move(ctx), g); - }; + + if (auto handler = handler_for_key(key)) { + return process_generic(*handler, std::move(ctx), g, sres); + } + throw std::runtime_error( fmt::format("Unsupported API {}", ctx.header().key)); } diff --git a/src/v/kafka/server/response.h b/src/v/kafka/server/response.h index c7be3de993d36..0bbe52bc978e7 100644 --- a/src/v/kafka/server/response.h +++ b/src/v/kafka/server/response.h @@ -105,4 +105,26 @@ struct process_result_stages { ss::future response; }; +/** + * @brief The default memory size estimate. 
+ * + * Request must make an up-front estimate of the amount of memory they will use, + * in order to obtain the corresponding number of units from the memory + * semaphore (blocking if they are not available). Each request type can use + * their own estimation approach, but if not specified this default estimator + * will be used. + * + * Now, this estimator is very poor for many request types: it only applies a + * multiplier to the request size, so only makes + * sense for requests (such as produce) where the size of the request is a + * good indicator of the total memory size. For requests with a small request + * but a large response (fetch, metadata, etc), it is not appropriate. + * + * @return size_t the estimated size required to process the request + */ +constexpr size_t default_memory_estimate(size_t request_size) { + // Allow for extra copies and bookkeeping + return request_size * 2 + 8000; // NOLINT +} + } // namespace kafka diff --git a/src/v/kafka/server/tests/CMakeLists.txt b/src/v/kafka/server/tests/CMakeLists.txt index 875fc931d94be..08298dec60459 100644 --- a/src/v/kafka/server/tests/CMakeLists.txt +++ b/src/v/kafka/server/tests/CMakeLists.txt @@ -8,13 +8,14 @@ rp_test( timeouts_conversion_test.cc types_conversion_tests.cc topic_utils_test.cc + handler_interface_test.cc DEFINITIONS BOOST_TEST_DYN_LINK - LIBRARIES Boost::unit_test_framework v::kafka + LIBRARIES Boost::unit_test_framework v::kafka v::coproc LABELS kafka ) + set(srcs - s3_imposter_fixture.cc consumer_groups_test.cc member_test.cc group_test.cc diff --git a/src/v/kafka/server/tests/create_topics_test.cc b/src/v/kafka/server/tests/create_topics_test.cc index ac7596021fadc..41e043e9a03e9 100644 --- a/src/v/kafka/server/tests/create_topics_test.cc +++ b/src/v/kafka/server/tests/create_topics_test.cc @@ -9,25 +9,19 @@ #include "kafka/protocol/create_topics.h" #include "kafka/protocol/metadata.h" +#include "kafka/server/handlers/topics/types.h" #include "redpanda/tests/fixture.h" #include 
"resource_mgmt/io_priority.h" -#include "s3_imposter_fixture.h" #include #include #include #include -#include - -inline ss::logger test_log("test"); // NOLINT // rougly equivalent to the test harness: // https://github.com/apache/kafka/blob/8e16158/core/src/test/scala/unit/kafka/server/AbstractCreateTopicsRequestTest.scala -class create_topic_fixture - : public s3_imposter_fixture - , public enable_cloud_storage_fixture - , public redpanda_thread_fixture { +class create_topic_fixture : public redpanda_thread_fixture { public: kafka::create_topics_request make_req( std::vector topics, bool validate_only = false) { @@ -96,18 +90,12 @@ class create_topic_fixture void test_create_topic( kafka::create_topics_request req, - std::optional partition_count = std::nullopt, - std::optional revision_id = std::nullopt) { + kafka::api_version version = kafka::api_version(2)) { auto client = make_kafka_client().get0(); client.connect().get(); - auto resp = client.dispatch(req, kafka::api_version(2)).get0(); - - // todo: here - for (auto req : get_requests()) { - vlog(test_log.info, "{} {}", req._method, req._url); - } + auto resp = client.dispatch(req, version).get0(); - BOOST_TEST( + BOOST_REQUIRE_MESSAGE( std::all_of( std::cbegin(resp.data.topics), std::cend(resp.data.topics), @@ -117,7 +105,16 @@ class create_topic_fixture fmt::format("expected no errors. 
received response: {}", resp)); for (auto& topic : req.data.topics) { - verify_metadata(client, req, topic, partition_count, revision_id); + verify_metadata(client, req, topic); + + auto it = std::find_if( + resp.data.topics.begin(), + resp.data.topics.end(), + [name = topic.name](const auto& t) { return t.name == name; }); + + BOOST_CHECK(it != resp.data.topics.end()); + verify_response(topic, *it, version, req.data.validate_only); + // TODO: one we combine the cluster fixture with the redpanda // fixture and enable multiple RP instances to run at the same time // in the test, then we should create two clients in this test where @@ -129,9 +126,42 @@ class create_topic_fixture client.stop().then([&client] { client.shutdown(); }).get(); } - void test_create_read_replica_topic( - kafka::create_topics_request req, int partition_count, int revision_id) { - test_create_topic(req, partition_count, revision_id); + void verify_response( + const kafka::creatable_topic& req, + const kafka::creatable_topic_result& topic_res, + kafka::api_version version, + bool validate_only) { + if (version < kafka::api_version(5)) { + /// currently this method only verifies configurations in v5 + /// responses + return; + } + if (validate_only) { + /// Server should return default configs + BOOST_TEST(topic_res.configs, "empty config response"); + auto cfg_map = config_map(*topic_res.configs); + const auto default_topic_properties = kafka::from_cluster_type( + app.metadata_cache.local().get_default_properties()); + BOOST_TEST( + cfg_map == default_topic_properties, + "incorrect default properties"); + BOOST_CHECK_EQUAL( + topic_res.topic_config_error_code, kafka::error_code::none); + return; + } + if (req.configs.empty()) { + /// no custom configs were passed + return; + } + BOOST_TEST(topic_res.configs, "Expecting configs"); + auto resp_cfgs = kafka::config_map(*topic_res.configs); + auto cfg = app.metadata_cache.local().get_topic_cfg( + model::topic_namespace_view{model::kafka_namespace, 
topic_res.name}); + BOOST_TEST(cfg, "missing topic config"); + auto config_map = kafka::from_cluster_type(cfg->properties); + BOOST_TEST(config_map == resp_cfgs, "configs didn't match"); + BOOST_CHECK_EQUAL( + topic_res.topic_config_error_code, kafka::error_code::none); } void test_create_non_replicable_topic( @@ -174,9 +204,7 @@ class create_topic_fixture void verify_metadata( kafka::client::transport& client, kafka::create_topics_request& create_req, - kafka::creatable_topic& request_topic, - std::optional partition_count = std::nullopt, - std::optional revision_id = std::nullopt) { + kafka::creatable_topic& request_topic) { // query the server for this topic's metadata kafka::metadata_request metadata_req; metadata_req.data.topics @@ -199,9 +227,7 @@ class create_topic_fixture "expected topic not returned from metadata query"); int partitions; - if (partition_count) { - partitions = partition_count.value(); - } else if (!request_topic.assignments.empty()) { + if (!request_topic.assignments.empty()) { partitions = request_topic.assignments.size(); } else { partitions = request_topic.num_partitions; @@ -326,90 +352,13 @@ FIXTURE_TEST(create_non_replicable_topics, create_topic_fixture) { BOOST_CHECK(resp[1].tp_ns.tp() == "topic2"); } -FIXTURE_TEST(read_replica, create_topic_fixture) { - ss::sstring manifest_url = ssx::sformat( - "/f0000000/meta/kafka/test-topic/topic_manifest.json"); - - std::string_view manifest_payload = R"json({ - "version": 1, - "namespace": "kafka", - "topic": "test-topic", - "partition_count": 32, - "replication_factor": 3, - "revision_id": 10, - "compression": null, - "cleanup_policy_bitflags": null, - "compaction_strategy": null, - "timestamp_type": null, - "segment_size": null - })json"; - - set_expectations_and_listen({expectation{ - .url = manifest_url, .body = ss::sstring(manifest_payload)}}); - - auto topic = make_topic( - "test-topic", - std::nullopt, - std::nullopt, - std::map{ - {"redpanda.remote.readreplica", "true"}, - 
{"redpanda.remote.readreplica.bucket", "panda-bucket"}}); - - test_create_read_replica_topic(make_req({topic}), 32, 10); -} - -FIXTURE_TEST(s3bucket_is_missing, create_topic_fixture) { - auto topic = make_topic( - "topic1", - std::nullopt, - std::nullopt, - std::map{ - {"redpanda.remote.readreplica", "true"}}); - - auto req = make_req({topic}); - - auto client = make_kafka_client().get0(); - client.connect().get(); - auto resp = client.dispatch(req, kafka::api_version(2)).get0(); - - BOOST_CHECK( - resp.data.topics[0].error_code == kafka::error_code::invalid_config); - BOOST_CHECK( - resp.data.topics[0].error_message - == "s3 bucket should be provided for read replica topic"); - BOOST_CHECK(resp.data.topics[0].name == "topic1"); -} - -FIXTURE_TEST(s3bucket_but_not_read_replica, create_topic_fixture) { - auto topic = make_topic( - "topic1", - std::nullopt, - std::nullopt, - std::map{ - {"redpanda.remote.readreplica.bucket", "panda-bucket"}}); - - auto req = make_req({topic}); - - auto client = make_kafka_client().get0(); - client.connect().get(); - auto resp = client.dispatch(req, kafka::api_version(2)).get0(); - - BOOST_CHECK( - resp.data.topics[0].error_code == kafka::error_code::invalid_config); - BOOST_CHECK( - resp.data.topics[0].error_message - == "s3 bucket is supported only when redpanda.remote.readreplica is " - "enabled"); - BOOST_CHECK(resp.data.topics[0].name == "topic1"); -} - FIXTURE_TEST(read_replica_and_remote_write, create_topic_fixture) { auto topic = make_topic( "topic1", std::nullopt, std::nullopt, std::map{ - {"redpanda.remote.readreplica", "true"}, + {"redpanda.remote.readreplica", "panda-bucket"}, {"redpanda.remote.write", "true"}}); auto req = make_req({topic}); @@ -425,3 +374,25 @@ FIXTURE_TEST(read_replica_and_remote_write, create_topic_fixture) { == "remote read and write are not supported for read replicas"); BOOST_CHECK(resp.data.topics[0].name == "topic1"); } + +FIXTURE_TEST(test_v5_validate_configs_resp, create_topic_fixture) { + 
wait_for_controller_leadership().get(); + + /// Test conditions in create_topic_fixture::verify_metadata will run + test_create_topic( + make_req({make_topic("topicA"), make_topic("topicB")}, true), + kafka::api_version(5)); + + /// Test create topic with custom configs, verify that they have been set + /// and correctly returned in response + std::map config_map{ + {ss::sstring(kafka::topic_property_retention_bytes), "1234567"}, + {ss::sstring(kafka::topic_property_segment_size), "7654321"}}; + + test_create_topic( + make_req( + {make_topic("topicC", 3, 1, config_map), + make_topic("topicD", 3, 1, config_map)}, + false), + kafka::api_version(5)); +} diff --git a/src/v/kafka/server/tests/fetch_test.cc b/src/v/kafka/server/tests/fetch_test.cc index d69e407969434..af3bbbf539c91 100644 --- a/src/v/kafka/server/tests/fetch_test.cc +++ b/src/v/kafka/server/tests/fetch_test.cc @@ -418,7 +418,7 @@ FIXTURE_TEST(fetch_multi_partitions_debounce, redpanda_thread_fixture) { model::offset(0), 5); auto rdr = model::make_memory_record_batch_reader( std::move(batches)); - return partition->replicate( + return partition->raft()->replicate( std::move(rdr), raft::replicate_options( raft::consistency_level::quorum_ack)); @@ -483,7 +483,7 @@ FIXTURE_TEST(fetch_one_debounce, redpanda_thread_fixture) { model::offset(0), 5); auto rdr = model::make_memory_record_batch_reader( std::move(batches)); - return partition->replicate( + return partition->raft()->replicate( std::move(rdr), raft::replicate_options( raft::consistency_level::quorum_ack)); @@ -563,7 +563,7 @@ FIXTURE_TEST(fetch_multi_topics, redpanda_thread_fixture) { model::offset(0), 5); auto rdr = model::make_memory_record_batch_reader( std::move(batches)); - return partition->replicate( + return partition->raft()->replicate( std::move(rdr), raft::replicate_options( raft::consistency_level::quorum_ack)); @@ -615,7 +615,7 @@ FIXTURE_TEST(fetch_request_max_bytes, redpanda_thread_fixture) { model::offset(0), 20); auto rdr = 
model::make_memory_record_batch_reader( std::move(batches)); - return partition->replicate( + return partition->raft()->replicate( std::move(rdr), raft::replicate_options(raft::consistency_level::quorum_ack)); }) diff --git a/src/v/kafka/server/tests/handler_interface_test.cc b/src/v/kafka/server/tests/handler_interface_test.cc new file mode 100644 index 0000000000000..ab0a011f2805d --- /dev/null +++ b/src/v/kafka/server/tests/handler_interface_test.cc @@ -0,0 +1,49 @@ +/* + * Copyright 2022 Redpanda Data, Inc. + * + * Use of this software is governed by the Business Source License + * included in the file licenses/BSL.md + * + * As of the Change Date specified in that file, in accordance with + * the Business Source License, use of this software will be governed + * by the Apache License, Version 2.0 + */ +#include "kafka/server/handlers/handler_interface.h" +#include "kafka/server/handlers/handlers.h" + +#include + +template +void check_any_vs_static() { + BOOST_TEST_INFO("Testing " << H::api::name); + auto hopt = kafka::handler_for_key(H::api::key); + BOOST_REQUIRE(hopt.has_value()); + auto h = *hopt; + BOOST_CHECK_EQUAL(h->min_supported(), H::min_supported); + BOOST_CHECK_EQUAL(h->max_supported(), H::max_supported); + BOOST_CHECK_EQUAL(h->key(), H::api::key); + BOOST_CHECK_EQUAL(h->name(), H::api::name); +} + +template +void check_all_types(kafka::type_list) { + (check_any_vs_static(), ...); +} + +BOOST_AUTO_TEST_CASE(handler_all_types) { + check_all_types(kafka::request_types{}); +} + +BOOST_AUTO_TEST_CASE(handler_handler_for_key) { + // key too low + BOOST_CHECK(!kafka::handler_for_key(kafka::api_key(-1)).has_value()); + // key too high + const auto max_key = kafka::max_api_key(kafka::request_types{}); + BOOST_CHECK( + !kafka::handler_for_key(kafka::api_key(max_key + 1)).has_value()); + // last key should be present + BOOST_CHECK(kafka::handler_for_key(kafka::api_key(max_key)).has_value()); + // 34 is AlterReplicaLogDirs which we don't currently support, use 
it as a + // test case for handlers which fall in the valid range but we don't support + BOOST_CHECK(!kafka::handler_for_key(kafka::api_key(34)).has_value()); +} diff --git a/src/v/kafka/server/tests/request_parser_test.cc b/src/v/kafka/server/tests/request_parser_test.cc index 27f5b90a32924..afefbcfef7e6a 100644 --- a/src/v/kafka/server/tests/request_parser_test.cc +++ b/src/v/kafka/server/tests/request_parser_test.cc @@ -83,7 +83,7 @@ get_request_context(kafka::protocol& proto, ss::input_stream&& input) { net::server::resources(nullptr, nullptr), std::move(sasl), false, - false); + std::nullopt); return kafka::request_context( conn, diff --git a/src/v/kafka/server/tests/s3_imposter_fixture.cc b/src/v/kafka/server/tests/s3_imposter_fixture.cc deleted file mode 100644 index 5a45621dfa6f8..0000000000000 --- a/src/v/kafka/server/tests/s3_imposter_fixture.cc +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright 2022 Redpanda Data, Inc. - * - * Licensed as a Redpanda Enterprise file under the Redpanda Community - * License (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * https://github.com/redpanda-data/redpanda/blob/master/licenses/rcl.md - */ - -#include "s3_imposter_fixture.h" - -#include "bytes/iobuf.h" -#include "bytes/iobuf_parser.h" -#include "config/configuration.h" -#include "seastarx.h" -#include "test_utils/async.h" - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -using namespace std::chrono_literals; - -inline ss::logger fixt_log("fixture"); // NOLINT - -static constexpr int16_t httpd_port_number = 4430; -static constexpr const char* httpd_host_name = "127.0.0.1"; - -s3_imposter_fixture::s3_imposter_fixture() { - _server = ss::make_shared(); - _server->start().get(); - ss::ipv4_addr ip_addr = {httpd_host_name, httpd_port_number}; - _server_addr = ss::socket_address(ip_addr); -} - -s3_imposter_fixture::~s3_imposter_fixture() { _server->stop().get(); } - -const std::vector& -s3_imposter_fixture::get_requests() const { - return _requests; -} - -const std::multimap& -s3_imposter_fixture::get_targets() const { - return _targets; -} - -void s3_imposter_fixture::set_expectations_and_listen( - const std::vector& expectations) { - _server - ->set_routes([this, &expectations](ss::httpd::routes& r) { - set_routes(r, expectations); - }) - .get(); - _server->listen(_server_addr).get(); -} - -void s3_imposter_fixture::set_routes( - ss::httpd::routes& r, - const std::vector& expectations) { - using namespace ss::httpd; - struct content_handler { - content_handler( - const std::vector& exp, s3_imposter_fixture& imp) - : fixture(imp) { - for (const auto& e : exp) { - expectations[e.url] = e; - } - } - ss::sstring handle(const_req request, reply& repl) { - static const ss::sstring error_payload - = R"xml( - - NoSuchKey - Object not found - resource - requestid - )xml"; - fixture._requests.push_back(request); - fixture._targets.insert(std::make_pair(request._url, request)); - vlog( - fixt_log.trace, - "S3 imposter request {} - {} - 
{}", - request._url, - request.content_length, - request._method); - if (request._method == "GET") { - auto it = expectations.find(request._url); - if (it == expectations.end() || !it->second.body.has_value()) { - vlog(fixt_log.trace, "Reply GET request with error"); - repl.set_status(reply::status_type::not_found); - return error_payload; - } - return *it->second.body; - } else if (request._method == "PUT") { - expectations[request._url] = { - .url = request._url, .body = request.content}; - return ""; - } else if (request._method == "DELETE") { - auto it = expectations.find(request._url); - if (it == expectations.end() || !it->second.body.has_value()) { - vlog(fixt_log.trace, "Reply DELETE request with error"); - repl.set_status(reply::status_type::not_found); - return error_payload; - } - repl.set_status(reply::status_type::no_content); - it->second.body = std::nullopt; - return ""; - } else if (request._method == "HEAD") { - auto it = expectations.find(request._url); - if (it == expectations.end() || !it->second.body.has_value()) { - vlog(fixt_log.trace, "Reply HEAD request with error"); - repl.add_header("x-amz-request-id", "placeholder-id"); - repl.set_status(reply::status_type::not_found); - } else { - repl.add_header("ETag", "placeholder-etag"); - repl.add_header( - "Content-Length", - ssx::sformat("{}", it->second.body->size())); - repl.set_status(reply::status_type::ok); - } - vlog( - fixt_log.trace, - "S3 imposter response: {}", - repl.response_line()); - return ""; - } - BOOST_FAIL("Unexpected request"); - return ""; - } - std::map expectations; - s3_imposter_fixture& fixture; - }; - auto hd = ss::make_shared(expectations, *this); - _handler = std::make_unique( - [hd](const_req req, reply& repl) { return hd->handle(req, repl); }, - "txt"); - r.add_default_handler(_handler.get()); -} - -enable_cloud_storage_fixture::enable_cloud_storage_fixture() { - ss::smp::invoke_on_all([]() { - auto& cfg = config::shard_local_cfg(); - 
cfg.cloud_storage_enabled.set_value(true); - cfg.cloud_storage_disable_tls.set_value(true); - cfg.cloud_storage_api_endpoint.set_value( - std::optional{httpd_host_name}); - cfg.cloud_storage_api_endpoint_port.set_value(httpd_port_number); - cfg.cloud_storage_access_key.set_value( - std::optional{"access-key"}); - cfg.cloud_storage_secret_key.set_value( - std::optional{"secret-key"}); - cfg.cloud_storage_region.set_value( - std::optional{"us-east-1"}); - cfg.cloud_storage_bucket.set_value( - std::optional{"test-bucket"}); - }).get0(); -} - -enable_cloud_storage_fixture::~enable_cloud_storage_fixture() { - config::shard_local_cfg().cloud_storage_enabled.set_value(false); -} diff --git a/src/v/kafka/server/tests/s3_imposter_fixture.h b/src/v/kafka/server/tests/s3_imposter_fixture.h deleted file mode 100644 index 73f90fd61ba71..0000000000000 --- a/src/v/kafka/server/tests/s3_imposter_fixture.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright 2022 Redpanda Data, Inc. - * - * Licensed as a Redpanda Enterprise file under the Redpanda Community - * License (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * https://github.com/redpanda-data/redpanda/blob/master/licenses/rcl.md - */ - -#pragma once - -#include "seastarx.h" -#include "ssx/sformat.h" - -#include -#include -#include -#include - -#include -#include -#include -#include - -// TODO(https://github.com/redpanda-data/redpanda/issues/5240): -// Move s3_imposter_fixture to the common place and use one implementation -// throughout the code base - -/// Emulates S3 REST API for testing purposes. -/// The imposter is a simple KV-store that contains a set of expectations. -/// Expectations are accessible by url via GET, PUT, and DELETE http calls. -/// Expectations are provided before impster starts to listen. They have -/// two field - url and optional body. 
If body is set to nullopt, attemtp -/// to read it using GET or delete it using DELETE requests will trigger an -/// http response with error code 404 and xml formatted error message. -/// If the body of the expectation is set by the user or PUT request it can -/// be retrieved using the GET request or deleted using the DELETE request. -class s3_imposter_fixture { -public: - s3_imposter_fixture(); - ~s3_imposter_fixture(); - - s3_imposter_fixture(const s3_imposter_fixture&) = delete; - s3_imposter_fixture& operator=(const s3_imposter_fixture&) = delete; - s3_imposter_fixture(s3_imposter_fixture&&) = delete; - s3_imposter_fixture& operator=(s3_imposter_fixture&&) = delete; - - struct expectation { - ss::sstring url; - std::optional body; - }; - - /// Set expectaitions on REST API calls that supposed to be made - /// Only the requests that described in this call will be possible - /// to make. This method can only be called once per test run. - /// - /// \param expectations is a collection of access points that allow GET, - /// PUT, and DELETE requests, each expectation has url and body. The body - /// will be returned by GET call if it's set or trigger error if its null. - /// The expectations are statefull. If the body of the expectation was set - /// to null but there was PUT call that sent some data, subsequent GET call - /// will retrieve this data. 
- void - set_expectations_and_listen(const std::vector& expectations); - - /// Access all http requests ordered by time - const std::vector& get_requests() const; - - /// Access all http requests ordered by target url - const std::multimap& get_targets() const; - - // static s3::configuration get_configuration(); - -private: - void set_routes( - ss::httpd::routes& r, const std::vector& expectations); - - ss::socket_address _server_addr; - ss::shared_ptr _server; - - std::unique_ptr _handler; - /// Contains saved requests - std::vector _requests; - /// Contains all accessed target urls - std::multimap _targets; -}; - -class enable_cloud_storage_fixture { -public: - enable_cloud_storage_fixture(); - ~enable_cloud_storage_fixture(); -}; diff --git a/src/v/kafka/server/tests/topic_recreate_test.cc b/src/v/kafka/server/tests/topic_recreate_test.cc index fd081431ec407..8628cf183dc46 100644 --- a/src/v/kafka/server/tests/topic_recreate_test.cc +++ b/src/v/kafka/server/tests/topic_recreate_test.cc @@ -266,7 +266,7 @@ FIXTURE_TEST(test_recreated_topic_does_not_lose_data, recreate_test_fixture) { auto rdr = model::make_memory_record_batch_reader( std::move(batches)); auto p = pm.get(ntp); - return p + return p->raft() ->replicate( std::move(rdr), raft::replicate_options( diff --git a/src/v/model/fundamental.h b/src/v/model/fundamental.h index 3ecfdf1cb6887..86e92f865e14a 100644 --- a/src/v/model/fundamental.h +++ b/src/v/model/fundamental.h @@ -29,6 +29,12 @@ #include #include +namespace kafka { + +using offset = named_type; + +} // namespace kafka + namespace model { // Named after Kafka cleanup.policy topic property diff --git a/src/v/net/connection.cc b/src/v/net/connection.cc index 52bee6c65ac1e..8c617f1d5dca7 100644 --- a/src/v/net/connection.cc +++ b/src/v/net/connection.cc @@ -56,16 +56,14 @@ connection::connection( ss::connected_socket f, ss::socket_address a, server_probe& p, - std::optional in_max_buffer_size, - std::optional tls_pm) + std::optional 
in_max_buffer_size) : addr(a) , _hook(hook) , _name(std::move(name)) , _fd(std::move(f)) , _in(_fd.input()) , _out(_fd.output()) - , _probe(p) - , _tls_pm(std::move(tls_pm)) { + , _probe(p) { if (in_max_buffer_size.has_value()) { auto in_config = ss::connected_socket_input_stream_config{}; in_config.max_buffer_size = in_max_buffer_size.value(); diff --git a/src/v/net/connection.h b/src/v/net/connection.h index 9fb6fae352c58..d5fbad17a7033 100644 --- a/src/v/net/connection.h +++ b/src/v/net/connection.h @@ -14,7 +14,6 @@ #include "net/batched_output_stream.h" #include "net/server_probe.h" #include "seastarx.h" -#include "security/mtls.h" #include #include @@ -39,8 +38,7 @@ class connection : public boost::intrusive::list_base_hook<> { ss::connected_socket f, ss::socket_address a, server_probe& p, - std::optional in_max_buffer_size, - std::optional tls_pm); + std::optional in_max_buffer_size); ~connection() noexcept; connection(const connection&) = delete; connection& operator=(const connection&) = delete; @@ -64,11 +62,6 @@ class connection : public boost::intrusive::list_base_hook<> { return ss::tls::get_dn_information(_fd); } - const std::optional& - get_principal_mapping() const { - return _tls_pm; - } - private: boost::intrusive::list& _hook; ss::sstring _name; @@ -76,7 +69,6 @@ class connection : public boost::intrusive::list_base_hook<> { ss::input_stream _in; net::batched_output_stream _out; server_probe& _probe; - std::optional _tls_pm; }; } // namespace net diff --git a/src/v/net/probes.cc b/src/v/net/probes.cc index 69a226f9a2eac..79d905015deba 100644 --- a/src/v/net/probes.cc +++ b/src/v/net/probes.cc @@ -11,6 +11,7 @@ #include "net/client_probe.h" #include "net/server_probe.h" #include "prometheus/prometheus_sanitize.h" +#include "ssx/metrics.h" #include "ssx/sformat.h" #include @@ -20,13 +21,13 @@ namespace net { void server_probe::setup_metrics( - ss::metrics::metric_groups& mgs, const char* proto) { + ss::metrics::metric_groups& mgs, std::string_view 
proto) { namespace sm = ss::metrics; auto aggregate_labels = config::shard_local_cfg().aggregate_metrics() ? std::vector{sm::shard_label} : std::vector{}; mgs.add_group( - prometheus_sanitize::metrics_name(proto), + prometheus_sanitize::metrics_name(ss::sstring{proto}), { sm::make_gauge( "active_connections", @@ -110,6 +111,26 @@ void server_probe::setup_metrics( }); } +void server_probe::setup_public_metrics( + ss::metrics::metric_groups& mgs, std::string_view proto) { + namespace sm = ss::metrics; + + if (proto.ends_with("_rpc")) { + proto.remove_suffix(4); + } + + auto server_label = ssx::metrics::make_namespaced_label("server"); + + mgs.add_group( + "rpc", + {sm::make_counter( + "request_errors_total", + [this] { return _service_errors; }, + sm::description("Number of rpc errors"), + {server_label(proto)}) + .aggregate({sm::shard_label})}); +} + std::ostream& operator<<(std::ostream& o, const server_probe& p) { o << "{" << "connects: " << p._connects << ", " diff --git a/src/v/net/server.cc b/src/v/net/server.cc index 1f28c2bfd7ae1..ca3f675c058cb 100644 --- a/src/v/net/server.cc +++ b/src/v/net/server.cc @@ -16,6 +16,7 @@ #include "rpc/service.h" #include "seastar/core/coroutine.hh" #include "ssx/future-util.h" +#include "ssx/metrics.h" #include "ssx/sformat.h" #include "vassert.h" #include "vlog.h" @@ -31,7 +32,8 @@ namespace net { server::server(server_configuration c) : cfg(std::move(c)) - , _memory(cfg.max_service_memory_per_core) {} + , _memory(cfg.max_service_memory_per_core) + , _public_metrics(ssx::metrics::public_metrics_handle) {} server::server(ss::sharded* s) : server(s->local()) {} @@ -45,6 +47,11 @@ void server::start() { _probe.setup_metrics(_metrics, cfg.name.c_str()); } + if (!cfg.disable_public_metrics) { + setup_public_metrics(); + _probe.setup_public_metrics(_public_metrics, cfg.name.c_str()); + } + if (cfg.connection_rate_bindings) { connection_rate_bindings.emplace(cfg.connection_rate_bindings.value()); @@ -208,23 +215,13 @@ ss::future<> 
server::accept(listener& s) { } } - std::optional tls_pm; - auto se_it = std::find_if( - cfg.addrs.begin(), cfg.addrs.end(), [&name](const auto& a) { - return a.name == name; - }); - if (se_it != cfg.addrs.end()) { - tls_pm = se_it->principal_mapper; - } - auto conn = ss::make_lw_shared( _connections, name, std::move(ar.connection), ar.remote_address, _probe, - cfg.stream_recv_buf, - tls_pm); + cfg.stream_recv_buf); vlog( rpc::rpclog.trace, "{} - Incoming connection from {} on \"{}\"", @@ -318,6 +315,30 @@ void server::setup_metrics() { sm::description(ssx::sformat("{}: Latency ", cfg.name)))}); } +void server::setup_public_metrics() { + namespace sm = ss::metrics; + if (!_proto) { + return; + } + + std::string_view server_name(cfg.name); + + if (server_name.ends_with("_rpc")) { + server_name.remove_suffix(4); + } + + auto server_label = ssx::metrics::make_namespaced_label("server"); + + _public_metrics.add_group( + prometheus_sanitize::metrics_name("rpc:request"), + {sm::make_histogram( + "latency_seconds", + sm::description("RPC latency"), + {server_label(server_name)}, + [this] { return ssx::metrics::report_default_histogram(_hist); }) + .aggregate({sm::shard_label})}); +} + std::ostream& operator<<(std::ostream& o, const server_configuration& c) { o << "{"; for (auto& a : c.addrs) { diff --git a/src/v/net/server.h b/src/v/net/server.h index 6c966d057c713..487b870c92458 100644 --- a/src/v/net/server.h +++ b/src/v/net/server.h @@ -16,7 +16,6 @@ #include "net/connection.h" #include "net/connection_rate.h" #include "net/types.h" -#include "security/mtls.h" #include "utils/hdr_hist.h" #include @@ -43,7 +42,6 @@ struct server_endpoint { ss::sstring name; ss::socket_address addr; ss::shared_ptr credentials; - std::optional principal_mapper; server_endpoint(ss::sstring name, ss::socket_address addr) : name(std::move(name)) @@ -57,28 +55,11 @@ struct server_endpoint { , addr(addr) , credentials(std::move(creds)) {} - server_endpoint( - ss::sstring name, - 
ss::socket_address addr, - ss::shared_ptr creds, - std::optional principal_mapper) - : name(std::move(name)) - , addr(addr) - , credentials(std::move(creds)) - , principal_mapper(std::move(principal_mapper)) {} - server_endpoint( ss::socket_address addr, ss::shared_ptr creds) : server_endpoint("", addr, std::move(creds)) {} - server_endpoint( - ss::socket_address addr, - ss::shared_ptr creds, - security::tls::principal_mapper principal_mapper) - : server_endpoint( - "", addr, std::move(creds), std::move(principal_mapper)) {} - explicit server_endpoint(ss::socket_address addr) : server_endpoint("", addr) {} @@ -98,6 +79,8 @@ struct server_configuration { std::optional tcp_send_buf; std::optional stream_recv_buf; net::metrics_disabled disable_metrics = net::metrics_disabled::no; + net::public_metrics_disabled disable_public_metrics + = net::public_metrics_disabled::no; ss::sstring name; std::optional connection_rate_bindings; // we use the same default as seastar for load balancing algorithm @@ -193,6 +176,7 @@ class server { friend resources; ss::future<> accept(listener&); void setup_metrics(); + void setup_public_metrics(); std::unique_ptr _proto; ss::semaphore _memory; @@ -203,6 +187,7 @@ class server { hdr_hist _hist; server_probe _probe; ss::metrics::metric_groups _metrics; + ss::metrics::metric_groups _public_metrics; std::optional connection_rate_bindings; std::optional> _connection_rates; diff --git a/src/v/net/server_probe.h b/src/v/net/server_probe.h index 4eabbddb3f9e4..8b45b55813cc1 100644 --- a/src/v/net/server_probe.h +++ b/src/v/net/server_probe.h @@ -52,7 +52,10 @@ class server_probe { void waiting_for_conection_rate() { ++_connections_wait_rate; } - void setup_metrics(ss::metrics::metric_groups& mgs, const char* name); + void setup_metrics(ss::metrics::metric_groups& mgs, std::string_view proto); + + void setup_public_metrics( + ss::metrics::metric_groups& mgs, std::string_view proto); private: uint64_t _requests_completed = 0; diff --git 
a/src/v/pandaproxy/probe.cc b/src/v/pandaproxy/probe.cc index f1c1f976b1430..7b9fdd5c74ee5 100644 --- a/src/v/pandaproxy/probe.cc +++ b/src/v/pandaproxy/probe.cc @@ -21,47 +21,96 @@ namespace pandaproxy { probe::probe( ss::httpd::path_description& path_desc, const ss::sstring& group_name) - : _request_hist() + : _request_metrics() + , _path(path_desc) + , _group_name(group_name) , _metrics() , _public_metrics(ssx::metrics::public_metrics_handle) { + setup_metrics(); + setup_public_metrics(); +} + +void probe::setup_metrics() { namespace sm = ss::metrics; + if (config::shard_local_cfg().disable_metrics()) { + return; + } + auto operation_label = sm::label("operation"); std::vector labels{ - operation_label(path_desc.operations.nickname)}; + operation_label(_path.operations.nickname)}; auto aggregate_labels = std::vector{ sm::shard_label, operation_label}; - if (!config::shard_local_cfg().disable_metrics()) { - auto internal_aggregate_labels - = config::shard_local_cfg().aggregate_metrics() - ? aggregate_labels - : std::vector{}; - - _metrics.add_group( - "pandaproxy", - {sm::make_histogram( - "request_latency", - sm::description("Request latency"), - labels, - [this] { return _request_hist.seastar_histogram_logform(); }) - .aggregate(internal_aggregate_labels)}); - } + auto internal_aggregate_labels + = config::shard_local_cfg().aggregate_metrics() + ? 
aggregate_labels + : std::vector{}; - if (!config::shard_local_cfg().disable_public_metrics()) { - _public_metrics.add_group( - group_name, - {sm::make_histogram( - "request_latency_seconds", - sm::description( - ssx::sformat("Internal latency of request for {}", group_name)), - labels, - [this] { - return ssx::metrics::report_default_histogram(_request_hist); - }) - .aggregate(aggregate_labels)}); + _metrics.add_group( + "pandaproxy", + {sm::make_histogram( + "request_latency", + sm::description("Request latency"), + labels, + [this] { return _request_metrics.hist().seastar_histogram_logform(); }) + .aggregate(internal_aggregate_labels)}); +} + +void probe::setup_public_metrics() { + namespace sm = ss::metrics; + + if (config::shard_local_cfg().disable_public_metrics()) { + return; } + + auto operation_label = ssx::metrics::make_namespaced_label("operation"); + auto status_label = ssx::metrics::make_namespaced_label("status"); + + std::vector labels{ + operation_label(_path.operations.nickname)}; + + auto aggregate_labels = std::vector{ + sm::shard_label, operation_label}; + + _public_metrics.add_group( + _group_name, + {sm::make_histogram( + "request_latency_seconds", + sm::description( + ssx::sformat("Internal latency of request for {}", _group_name)), + labels, + [this] { + return ssx::metrics::report_default_histogram( + _request_metrics.hist()); + }) + .aggregate(aggregate_labels), + + sm::make_counter( + "request_errors_total", + [this] { return _request_metrics._5xx_count; }, + sm::description( + ssx::sformat("Total number of {} server errors", _group_name)), + {operation_label(_path.operations.nickname), status_label("5xx")}) + .aggregate(aggregate_labels), + + sm::make_counter( + "request_errors_total", + [this] { return _request_metrics._4xx_count; }, + sm::description( + ssx::sformat("Total number of {} client errors", _group_name)), + {operation_label(_path.operations.nickname), status_label("4xx")}) + .aggregate(aggregate_labels), + + 
sm::make_counter( + "request_errors_total", + [this] { return _request_metrics._3xx_count; }, + sm::description( + ssx::sformat("Total number of {} redirection errors", _group_name)), + {operation_label(_path.operations.nickname), status_label("3xx")}) + .aggregate(aggregate_labels)}); } } // namespace pandaproxy diff --git a/src/v/pandaproxy/probe.h b/src/v/pandaproxy/probe.h index 22c2436bc1065..f5f3455dab24e 100644 --- a/src/v/pandaproxy/probe.h +++ b/src/v/pandaproxy/probe.h @@ -15,17 +15,62 @@ #include #include +#include namespace pandaproxy { +/// If the request is good, measure latency, otherwise record the error. +class http_status_metric { +public: + class measurement { + public: + measurement( + http_status_metric* p, std::unique_ptr m) + : _p(p) + , _m(std::move(m)) {} + + void set_status(ss::httpd::reply::status_type s) { + using status_type = ss::httpd::reply::status_type; + if (s < status_type{300}) { + return; + } + if (s < status_type{400}) { + ++_p->_3xx_count; + } else if (s < status_type{500}) { + ++_p->_4xx_count; + } else { + ++_p->_5xx_count; + } + _m->set_trace(false); + } + + private: + http_status_metric* _p; + std::unique_ptr _m; + }; + hdr_hist& hist() { return _hist; } + auto auto_measure() { return measurement{this, _hist.auto_measure()}; } + + hdr_hist _hist; + int64_t _5xx_count; + int64_t _4xx_count; + int64_t _3xx_count; +}; + class probe { public: probe( ss::httpd::path_description& path_desc, const ss::sstring& group_name); - hdr_hist& hist() { return _request_hist; } + auto auto_measure() { return _request_metrics.auto_measure(); } + +private: + void setup_metrics(); + void setup_public_metrics(); private: - hdr_hist _request_hist; + http_status_metric _request_metrics; + const ss::httpd::path_description& _path; + const ss::sstring& _group_name; ss::metrics::metric_groups _metrics; ss::metrics::metric_groups _public_metrics; }; diff --git a/src/v/pandaproxy/reply.h b/src/v/pandaproxy/reply.h index 441245a75c161..5d6481c57dd20 
100644 --- a/src/v/pandaproxy/reply.h +++ b/src/v/pandaproxy/reply.h @@ -104,12 +104,13 @@ inline std::unique_ptr exception_reply(std::exception_ptr e) { } catch (const schema_registry::exception_base& e) { return errored_body(e.code(), e.message()); } catch (const seastar::httpd::base_exception& e) { - return errored_body( - reply_error_code::kafka_bad_request, - e.what()); // TODO BP: Yarr!! + return errored_body(reply_error_code::kafka_bad_request, e.what()); } catch (...) { - vlog(plog.error, "{}", std::current_exception()); - throw; + vlog(plog.error, "exception_reply: {}", std::current_exception()); + auto ise = make_error_condition( + reply_error_code::internal_server_error); + return errored_body( + reply_error_code::internal_server_error, ise.message()); } } diff --git a/src/v/pandaproxy/rest/proxy.cc b/src/v/pandaproxy/rest/proxy.cc index acdd0cb946b96..2a3b9a7fe55e1 100644 --- a/src/v/pandaproxy/rest/proxy.cc +++ b/src/v/pandaproxy/rest/proxy.cc @@ -74,15 +74,15 @@ proxy::proxy( ss::api_registry_builder20(_config.api_doc_dir(), "/v1"), "header", "/definitions", - _ctx) {} + _ctx, + json::serialization_format::application_json) {} ss::future<> proxy::start() { _server.routes(get_proxy_routes()); return _server.start( _config.pandaproxy_api(), _config.pandaproxy_api_tls(), - _config.advertised_pandaproxy_api(), - json::serialization_format::application_json); + _config.advertised_pandaproxy_api()); } ss::future<> proxy::stop() { return _server.stop(); } diff --git a/src/v/pandaproxy/schema_registry/service.cc b/src/v/pandaproxy/schema_registry/service.cc index e7d8e9b4c74a8..35571893bb6f2 100644 --- a/src/v/pandaproxy/schema_registry/service.cc +++ b/src/v/pandaproxy/schema_registry/service.cc @@ -229,7 +229,8 @@ service::service( ss::api_registry_builder20(_config.api_doc_dir(), "/v1"), "schema_registry_header", "/schema_registry_definitions", - _ctx) + _ctx, + json::serialization_format::schema_registry_v1_json) , _store(store) , _writer(sequencer) , 
_ensure_started{[this]() { return do_start(); }} {} @@ -240,8 +241,7 @@ ss::future<> service::start() { return _server.start( _config.schema_registry_api(), _config.schema_registry_api_tls(), - not_advertised, - json::serialization_format::schema_registry_v1_json); + not_advertised); } ss::future<> service::stop() { diff --git a/src/v/pandaproxy/server.cc b/src/v/pandaproxy/server.cc index 0dd564a67ccf1..617dc820db082 100644 --- a/src/v/pandaproxy/server.cc +++ b/src/v/pandaproxy/server.cc @@ -70,49 +70,46 @@ struct handler_adaptor : ss::httpd::handler_base { server::context_t& ctx, server::function_handler&& handler, ss::httpd::path_description& path_desc, - const ss::sstring& metrics_group_name) + const ss::sstring& metrics_group_name, + json::serialization_format exceptional_mime_type) : _pending_requests(pending_requests) , _ctx(ctx) , _handler(std::move(handler)) - , _probe(path_desc, metrics_group_name) {} + , _probe(path_desc, metrics_group_name) + , _exceptional_mime_type(exceptional_mime_type) {} ss::future> handle( const ss::sstring&, std::unique_ptr req, std::unique_ptr rep) final { - return ss::try_with_gate( - _pending_requests, - [this, - req{std::move(req)}, - rep{std::move(rep)}, - m = _probe.hist().auto_measure()]() mutable { - server::request_t rq{std::move(req), this->_ctx}; - server::reply_t rp{std::move(rep)}; - auto req_size = get_request_size(*rq.req); - - return ss::with_semaphore( - _ctx.mem_sem, - req_size, - [this, rq{std::move(rq)}, rp{std::move(rp)}]() mutable { - if (_ctx.as.abort_requested()) { - set_reply_unavailable(*rp.rep); - return ss::make_ready_future< - std::unique_ptr>(std::move(rp.rep)); - } - return _handler(std::move(rq), std::move(rp)) - .then([](server::reply_t rp) { - set_mime_type(*rp.rep, rp.mime_type); - return std::move(rp.rep); - }); - }) - .finally([m{std::move(m)}]() {}); - }); + auto measure = _probe.auto_measure(); + auto guard = gate_guard(_pending_requests); + server::request_t rq{std::move(req), this->_ctx}; 
+ server::reply_t rp{std::move(rep)}; + auto req_size = get_request_size(*rq.req); + auto sem_units = co_await ss::get_units(_ctx.mem_sem, req_size); + if (_ctx.as.abort_requested()) { + set_reply_unavailable(*rp.rep); + rp.mime_type = _exceptional_mime_type; + } else { + try { + rp = co_await _handler(std::move(rq), std::move(rp)); + } catch (...) { + rp = server::reply_t{ + exception_reply(std::current_exception()), + _exceptional_mime_type}; + } + } + set_mime_type(*rp.rep, rp.mime_type); + measure.set_status(rp.rep->_status); + co_return std::move(rp.rep); } ss::gate& _pending_requests; server::context_t& _ctx; server::function_handler _handler; probe _probe; + json::serialization_format _exceptional_mime_type; }; server::server( @@ -121,13 +118,15 @@ server::server( ss::api_registry_builder20&& api20, const ss::sstring& header, const ss::sstring& definitions, - context_t& ctx) + context_t& ctx, + json::serialization_format exceptional_mime_type) : _server(server_name) , _public_metrics_group_name(public_metrics_group_name) , _pending_reqs() , _api20(std::move(api20)) , _has_routes(false) - , _ctx(ctx) { + , _ctx(ctx) + , _exceptional_mime_type(exceptional_mime_type) { _api20.set_api_doc(_server._routes); _api20.register_api_file(_server._routes, header); _api20.add_definitions_file(_server._routes, definitions); @@ -144,7 +143,8 @@ void server::route(server::route_t r) { _ctx, std::move(r.handler), r.path_desc, - _public_metrics_group_name); + _public_metrics_group_name, + _exceptional_mime_type); r.path_desc.set(_server._routes, handler); } @@ -167,10 +167,9 @@ void server::routes(server::routes_t&& rts) { ss::future<> server::start( const std::vector& endpoints, const std::vector& endpoints_tls, - const std::vector& advertised, - json::serialization_format exceptional_mime_type) { + const std::vector& advertised) { _server._routes.register_exeption_handler( - exception_replier{ss::sstring{name(exceptional_mime_type)}}); + 
exception_replier{ss::sstring{name(_exceptional_mime_type)}}); _ctx.advertised_listeners.reserve(endpoints.size()); for (auto& server_endpoint : endpoints) { auto addr = co_await net::resolve_dns(server_endpoint.address); diff --git a/src/v/pandaproxy/server.h b/src/v/pandaproxy/server.h index 3790c081a78a1..92640a0728484 100644 --- a/src/v/pandaproxy/server.h +++ b/src/v/pandaproxy/server.h @@ -84,7 +84,8 @@ class server { ss::api_registry_builder20&& api20, const ss::sstring& header, const ss::sstring& definitions, - context_t& ctx); + context_t& ctx, + json::serialization_format exceptional_mime_type); void route(route_t route); void routes(routes_t&& routes); @@ -92,8 +93,7 @@ class server { ss::future<> start( const std::vector& endpoints, const std::vector& endpoints_tls, - const std::vector& advertised, - json::serialization_format exceptional_mime_type); + const std::vector& advertised); ss::future<> stop(); private: @@ -103,6 +103,7 @@ class server { ss::api_registry_builder20 _api20; bool _has_routes; context_t& _ctx; + json::serialization_format _exceptional_mime_type; }; template diff --git a/src/v/raft/consensus.cc b/src/v/raft/consensus.cc index b30bff1c0977b..9a9f7a4312e64 100644 --- a/src/v/raft/consensus.cc +++ b/src/v/raft/consensus.cc @@ -45,6 +45,7 @@ #include #include +#include template<> struct fmt::formatter final @@ -994,6 +995,18 @@ ss::future consensus::replace_configuration( }); } +ss::future consensus::replace_configuration( + std::vector new_brokers, + model::revision_id new_revision) { + return change_configuration( + [new_brokers = std::move(new_brokers), + new_revision](group_configuration current) mutable { + current.replace(std::move(new_brokers), new_revision); + current.set_revision(new_revision); + return result(std::move(current)); + }); +} + template ss::future consensus::interrupt_configuration_change(model::revision_id revision, Func f) { @@ -1020,12 +1033,23 @@ ss::future 
consensus::cancel_configuration_change(model::revision_id revision) { vlog( _ctxlog.info, - "requested revert of current configuration change - {}", + "requested cancellation of current configuration change - {}", config()); return interrupt_configuration_change( - revision, [revision](raft::group_configuration cfg) { - cfg.cancel_configuration_change(revision); - return cfg; + revision, + [revision](raft::group_configuration cfg) { + cfg.cancel_configuration_change(revision); + return cfg; + }) + .then([this](std::error_code ec) -> ss::future { + if (!ec) { + // current leader is not a voter, step down + if (!config().is_voter(_self)) { + auto u = co_await _op_lock.get_units(); + do_step_down("current leader is not voter"); + } + } + co_return ec; }); } @@ -1628,7 +1652,7 @@ consensus::do_append_entries(append_entries_request&& r) { // section 1 // For an entry to fit into our log, it must not leave a gap. if (r.meta.prev_log_index > last_log_offset) { - if (!r.batches.is_end_of_stream()) { + if (!r.batches().is_end_of_stream()) { vlog( _ctxlog.debug, "Rejecting append entries. Would leave gap in log, last log " @@ -1668,7 +1692,7 @@ consensus::do_append_entries(append_entries_request&& r) { // special case heartbeat case // we need to handle it early (before executing truncation) // as timeouts are asynchronous to append calls and can have stall data - if (r.batches.is_end_of_stream()) { + if (r.batches().is_end_of_stream()) { if (r.meta.prev_log_index < last_log_offset) { // do not tuncate on heartbeat just response with false reply.result = append_entries_reply::status::failure; @@ -1761,7 +1785,7 @@ consensus::do_append_entries(append_entries_request&& r) { // success. 
copy entries for each subsystem using offsets_ret = storage::append_result; - return disk_append(std::move(r.batches), update_last_quorum_index::no) + return disk_append(std::move(r.batches()), update_last_quorum_index::no) .then([this, m = r.meta, target = r.node_id](offsets_ret ofs) { auto f = ss::make_ready_future<>(); auto last_visible = std::min(ofs.last_offset, m.last_visible_index); diff --git a/src/v/raft/consensus.h b/src/v/raft/consensus.h index 87f7323d6072c..a852c819bc195 100644 --- a/src/v/raft/consensus.h +++ b/src/v/raft/consensus.h @@ -123,6 +123,11 @@ class consensus { // Replace configuration of raft group with given set of nodes ss::future replace_configuration(std::vector, model::revision_id); + /** + * Replace configuration, uses revision provided with brokers + */ + ss::future + replace_configuration(std::vector, model::revision_id); // Abort ongoing configuration change - may cause data loss ss::future abort_configuration_change(model::revision_id); // Revert current configuration change - this is safe and will never cause diff --git a/src/v/raft/group_configuration.cc b/src/v/raft/group_configuration.cc index 0f8790024b19f..76fa0635dad6e 100644 --- a/src/v/raft/group_configuration.cc +++ b/src/v/raft/group_configuration.cc @@ -313,6 +313,88 @@ void group_configuration::replace( } } +void group_configuration::replace( + std::vector brokers, model::revision_id rev) { + vassert(!_old, "can not replace joint configuration - {}", *this); + _revision = rev; + + /** + * If configurations are identical do nothing. For identical configuration + * we assume that brokers list hasn't changed (1) and current configuration + * contains all brokers in either voters of learners (2). 
+ */ + // check list of brokers (1) + + // check if all brokers are assigned to current configuration (2) + bool brokers_are_equal + = brokers.size() == _brokers.size() + && std::all_of( + brokers.begin(), brokers.end(), [this](const broker_revision& b) { + // we may do linear lookup in _brokers collection as number of + // brokers is usually very small f.e. 3 or 5 + auto it = std::find_if( + _brokers.begin(), + _brokers.end(), + [&b](const model::broker& existing) { + return b.broker == existing; + }); + + return _current.contains(vnode(b.broker.id(), b.rev)) + && it != _brokers.end(); + }); + + // configurations are identical, do nothing + if (brokers_are_equal) { + return; + } + + _old = _current; + _current.learners.clear(); + _current.voters.clear(); + + for (auto& br : brokers) { + // check if broker is already a voter. voter will stay a voter + auto v_it = std::find_if( + _old->voters.cbegin(), _old->voters.cend(), [&br](const vnode& rni) { + return rni.id() == br.broker.id() && rni.revision() == br.rev; + }); + + if (v_it != _old->voters.cend()) { + _current.voters.push_back(*v_it); + continue; + } + + // check if broker was a learner. 
learner will stay a learner + auto l_it = std::find_if( + _old->learners.cbegin(), + _old->learners.cend(), + [&br](const vnode& rni) { + return rni.id() == br.broker.id() && rni.revision() == br.rev; + }); + + if (l_it != _old->learners.cend()) { + _current.learners.push_back(*l_it); + continue; + } + + // new broker, use broker revision + _current.learners.emplace_back(br.broker.id(), br.rev); + } + + // if both current and previous configurations are exactly the same, we do + // not need to enter joint consensus + if ( + _current.voters == _old->voters && _current.learners == _old->learners) { + _old.reset(); + } + + for (auto& b : brokers) { + if (!contains_broker(b.broker.id())) { + _brokers.push_back(std::move(b.broker)); + } + } +} + void group_configuration::promote_to_voter(vnode id) { auto it = std::find( _current.learners.cbegin(), _current.learners.cend(), id); diff --git a/src/v/raft/group_configuration.h b/src/v/raft/group_configuration.h index fda4638260671..f62fdddb21ad2 100644 --- a/src/v/raft/group_configuration.h +++ b/src/v/raft/group_configuration.h @@ -23,6 +23,11 @@ namespace raft { +struct broker_revision { + model::broker broker; + model::revision_id rev; +}; + static constexpr model::revision_id no_revision{}; class vnode : public serde::envelope> { public: @@ -118,6 +123,7 @@ class group_configuration final { void add(std::vector, model::revision_id); void remove(const std::vector&); void replace(std::vector, model::revision_id); + void replace(std::vector, model::revision_id); /** * Updating broker configuration. 
This operation does not require entering diff --git a/src/v/raft/replicate_entries_stm.cc b/src/v/raft/replicate_entries_stm.cc index c63749712b5f1..58c92ab420dea 100644 --- a/src/v/raft/replicate_entries_stm.cc +++ b/src/v/raft/replicate_entries_stm.cc @@ -33,10 +33,10 @@ using namespace std::chrono_literals; ss::future replicate_entries_stm::share_request() { // one extra copy is needed for retries return with_semaphore(_share_sem, 1, [this] { - return details::foreign_share_n(std::move(_req->batches), 2) + return details::foreign_share_n(std::move(_req->batches()), 2) .then([this](std::vector readers) { // keep a copy around until the end - _req->batches = std::move(readers.back()); + _req->batches() = std::move(readers.back()); readers.pop_back(); return append_entries_request( _req->node_id, @@ -188,7 +188,7 @@ replicate_entries_stm::append_to_self() { = _req->flush ? consistency_level::quorum_ack : consistency_level::leader_ack; return _ptr->disk_append( - std::move(req.batches), + std::move(req.batches()), _req->flush ? 
consensus::update_last_quorum_index::yes : consensus::update_last_quorum_index::no); }) diff --git a/src/v/raft/tests/raft_group_fixture.h b/src/v/raft/tests/raft_group_fixture.h index fa10a6f975aa9..2658ff82895b5 100644 --- a/src/v/raft/tests/raft_group_fixture.h +++ b/src/v/raft/tests/raft_group_fixture.h @@ -184,6 +184,7 @@ struct raft_node { scfg.addrs.emplace_back(net::resolve_dns(broker.rpc_address()).get()); scfg.max_service_memory_per_core = 1024 * 1024 * 1024; scfg.disable_metrics = net::metrics_disabled::yes; + scfg.disable_public_metrics = net::public_metrics_disabled::yes; server.start(std::move(scfg)).get0(); raft_manager.start().get0(); raft_manager diff --git a/src/v/raft/tests/type_serialization_tests.cc b/src/v/raft/tests/type_serialization_tests.cc index 83fa6ae49e4de..ec4c945c9e479 100644 --- a/src/v/raft/tests/type_serialization_tests.cc +++ b/src/v/raft/tests/type_serialization_tests.cc @@ -99,7 +99,7 @@ SEASTAR_THREAD_TEST_CASE(append_entries_requests) { auto batches_result = model::consume_reader_to_memory( std::move(readers.back()), model::no_timeout) .get0(); - d.batches + d.batches() .consume(checking_consumer(std::move(batches_result)), model::no_timeout) .get0(); } diff --git a/src/v/raft/types.cc b/src/v/raft/types.cc index 3a2229a7a469d..174daab975eaa 100644 --- a/src/v/raft/types.cc +++ b/src/v/raft/types.cc @@ -19,11 +19,105 @@ #include "vassert.h" #include "vlog.h" +#include + #include #include #include +namespace { +template +T decode_signed(T value) { + return value < T(0) ? 
T{} : value; +} + +template +T varlong_reader(iobuf_parser& in) { + auto [val, len] = in.read_varlong(); + return T(val); +} + +namespace internal { +struct hbeat_soa { + explicit hbeat_soa(size_t n) + : groups(n) + , commit_indices(n) + , terms(n) + , prev_log_indices(n) + , prev_log_terms(n) + , last_visible_indices(n) + , revisions(n) + , target_revisions(n) {} + + ~hbeat_soa() noexcept = default; + hbeat_soa(const hbeat_soa&) = delete; + hbeat_soa& operator=(const hbeat_soa&) = delete; + hbeat_soa(hbeat_soa&&) noexcept = default; + hbeat_soa& operator=(hbeat_soa&&) noexcept = default; + + std::vector groups; + std::vector commit_indices; + std::vector terms; + std::vector prev_log_indices; + std::vector prev_log_terms; + std::vector last_visible_indices; + std::vector revisions; + std::vector target_revisions; +}; + +struct hbeat_response_array { + explicit hbeat_response_array(size_t n) + : groups(n) + , terms(n) + , last_flushed_log_index(n) + , last_dirty_log_index(n) + , last_term_base_offset(n) + , revisions(n) + , target_revisions(n) {} + + std::vector groups; + std::vector terms; + std::vector last_flushed_log_index; + std::vector last_dirty_log_index; + std::vector last_term_base_offset; + std::vector revisions; + std::vector target_revisions; +}; +template +void encode_one_vint(iobuf& out, const T& t) { + auto b = vint::to_bytes(t); + // NOLINTNEXTLINE + out.append(reinterpret_cast(b.data()), b.size()); +} + +template +void encode_varint_delta(iobuf& out, const T& prev, const T& current) { + // TODO: use delta-delta: + // https://github.com/facebookarchive/beringei/blob/92784ec6e2/beringei/lib/BitUtil.cpp + auto delta = current - prev; + encode_one_vint(out, delta); +} + +template +void encode_one_delta_array(iobuf& o, const std::vector& v) { + if (v.empty()) { + return; + } + const size_t max = v.size(); + encode_one_vint(o, v[0]); + for (size_t i = 1; i < max; ++i) { + encode_varint_delta(o, v[i - 1], v[i]); + } +} +template +T 
read_one_varint_delta(iobuf_parser& in, const T& prev) { + auto dst = varlong_reader(in); + return prev + dst; +} +} // namespace internal +} // namespace + namespace raft { replicate_stages::replicate_stages( @@ -157,35 +251,361 @@ std::ostream& operator<<(std::ostream& o, const install_snapshot_reply& r) { return o; } -} // namespace raft +ss::future<> heartbeat_request::serde_async_write(iobuf& dst) { + vassert(!heartbeats.empty(), "cannot serialize empty heartbeats request"); -namespace reflection { + struct sorter_fn { + constexpr bool operator()( + const raft::heartbeat_metadata& lhs, + const raft::heartbeat_metadata& rhs) const { + return lhs.meta.commit_index < rhs.meta.commit_index; + } + }; + + iobuf out; + auto& request = *this; -struct rpc_model_reader_consumer { - explicit rpc_model_reader_consumer(iobuf& oref) - : ref(oref) {} - ss::future operator()(model::record_batch batch) { - reflection::serialize(ref, batch.header()); - if (!batch.compressed()) { - reflection::serialize(ref, 0); - batch.for_each_record([this](model::record r) { - reflection::serialize(ref, std::move(r)); - }); - } else { - reflection::serialize(ref, 1); - reflection::serialize(ref, std::move(batch).release_data()); + std::sort( + request.heartbeats.begin(), request.heartbeats.end(), sorter_fn{}); + + co_await ss::coroutine::maybe_yield(); + + internal::hbeat_soa encodee(request.heartbeats.size()); + // target physical node id is always the same it differs only by + // revision + + const size_t size = request.heartbeats.size(); + for (size_t i = 0; i < size; ++i) { + const auto& m = request.heartbeats[i].meta; + const raft::vnode node = request.heartbeats[i].node_id; + const raft::vnode target_node = request.heartbeats[i].target_node_id; + vassert(m.group() >= 0, "Negative raft group detected. 
{}", m.group); + encodee.groups[i] = m.group; + encodee.commit_indices[i] = std::max(model::offset(-1), m.commit_index); + encodee.terms[i] = std::max(model::term_id(-1), m.term); + encodee.prev_log_indices[i] = std::max( + model::offset(-1), m.prev_log_index); + encodee.prev_log_terms[i] = std::max( + model::term_id(-1), m.prev_log_term); + encodee.last_visible_indices[i] = std::max( + model::offset(-1), m.last_visible_index); + encodee.revisions[i] = std::max( + model::revision_id(-1), node.revision()); + encodee.target_revisions[i] = std::max( + model::revision_id(-1), target_node.revision()); + + co_await ss::coroutine::maybe_yield(); + } + // important to release this memory after this function + // request.meta = {}; // release memory + + using serde::write; + + // physical node ids are the same for all requests + write(out, request.heartbeats.front().node_id.id()); + write(out, request.heartbeats.front().target_node_id.id()); + write(out, static_cast(size)); + + internal::encode_one_delta_array(out, encodee.groups); + internal::encode_one_delta_array( + out, encodee.commit_indices); + internal::encode_one_delta_array(out, encodee.terms); + internal::encode_one_delta_array( + out, encodee.prev_log_indices); + internal::encode_one_delta_array( + out, encodee.prev_log_terms); + internal::encode_one_delta_array( + out, encodee.last_visible_indices); + internal::encode_one_delta_array( + out, encodee.revisions); + internal::encode_one_delta_array( + out, encodee.target_revisions); + + write(dst, std::move(out)); +} + +void heartbeat_request::serde_read( + iobuf_parser& src, const serde::header& hdr) { + using serde::read_nested; + auto tmp = read_nested(src, hdr._bytes_left_limit); + iobuf_parser in(std::move(tmp)); + + auto& req = *this; + auto node_id = read_nested(in, 0U); + auto target_node = read_nested(in, 0U); + req.heartbeats = std::vector( + read_nested(in, 0U)); + if (req.heartbeats.empty()) { + return; + } + const size_t max = req.heartbeats.size(); + 
req.heartbeats[0].meta.group = varlong_reader(in); + for (size_t i = 1; i < max; ++i) { + req.heartbeats[i].meta.group + = internal::read_one_varint_delta( + in, req.heartbeats[i - 1].meta.group); + } + req.heartbeats[0].meta.commit_index = varlong_reader(in); + for (size_t i = 1; i < max; ++i) { + req.heartbeats[i].meta.commit_index + = internal::read_one_varint_delta( + in, req.heartbeats[i - 1].meta.commit_index); + } + req.heartbeats[0].meta.term = varlong_reader(in); + for (size_t i = 1; i < max; ++i) { + req.heartbeats[i].meta.term + = internal::read_one_varint_delta( + in, req.heartbeats[i - 1].meta.term); + } + req.heartbeats[0].meta.prev_log_index = varlong_reader(in); + for (size_t i = 1; i < max; ++i) { + req.heartbeats[i].meta.prev_log_index + = internal::read_one_varint_delta( + in, req.heartbeats[i - 1].meta.prev_log_index); + } + req.heartbeats[0].meta.prev_log_term = varlong_reader(in); + for (size_t i = 1; i < max; ++i) { + req.heartbeats[i].meta.prev_log_term + = internal::read_one_varint_delta( + in, req.heartbeats[i - 1].meta.prev_log_term); + } + req.heartbeats[0].meta.last_visible_index = varlong_reader( + in); + for (size_t i = 1; i < max; ++i) { + req.heartbeats[i].meta.last_visible_index + = internal::read_one_varint_delta( + in, req.heartbeats[i - 1].meta.last_visible_index); + } + + req.heartbeats[0].node_id = raft::vnode( + node_id, varlong_reader(in)); + for (size_t i = 1; i < max; ++i) { + req.heartbeats[i].node_id = raft::vnode( + node_id, + internal::read_one_varint_delta( + in, req.heartbeats[i - 1].node_id.revision())); + } + + req.heartbeats[0].target_node_id = raft::vnode( + target_node, varlong_reader(in)); + for (size_t i = 1; i < max; ++i) { + req.heartbeats[i].target_node_id = raft::vnode( + target_node, + internal::read_one_varint_delta( + in, req.heartbeats[i - 1].target_node_id.revision())); + } + + for (auto& hb : req.heartbeats) { + hb.meta.prev_log_index = decode_signed(hb.meta.prev_log_index); + hb.meta.commit_index = 
decode_signed(hb.meta.commit_index); + hb.meta.prev_log_term = decode_signed(hb.meta.prev_log_term); + hb.meta.last_visible_index = decode_signed(hb.meta.last_visible_index); + hb.node_id = raft::vnode( + hb.node_id.id(), decode_signed(hb.node_id.revision())); + hb.target_node_id = raft::vnode( + hb.target_node_id.id(), decode_signed(hb.target_node_id.revision())); + } +} + +void heartbeat_reply::serde_write(iobuf& dst) { + using serde::write; + + auto& reply = *this; + iobuf out; + + struct sorter_fn { + constexpr bool operator()( + const raft::append_entries_reply& lhs, + const raft::append_entries_reply& rhs) const { + return lhs.last_flushed_log_index < rhs.last_flushed_log_index; } - return ss::make_ready_future( - ss::stop_iteration::no); + }; + + write(out, static_cast(reply.meta.size())); + // no requests + if (reply.meta.empty()) { + return; } - void end_of_stream(){}; - iobuf& ref; -}; + + // replies are comming from the same physical node + write(out, reply.meta.front().node_id.id()); + // replies are addressed to the same physical node + write(out, reply.meta.front().target_node_id.id()); + std::sort(reply.meta.begin(), reply.meta.end(), sorter_fn{}); + internal::hbeat_response_array encodee(reply.meta.size()); + + for (size_t i = 0; i < reply.meta.size(); ++i) { + encodee.groups[i] = reply.meta[i].group; + encodee.terms[i] = std::max(model::term_id(-1), reply.meta[i].term); + + encodee.last_flushed_log_index[i] = std::max( + model::offset(-1), reply.meta[i].last_flushed_log_index); + encodee.last_dirty_log_index[i] = std::max( + model::offset(-1), reply.meta[i].last_dirty_log_index); + encodee.last_term_base_offset[i] = std::max( + model::offset(-1), reply.meta[i].last_term_base_offset); + encodee.revisions[i] = std::max( + model::revision_id(-1), reply.meta[i].node_id.revision()); + encodee.target_revisions[i] = std::max( + model::revision_id(-1), reply.meta[i].target_node_id.revision()); + } + internal::encode_one_delta_array(out, encodee.groups); + 
internal::encode_one_delta_array(out, encodee.terms); + + internal::encode_one_delta_array( + out, encodee.last_flushed_log_index); + internal::encode_one_delta_array( + out, encodee.last_dirty_log_index); + internal::encode_one_delta_array( + out, encodee.last_term_base_offset); + internal::encode_one_delta_array( + out, encodee.revisions); + internal::encode_one_delta_array( + out, encodee.target_revisions); + for (auto& m : reply.meta) { + write(out, m.result); + } + + write(dst, std::move(out)); +} + +void heartbeat_reply::serde_read(iobuf_parser& src, const serde::header& hdr) { + using serde::read_nested; + auto tmp = read_nested(src, hdr._bytes_left_limit); + iobuf_parser in(std::move(tmp)); + + auto& reply = *this; + reply.meta = std::vector( + read_nested(in, 0U)); + + // empty reply + if (reply.meta.empty()) { + return; + } + + auto node_id = read_nested(in, 0U); + auto target_node_id = read_nested(in, 0U); + + size_t size = reply.meta.size(); + reply.meta[0].group = varlong_reader(in); + for (size_t i = 1; i < size; ++i) { + reply.meta[i].group = internal::read_one_varint_delta( + in, reply.meta[i - 1].group); + } + reply.meta[0].term = varlong_reader(in); + for (size_t i = 1; i < size; ++i) { + reply.meta[i].term = internal::read_one_varint_delta( + in, reply.meta[i - 1].term); + } + + reply.meta[0].last_flushed_log_index = varlong_reader(in); + for (size_t i = 1; i < size; ++i) { + reply.meta[i].last_flushed_log_index + = internal::read_one_varint_delta( + in, reply.meta[i - 1].last_flushed_log_index); + } + + reply.meta[0].last_dirty_log_index = varlong_reader(in); + for (size_t i = 1; i < size; ++i) { + reply.meta[i].last_dirty_log_index + = internal::read_one_varint_delta( + in, reply.meta[i - 1].last_dirty_log_index); + } + + reply.meta[0].last_term_base_offset = varlong_reader(in); + for (size_t i = 1; i < size; ++i) { + reply.meta[i].last_term_base_offset + = internal::read_one_varint_delta( + in, reply.meta[i - 1].last_term_base_offset); + } + + 
reply.meta[0].node_id = raft::vnode( + node_id, varlong_reader(in)); + for (size_t i = 1; i < size; ++i) { + reply.meta[i].node_id = raft::vnode( + node_id, + internal::read_one_varint_delta( + in, reply.meta[i - 1].node_id.revision())); + } + + reply.meta[0].target_node_id = raft::vnode( + target_node_id, varlong_reader(in)); + for (size_t i = 1; i < size; ++i) { + reply.meta[i].target_node_id = raft::vnode( + target_node_id, + internal::read_one_varint_delta( + in, reply.meta[i - 1].target_node_id.revision())); + } + + for (size_t i = 0; i < size; ++i) { + reply.meta[i].result = read_nested( + in, 0U); + } + + for (auto& m : reply.meta) { + m.last_flushed_log_index = decode_signed(m.last_flushed_log_index); + m.last_dirty_log_index = decode_signed(m.last_dirty_log_index); + m.last_term_base_offset = decode_signed(m.last_term_base_offset); + m.node_id = raft::vnode( + m.node_id.id(), decode_signed(m.node_id.revision())); + m.target_node_id = raft::vnode( + m.target_node_id.id(), decode_signed(m.target_node_id.revision())); + } +} + +ss::future<> append_entries_request::serde_async_write(iobuf& dst) { + auto mem_batches = co_await model::consume_reader_to_memory( + std::move(batches()), model::no_timeout); + + iobuf out; + using serde::write; + + write(out, static_cast(mem_batches.size())); + for (auto& batch : mem_batches) { + // intentionally using reflection here for batches which are not yet + // supported with serde, but also have largely solidified. 
+ reflection::serialize(out, std::move(batch)); + co_await ss::coroutine::maybe_yield(); + } + + write(out, node_id); + write(out, target_node_id); + write(out, meta); + write(out, flush); + + write(dst, std::move(out)); +} + +ss::future<> append_entries_request::serde_async_read( + iobuf_parser& src, const serde::header& hdr) { + using serde::read_nested; + auto tmp = read_nested(src, hdr._bytes_left_limit); + iobuf_parser in(std::move(tmp)); + + auto batch_count = read_nested(in, 0U); + auto batches = ss::circular_buffer{}; + batches.reserve(batch_count); + for (uint32_t i = 0; i < batch_count; ++i) { + batches.push_back(reflection::adl{}.from(in)); + co_await ss::coroutine::maybe_yield(); + } + + _batches = model::make_memory_record_batch_reader(std::move(batches)); + node_id = read_nested(in, 0U); + target_node_id = read_nested(in, 0U); + meta = read_nested(in, 0U); + flush = read_nested( + in, 0U); +} + +} // namespace raft + +namespace reflection { ss::future<> async_adl::to( iobuf& out, raft::append_entries_request&& request) { return model::consume_reader_to_memory( - std::move(request.batches), model::no_timeout) + std::move(request.batches()), model::no_timeout) .then([&out, request = std::move(request)]( ss::circular_buffer batches) { reflection::adl{}.to(out, batches.size()); @@ -240,12 +660,6 @@ void adl::to( idx); } -template -T varlong_reader(iobuf_parser& in) { - auto [val, len] = in.read_varlong(); - return T(val); -} - raft::protocol_metadata adl::from(iobuf_parser& in) { raft::protocol_metadata ret; ret.group = varlong_reader(in); @@ -256,87 +670,11 @@ raft::protocol_metadata adl::from(iobuf_parser& in) { ret.last_visible_index = varlong_reader(in); return ret; } -namespace internal { -struct hbeat_soa { - explicit hbeat_soa(size_t n) - : groups(n) - , commit_indices(n) - , terms(n) - , prev_log_indices(n) - , prev_log_terms(n) - , last_visible_indices(n) - , revisions(n) - , target_revisions(n) {} - - ~hbeat_soa() noexcept = default; - 
hbeat_soa(const hbeat_soa&) = delete; - hbeat_soa& operator=(const hbeat_soa&) = delete; - hbeat_soa(hbeat_soa&&) noexcept = default; - hbeat_soa& operator=(hbeat_soa&&) noexcept = default; - - std::vector groups; - std::vector commit_indices; - std::vector terms; - std::vector prev_log_indices; - std::vector prev_log_terms; - std::vector last_visible_indices; - std::vector revisions; - std::vector target_revisions; -}; - -struct hbeat_response_array { - explicit hbeat_response_array(size_t n) - : groups(n) - , terms(n) - , last_flushed_log_index(n) - , last_dirty_log_index(n) - , last_term_base_offset(n) - , revisions(n) - , target_revisions(n) {} - - std::vector groups; - std::vector terms; - std::vector last_flushed_log_index; - std::vector last_dirty_log_index; - std::vector last_term_base_offset; - std::vector revisions; - std::vector target_revisions; -}; -template -void encode_one_vint(iobuf& out, const T& t) { - auto b = vint::to_bytes(t); - // NOLINTNEXTLINE - out.append(reinterpret_cast(b.data()), b.size()); -} - -template -void encode_varint_delta(iobuf& out, const T& prev, const T& current) { - // TODO: use delta-delta: - // https://github.com/facebookarchive/beringei/blob/92784ec6e2/beringei/lib/BitUtil.cpp - auto delta = current - prev; - encode_one_vint(out, delta); -} - -template -void encode_one_delta_array(iobuf& o, const std::vector& v) { - if (v.empty()) { - return; - } - const size_t max = v.size(); - encode_one_vint(o, v[0]); - for (size_t i = 1; i < max; ++i) { - encode_varint_delta(o, v[i - 1], v[i]); - } -} -template -T read_one_varint_delta(iobuf_parser& in, const T& prev) { - auto dst = varlong_reader(in); - return prev + dst; -} -} // namespace internal ss::future<> async_adl::to( iobuf& out, raft::heartbeat_request&& request) { + vassert( + !request.heartbeats.empty(), "cannot serialize empty heartbeats request"); struct sorter_fn { constexpr bool operator()( const raft::heartbeat_metadata& lhs, @@ -405,11 +743,6 @@ ss::future<> 
async_adl::to( }); } -template -T decode_signed(T value) { - return value < T(0) ? T{} : value; -} - ss::future async_adl::from(iobuf_parser& in) { raft::heartbeat_request req; diff --git a/src/v/raft/types.h b/src/v/raft/types.h index a017935fbb3d6..46d4d730637fd 100644 --- a/src/v/raft/types.h +++ b/src/v/raft/types.h @@ -44,7 +44,9 @@ static constexpr clock_type::time_point no_timeout = clock_type::time_point::max(); using group_id = named_type; -struct protocol_metadata { + +struct protocol_metadata + : serde::envelope> { group_id group; model::offset commit_index; model::term_id term; @@ -54,6 +56,19 @@ struct protocol_metadata { friend std::ostream& operator<<(std::ostream& o, const protocol_metadata& m); + + friend bool operator==(const protocol_metadata&, const protocol_metadata&) + = default; + + auto serde_fields() { + return std::tie( + group, + commit_index, + term, + prev_log_index, + prev_log_term, + last_visible_index); + } }; // The sequence used to track the order of follower append entries request @@ -177,9 +192,16 @@ struct follower_metrics { bool under_replicated; }; -struct append_entries_request { +struct append_entries_request + : serde::envelope> { using flush_after_append = ss::bool_class; + /* + * default initialize with no record batch reader. default construction + * should only be used by serialization frameworks. 
+ */ + append_entries_request() noexcept = default; + // required for the cases where we will set the target node id before // sending request to the node append_entries_request( @@ -189,8 +211,8 @@ struct append_entries_request { flush_after_append f = flush_after_append::yes) noexcept : node_id(src) , meta(m) - , batches(std::move(r)) - , flush(f){}; + , flush(f) + , _batches(std::move(r)) {} append_entries_request( vnode src, @@ -201,8 +223,8 @@ struct append_entries_request { : node_id(src) , target_node_id(target) , meta(m) - , batches(std::move(r)) - , flush(f){}; + , flush(f) + , _batches(std::move(r)) {} ~append_entries_request() noexcept = default; append_entries_request(const append_entries_request&) = delete; append_entries_request& operator=(const append_entries_request&) = delete; @@ -215,19 +237,48 @@ struct append_entries_request { vnode node_id; vnode target_node_id; protocol_metadata meta; - model::record_batch_reader batches; + model::record_batch_reader& batches() { + /* + * note that some call sites do: + * + * auto b = std::move(req.batches()) + * + * which does not reset the std::optional value. so this assertion is + * merely here to protect against use of a default constructed request. + */ + vassert(_batches.has_value(), "request contains no batches"); + return _batches.value(); + } flush_after_append flush; static append_entries_request make_foreign(append_entries_request&& req) { return append_entries_request( req.node_id, req.target_node_id, std::move(req.meta), - model::make_foreign_record_batch_reader(std::move(req.batches)), + model::make_foreign_record_batch_reader(std::move(req.batches())), req.flush); } + + ss::future<> serde_async_write(iobuf& out); + ss::future<> serde_async_read(iobuf_parser&, const serde::header&); + +private: + /* + * batches is optional to allow append_entries_request to have a default + * constructor and integrate with serde until serde provides a more powerful + * interface for dealing with this. 
+ */ + std::optional _batches; }; -struct append_entries_reply { +/* + * append_entries_reply uses two different types of serialization: when + * encoding/decoding directly normal adl/serde per-field serialization is used. + * the second type is a custom encoding used by heartbeat_reply for more + * efficient encoding of a vector of append_entries_reply. + */ +struct append_entries_reply + : serde::envelope> { enum class status : uint8_t { success, failure, @@ -254,12 +305,31 @@ struct append_entries_reply { friend std::ostream& operator<<(std::ostream& o, const append_entries_reply& r); + + friend bool + operator==(const append_entries_reply&, const append_entries_reply&) + = default; + + auto serde_fields() { + return std::tie( + target_node_id, + node_id, + group, + term, + last_flushed_log_index, + last_dirty_log_index, + last_term_base_offset, + result); + } }; struct heartbeat_metadata { protocol_metadata meta; vnode node_id; vnode target_node_id; + + friend bool operator==(const heartbeat_metadata&, const heartbeat_metadata&) + = default; }; /// \brief this is our _biggest_ modification to how raft works @@ -268,14 +338,38 @@ struct heartbeat_metadata { /// at a time, as well as the receiving side will trigger the /// individual raft responses one at a time - for example to start replaying the /// log at some offset -struct heartbeat_request { +struct heartbeat_request + : serde::envelope> { std::vector heartbeats; + + heartbeat_request() noexcept = default; + explicit heartbeat_request(std::vector heartbeats) + : heartbeats(std::move(heartbeats)) {} + friend std::ostream& operator<<(std::ostream& o, const heartbeat_request& r); + + friend bool operator==(const heartbeat_request&, const heartbeat_request&) + = default; + + ss::future<> serde_async_write(iobuf& out); + void serde_read(iobuf_parser&, const serde::header&); }; -struct heartbeat_reply { + +struct heartbeat_reply : serde::envelope> { std::vector meta; + + heartbeat_reply() noexcept = default; + 
explicit heartbeat_reply(std::vector meta) + : meta(std::move(meta)) {} + friend std::ostream& operator<<(std::ostream& o, const heartbeat_reply& r); + + friend bool operator==(const heartbeat_reply&, const heartbeat_reply&) + = default; + + void serde_write(iobuf& out); + void serde_read(iobuf_parser&, const serde::header&); }; struct vote_request : serde::envelope> { @@ -811,4 +905,41 @@ struct adl { }; } }; + +template<> +struct adl { + void to(iobuf& out, raft::append_entries_reply&& r) { + serialize( + out, + r.target_node_id, + r.node_id, + r.group, + r.term, + r.last_flushed_log_index, + r.last_dirty_log_index, + r.last_term_base_offset, + r.result); + } + raft::append_entries_reply from(iobuf_parser& in) { + auto target_node_id = adl{}.from(in); + auto node_id = adl{}.from(in); + auto group = adl{}.from(in); + auto term = adl{}.from(in); + auto last_flushed_log_index = adl{}.from(in); + auto last_dirty_log_index = adl{}.from(in); + auto last_term_base_offset = adl{}.from(in); + auto result = adl{}.from(in); + return { + .target_node_id = target_node_id, + .node_id = node_id, + .group = group, + .term = term, + .last_flushed_log_index = last_flushed_log_index, + .last_dirty_log_index = last_dirty_log_index, + .last_term_base_offset = last_term_base_offset, + .result = result, + }; + } +}; + } // namespace reflection diff --git a/src/v/redpanda/admin_server.cc b/src/v/redpanda/admin_server.cc index bb75946a1927d..59e7348e40367 100644 --- a/src/v/redpanda/admin_server.cc +++ b/src/v/redpanda/admin_server.cc @@ -2465,10 +2465,9 @@ void admin_server::register_partition_routes() { replica.core = bs.shard; r.previous_replicas.push(replica); } - co_await ss::coroutine::maybe_yield(); ret.push_back(std::move(r)); } - co_return std::move(ret); + co_return ret; }); } diff --git a/src/v/redpanda/application.cc b/src/v/redpanda/application.cc index a5fecfcab7316..bc5f02d47261f 100644 --- a/src/v/redpanda/application.cc +++ b/src/v/redpanda/application.cc @@ -58,6 +58,7 
@@ #include "redpanda/admin_server.h" #include "resource_mgmt/io_priority.h" #include "rpc/simple_protocol.h" +#include "ssx/metrics.h" #include "storage/backlog_controller.h" #include "storage/chunk_cache.h" #include "storage/compaction_controller.h" @@ -313,6 +314,16 @@ void application::initialize( } void application::setup_metrics() { + if (!config::shard_local_cfg().disable_public_metrics()) { + seastar::metrics::replicate_metric_families( + seastar::metrics::default_handle(), + {{"io_queue_total_read_ops", ssx::metrics::public_metrics_handle}, + {"io_queue_total_write_ops", ssx::metrics::public_metrics_handle}, + {"memory_allocated_memory", ssx::metrics::public_metrics_handle}, + {"memory_free_memory", ssx::metrics::public_metrics_handle}}) + .get(); + } + if (config::shard_local_cfg().disable_metrics()) { return; } @@ -726,7 +737,8 @@ void application::wire_up_redpanda_services() { std::ref(tx_gateway_frontend), std::ref(partition_recovery_manager), std::ref(cloud_storage_api), - std::ref(shadow_index_cache)) + std::ref(shadow_index_cache), + std::ref(_feature_table)) .get(); vlog(_log.info, "Partition manager started"); @@ -930,6 +942,8 @@ void application::wire_up_redpanda_services() { c.max_service_memory_per_core = memory_groups::rpc_total_memory(); c.disable_metrics = net::metrics_disabled( config::shard_local_cfg().disable_metrics()); + c.disable_public_metrics = net::public_metrics_disabled( + config::shard_local_cfg().disable_public_metrics()); c.listen_backlog = config::shard_local_cfg().rpc_server_listen_backlog; c.tcp_recv_buf @@ -1072,7 +1086,6 @@ void application::wire_up_redpanda_services() { auto& tls_config = config::node().kafka_api_tls.value(); for (const auto& ep : config::node().kafka_api()) { ss::shared_ptr credentails; - std::optional tls_pm; // find credentials for this endpoint auto it = find_if( tls_config.begin(), @@ -1101,24 +1114,16 @@ void application::wire_up_redpanda_services() { }) .get0() : nullptr; - - auto tls_pm_rules - = 
it->config.get_principal_mapping_rules(); - if (tls_pm_rules) { - tls_pm = security::tls::principal_mapper( - tls_pm_rules); - } } c.addrs.emplace_back( - ep.name, - net::resolve_dns(ep.address).get0(), - credentails, - std::move(tls_pm)); + ep.name, net::resolve_dns(ep.address).get0(), credentails); } c.disable_metrics = net::metrics_disabled( config::shard_local_cfg().disable_metrics()); + c.disable_public_metrics = net::public_metrics_disabled( + config::shard_local_cfg().disable_public_metrics()); net::config_connection_rate_bindings bindings{ .config_general_rate @@ -1326,6 +1331,7 @@ void application::start_redpanda(::stop_signal& app_signal) { if (!config::shard_local_cfg().disable_metrics()) { proto->setup_metrics(); } + s.set_protocol(std::move(proto)); }) .get(); diff --git a/src/v/redpanda/tests/fixture.h b/src/v/redpanda/tests/fixture.h index b5986f36ecaf7..99be33f5c414a 100644 --- a/src/v/redpanda/tests/fixture.h +++ b/src/v/redpanda/tests/fixture.h @@ -19,6 +19,7 @@ #include "cluster/shard_table.h" #include "cluster/topics_frontend.h" #include "cluster/types.h" +#include "config/broker_authn_endpoint.h" #include "config/node_config.h" #include "coproc/api.h" #include "kafka/client/transport.h" @@ -183,9 +184,10 @@ class redpanda_thread_fixture { node_config.get("rpc_server") .set_value(net::unresolved_address("127.0.0.1", rpc_port)); node_config.get("kafka_api") - .set_value( - std::vector{model::broker_endpoint( - net::unresolved_address("127.0.0.1", kafka_port))}); + .set_value(std::vector{ + config::broker_authn_endpoint{ + .address = net::unresolved_address( + "127.0.0.1", kafka_port)}}); node_config.get("data_directory") .set_value(config::data_directory_path{.path = base_path}); node_config.get("coproc_supervisor_server") @@ -412,7 +414,7 @@ class redpanda_thread_fixture { net::server::resources(nullptr, nullptr), std::move(sasl), false, - false); + std::nullopt); kafka::request_header header; auto encoder_context = kafka::request_context( diff 
--git a/src/v/rpc/parse_utils.h b/src/v/rpc/parse_utils.h index 7bd0b8c7dcef9..2566533d59d3e 100644 --- a/src/v/rpc/parse_utils.h +++ b/src/v/rpc/parse_utils.h @@ -82,7 +82,191 @@ inline void validate_payload_and_header(const iobuf& io, const header& h) { } } +/* + * the transition from adl to serde encoding in rpc requires a period of time + * where both encodings are supported for all message types. however, we do not + * want to extend this requirement to brand new messages / services, nor to rpc + * types used in coproc which will remain in legacy adl format for now. + * + * we use the type system to enforce these rules and allow types to be opt-out + * on a case-by-case basis for adl (new messages) or serde (legacy like coproc). + * + * the `rpc_adl_exempt` and `rpc_serde_exempt` type trait helpers can be used to + * opt-out a type T from adl or serde support. a type is marked exempt by + * defining the type `T::rpc_(adl|serde)_exempt`. the typedef may be defined as + * any type such as std::{void_t, true_type}. + * + * Example: + * + * struct exempt_msg { + * using rpc_adl_exempt = std::true_type; + * ... + * }; + * + * then use the `is_rpc_adl_exempt` or `is_rpc_serde_exempt` concept to test. + */ +template +concept is_rpc_adl_exempt = requires { + typename T::rpc_adl_exempt; +}; + +template +concept is_rpc_serde_exempt = requires { + typename T::rpc_serde_exempt; +}; + +/* + * Encode a client request for the given transport version. + * + * Unless the message type T is explicitly exempt from adl<> support, type T + * must be supported by both adl<> and serde encoding frameworks. When the type + * is not exempt from adl<> support, serde is used when the version >= v2. + * + * The returned version indicates what level of encoding is used. This is always + * equal to the input version, except for adl-only messages which return v0 and + * serde-only messages which return v2. + * Callers are expected to further validate the runtime implications of this. + 
+ */ template +ss::future +encode_for_version(iobuf& out, T msg, transport_version version) { + static_assert(!is_rpc_adl_exempt || !is_rpc_serde_exempt); + + if constexpr (is_rpc_serde_exempt) { + return reflection::async_adl{}.to(out, std::move(msg)).then([] { + return transport_version::v0; + }); + } else if constexpr (is_rpc_adl_exempt) { + return ss::do_with(std::move(msg), [&out](T& msg) { + return serde::write_async(out, std::move(msg)).then([] { + return transport_version::v2; + }); + }); + } else { + if (version < transport_version::v2) { + return reflection::async_adl{} + .to(out, std::move(msg)) + .then([version] { return version; }); + } else { + return ss::do_with(std::move(msg), [&out, version](T& msg) { + return serde::write_async(out, std::move(msg)).then([version] { + return version; + }); + }); + } + } +} + +/* + * Decode a client request at the given transport version. + */ +template +ss::future +decode_for_version(iobuf_parser& parser, transport_version version) { + static_assert(!is_rpc_adl_exempt || !is_rpc_serde_exempt); + + if constexpr (is_rpc_serde_exempt) { + if (version != transport_version::v0) { + return ss::make_exception_future(std::runtime_error(fmt::format( + "Unexpected adl-only message {} at {} != v0", + typeid(T).name(), + version))); + } + return reflection::async_adl{}.from(parser); + } else if constexpr (is_rpc_adl_exempt) { + if (version < transport_version::v2) { + return ss::make_exception_future(std::runtime_error(fmt::format( + "Unexpected serde-only message {} at {} < v2", + typeid(T).name(), + version))); + } + return serde::read_async(parser); + } else { + if (version < transport_version::v2) { + return reflection::async_adl{}.from(parser); + } else { + return serde::read_async(parser); + } + } +} + +/* + * type used to factor out version-specific functionality from request handling + * in services. this is used so that tests can specialize behavior. 
+ * + * this is the default mixin that is used by the code generator. + */ +struct default_message_codec { + /* + * decodes a request (server) or response (client) + */ + template + static ss::future + decode(iobuf_parser& parser, transport_version version) { + return decode_for_version(parser, version); + } + + /* + * Used by the server to determine which version to use when sending a response + * back to the client. The default behavior is to maintain the same version as + * the received request. + */ + static transport_version response_version(const header& h) { + return h.version; + } + + /* + * encodes a request (client) or response (server) + */ + template + static ss::future + encode(iobuf& out, T msg, transport_version version) { + return encode_for_version(out, std::move(msg), version); + } +}; + +/* + * service specialization mixin to create a v0 compliant service. a v0 service + * encodes and decodes using adl, ignores versions on requests, and sends + * replies with v0 in the header. + * + * example: + * using echo_service_v0 = echo_service_base; + * + * Note that for serde-only messages a vassert(false) is generated. First, + * the v0_message_codec is only used in tests. Second, serde usage is not + * possible in v0 servers, so this restriction is realistic. And from a + * practical standpoint this allows us to avoid bifurcation of services (or more + * sfinae magic) in tests so that serde-only types were never present within a + * service configured with a v0_message_codec. + 
+ */ +struct v0_message_codec { + template + static ss::future decode(iobuf_parser& parser, transport_version) { + if constexpr (is_rpc_adl_exempt) { + vassert(false, "Cannot use serde-only types in v0 server"); + } else { + return reflection::async_adl{}.from(parser); + } + } + + static transport_version response_version(const header&) { + return transport_version::v0; + } + + template + static ss::future + encode(iobuf& out, T msg, transport_version) { + if constexpr (is_rpc_adl_exempt) { + vassert(false, "Cannot use serde-only types in v0 server"); + } else { + return reflection::async_adl{}.to(out, std::move(msg)).then([] { + return transport_version::v0; + }); + } + } +}; + +template ss::future parse_type(ss::input_stream& in, const header& h) { return read_iobuf_exactly(in, h.payload_size).then([h](iobuf io) { validate_payload_and_header(io, h); @@ -104,8 +288,8 @@ ss::future parse_type(ss::input_stream& in, const header& h) { auto p = std::make_unique(std::move(io)); auto raw = p.get(); - return reflection::async_adl{}.from(*raw).finally( - [p = std::move(p)] {}); + return Codec::template decode(*raw, h.version) + .finally([p = std::move(p)] {}); }); } diff --git a/src/v/rpc/service.h b/src/v/rpc/service.h index f61bbb3a922b8..e1d4abd615cd9 100644 --- a/src/v/rpc/service.h +++ b/src/v/rpc/service.h @@ -26,7 +26,7 @@ namespace rpc { /// \brief most service implementations will be codegenerated struct service { - template + template struct execution_helper; service() = default; @@ -50,7 +50,7 @@ class rpc_internal_body_parsing_exception : public std::exception { seastar::sstring _what; }; -template +template struct service::execution_helper { using input = Input; using output = Output; @@ -63,7 +63,7 @@ struct service::execution_helper { Func&& f) { return ctx.permanent_memory_reservation(ctx.get_header().payload_size) .then([f = std::forward(f), method_id, &in, &ctx]() mutable { - return parse_type(in, ctx.get_header()) + return parse_type(in, 
ctx.get_header()) .then_wrapped([f = std::forward(f), &ctx](ss::future input_f) mutable { if (input_f.failed()) { @@ -74,13 +74,29 @@ struct service::execution_helper { auto input = input_f.get0(); return f(std::move(input), ctx); }) - .then([method_id](Output out) mutable { + .then([method_id, &ctx](Output out) mutable { + const auto version = Codec::response_version( + ctx.get_header()); auto b = std::make_unique(); auto raw_b = b.get(); raw_b->set_service_method_id(method_id); - return reflection::async_adl{} - .to(raw_b->buffer(), std::move(out)) - .then([b = std::move(b)] { return std::move(*b); }); + raw_b->set_version(version); + return Codec::encode( + raw_b->buffer(), std::move(out), version) + .then([version, b = std::move(b)]( + transport_version effective_version) { + /* + * this assertion is safe because the conditions under + * which this assertion would fail should have been + * verified in parse_type above. + */ + vassert( + effective_version == version, + "Unexpected encoding at effective {} != {}", + effective_version, + version); + return std::move(*b); + }); }); }); } diff --git a/src/v/rpc/test/echo_service.json b/src/v/rpc/test/echo_service.json index ec1b0fea1e5bb..64813b3021fc9 100644 --- a/src/v/rpc/test/echo_service.json +++ b/src/v/rpc/test/echo_service.json @@ -34,6 +34,21 @@ "name": "throw_exception", "input_type": "throw_req", "output_type": "throw_resp" + }, + { + "name": "echo_adl_only", + "input_type": "echo_req_adl_only", + "output_type": "echo_resp_adl_only" + }, + { + "name": "echo_adl_serde", + "input_type": "echo_req_adl_serde", + "output_type": "echo_resp_adl_serde" + }, + { + "name": "echo_serde_only", + "input_type": "echo_req_serde_only", + "output_type": "echo_resp_serde_only" } ] } diff --git a/src/v/rpc/test/netbuf_tests.cc b/src/v/rpc/test/netbuf_tests.cc index 3d8ac0112643c..0690da1a09544 100644 --- a/src/v/rpc/test/netbuf_tests.cc +++ b/src/v/rpc/test/netbuf_tests.cc @@ -21,8 +21,9 @@ namespace rpc { /// \brief 
expects the inputstream to be prefixed by an rpc::header template ss::future parse_framed(ss::input_stream& in) { - return parse_header(in).then( - [&in](std::optional
o) { return parse_type(in, o.value()); }); + return parse_header(in).then([&in](std::optional
o) { + return parse_type(in, o.value()); + }); } } // namespace rpc diff --git a/src/v/rpc/test/rpc_gen_cycling_test.cc b/src/v/rpc/test/rpc_gen_cycling_test.cc index 63f50e9542b2b..b95c5b893ef35 100644 --- a/src/v/rpc/test/rpc_gen_cycling_test.cc +++ b/src/v/rpc/test/rpc_gen_cycling_test.cc @@ -10,6 +10,7 @@ #include "model/timeout_clock.h" #include "random/generators.h" #include "rpc/exceptions.h" +#include "rpc/parse_utils.h" #include "rpc/test/cycling_service.h" #include "rpc/test/echo_service.h" #include "rpc/test/rpc_gen_types.h" @@ -35,9 +36,10 @@ using namespace std::chrono_literals; // NOLINT // Test services -struct movistar final : cycling::team_movistar_service { +template +struct movistar final : cycling::team_movistar_service_base { movistar(ss::scheduling_group& sc, ss::smp_service_group& ssg) - : cycling::team_movistar_service(sc, ssg) {} + : cycling::team_movistar_service_base(sc, ssg) {} ss::future ibis_hakka(cycling::san_francisco&&, rpc::streaming_context&) final { return ss::make_ready_future( @@ -50,9 +52,10 @@ struct movistar final : cycling::team_movistar_service { } }; -struct echo_impl final : echo::echo_service { +template +struct echo_impl final : echo::echo_service_base { echo_impl(ss::scheduling_group& sc, ss::smp_service_group& ssg) - : echo::echo_service(sc, ssg) {} + : echo::echo_service_base(sc, ssg) {} ss::future echo(echo::echo_req&& req, rpc::streaming_context&) final { return ss::make_ready_future( @@ -95,6 +98,24 @@ struct echo_impl final : echo::echo_service { } } + ss::future echo_adl_only( + echo::echo_req_adl_only&& req, rpc::streaming_context&) final { + return ss::make_ready_future( + echo::echo_resp_adl_only{.str = req.str}); + } + + ss::future echo_adl_serde( + echo::echo_req_adl_serde&& req, rpc::streaming_context&) final { + return ss::make_ready_future( + echo::echo_resp_adl_serde{.str = req.str}); + } + + ss::future echo_serde_only( + echo::echo_req_serde_only&& req, rpc::streaming_context&) final { + return 
ss::make_ready_future( + echo::echo_resp_serde_only{.str = req.str}); + } + uint64_t cnt = 0; }; @@ -104,8 +125,13 @@ class rpc_integration_fixture : public rpc_simple_integration_fixture { : rpc_simple_integration_fixture(redpanda_rpc_port) {} void register_services() { - register_service(); - register_service(); + register_service>(); + register_service>(); + } + + void register_services_v0() { + register_service>(); + register_service>(); } static constexpr uint16_t redpanda_rpc_port = 32147; @@ -133,8 +159,7 @@ FIXTURE_TEST(echo_round_trip_tls, rpc_integration_fixture) { true, config::key_cert{"redpanda.key", "redpanda.crt"}, "root_certificate_authority.chain_cert", - false, - std::nullopt) + false) .get_credentials_builder() .get0(); @@ -210,8 +235,7 @@ FIXTURE_TEST(rpcgen_reload_credentials_integration, rpc_integration_fixture) { config::key_cert{ client_key.native(), client_crt.native()}, client_ca.native(), - true, - std::nullopt) + true) .get_credentials_builder() .get0(); // server credentials @@ -224,8 +248,7 @@ FIXTURE_TEST(rpcgen_reload_credentials_integration, rpc_integration_fixture) { config::key_cert{ server_key.native(), server_crt.native()}, server_ca.native(), - true, - std::nullopt) + true) .get_credentials_builder() .get0(); @@ -399,6 +422,7 @@ FIXTURE_TEST(missing_method_test, rpc_integration_fixture) { rpc::transport t(client_config()); t.connect(model::no_timeout).get(); + auto stop = ss::defer([&t] { t.stop().get(); }); auto client = echo::echo_client_protocol(t); const auto check_missing = [&] { @@ -444,8 +468,6 @@ FIXTURE_TEST(missing_method_test, rpc_integration_fixture) { } ss::when_all_succeed(requests.begin(), requests.end()).get(); - - t.stop().get(); } FIXTURE_TEST(corrupted_header_at_client_test, rpc_integration_fixture) { @@ -534,6 +556,14 @@ FIXTURE_TEST(corrupted_data_at_server, rpc_integration_fixture) { } } +/* + * the not_supported_version test uses the echo_adl_serde variant rather than + * the original version whose types 
cause it to be treated as adl-only. Because + * adl-only messages are sent at v0 and the test specifically requires sending + * messages at an arbitrarily higher value to trigger the error, a type was + * needed that supports a dynamic version range. When encoding adl/serde + * supported types the version is passed through. + */ FIXTURE_TEST(version_not_supported, rpc_integration_fixture) { configure_server(); register_services(); @@ -541,12 +571,15 @@ FIXTURE_TEST(version_not_supported, rpc_integration_fixture) { rpc::transport t(client_config()); t.connect(model::no_timeout).get(); + auto stop = ss::defer([&t] { t.stop().get(); }); auto client = echo::echo_client_protocol(t); const auto check_unsupported = [&] { - auto f = t.send_typed_versioned( - echo::echo_req{.str = "testing..."}, - 960598415, + auto f = t.send_typed_versioned< + echo::echo_req_adl_serde, + echo::echo_resp_adl_serde>( + echo::echo_req_adl_serde{.str = "testing..."}, + echo::echo_service::echo_adl_serde_method_id, rpc::client_opts(rpc::no_timeout), rpc::transport_version::unsupported); return f.then([&](auto ret) { @@ -564,12 +597,17 @@ FIXTURE_TEST(version_not_supported, rpc_integration_fixture) { }; const auto check_supported = [&] { - auto f = client.echo( - echo::echo_req{.str = "testing..."}, + auto f = client.echo_adl_serde( + echo::echo_req_adl_serde{.str = "testing..."}, rpc::client_opts(rpc::no_timeout)); return f.then([&](auto ret) { BOOST_REQUIRE(ret.has_value()); - BOOST_REQUIRE_EQUAL(ret.value().data.str, "testing..."); + // could be either one. 
depends on timing of transport upgrade + BOOST_REQUIRE( + ret.value().data.str + == "testing..._to_aas_from_aas_to_aas_from_aas" + || ret.value().data.str + == "testing..._to_sas_from_sas_to_sas_from_sas"); }); }; @@ -595,8 +633,6 @@ FIXTURE_TEST(version_not_supported, rpc_integration_fixture) { } ss::when_all_succeed(requests.begin(), requests.end()).get(); - - t.stop().get(); } class erroneous_protocol_exception : public std::exception {}; @@ -630,8 +666,8 @@ class erroneous_service_fixture : rpc_fixture_swappable_proto(redpanda_rpc_port) {} void register_services() { - register_service(); - register_service(); + register_service>(); + register_service>(); } static constexpr uint16_t redpanda_rpc_port = 32147; @@ -653,3 +689,253 @@ FIXTURE_TEST(unhandled_throw_in_proto_apply, erroneous_service_fixture) { .get(); t.stop().get(); } + +/* + * new client, new server + * client has initial transport version v1 + * sends adl+serde message at (adl,v1) + * client has transport upgraded to v2 + * client transport remains at v2 + */ +FIXTURE_TEST(nc_ns_adl_serde_client_upgraded, rpc_integration_fixture) { + configure_server(); + register_services(); + start_server(); + + rpc::transport t(client_config()); + t.connect(model::no_timeout).get(); + auto stop = ss::defer([&t] { t.stop().get(); }); + auto client = echo::echo_client_protocol(t); + + BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v1); + + // first messages are sent with adl + { + const auto payload = random_generators::gen_alphanum_string(100); + auto f = client.echo_adl_serde( + echo::echo_req_adl_serde{.str = payload}, + rpc::client_opts(rpc::no_timeout)); + auto ret = f.get(); + BOOST_REQUIRE(ret.has_value()); + BOOST_REQUIRE_EQUAL( + ret.value().data.str, payload + "_to_aas_from_aas_to_aas_from_aas"); + } + + // subsequent messages use serde + for (int i = 0; i < 10; i++) { + const auto payload = random_generators::gen_alphanum_string(100); + auto f = client.echo_adl_serde( + 
echo::echo_req_adl_serde{.str = payload}, + rpc::client_opts(rpc::no_timeout)); + auto ret = f.get(); + BOOST_REQUIRE(ret.has_value()); + BOOST_REQUIRE_EQUAL( + ret.value().data.str, payload + "_to_sas_from_sas_to_sas_from_sas"); + + // upgraded and remains at v2 + BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v2); + } +} + +/* + * new client, new server + * client has initial transport version v1 + * sends serde-only message at (serde,v2) + * client has transport upgraded to v2 + * client transport remains at v2 + */ +FIXTURE_TEST(nc_ns_serde_only_client_upgraded, rpc_integration_fixture) { + configure_server(); + register_services(); + start_server(); + + rpc::transport t(client_config()); + t.connect(model::no_timeout).get(); + auto stop = ss::defer([&t] { t.stop().get(); }); + auto client = echo::echo_client_protocol(t); + + BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v1); + + for (int i = 0; i < 10; i++) { + const auto payload = random_generators::gen_alphanum_string(100); + auto f = client.echo_serde_only( + echo::echo_req_serde_only{.str = payload}, + rpc::client_opts(rpc::no_timeout)); + auto ret = f.get(); + BOOST_REQUIRE(ret.has_value()); + BOOST_REQUIRE_EQUAL( + ret.value().data.str, payload + "_to_sso_from_sso_to_sso_from_sso"); + + // upgraded and remains at v2 + BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v2); + } +} + +/* + * new client, new server + * client sends adl-only message (adl,v1) + * client remains pinned at v1 + * + * client will not be upgraded. adl-only messages are always set at v0 and the + * server will always respond with v0 messages. upgrade doesn't happen because + * client only upgrades in response to a v1 or v2 message. + * + * this case is for the interim development period where we are allowing types + * with only adl support until all types have serde support added. 
+ */ +FIXTURE_TEST(nc_ns_adl_only_no_client_upgrade, rpc_integration_fixture) { + configure_server(); + register_services(); + start_server(); + + rpc::transport t(client_config()); + t.connect(model::no_timeout).get(); + auto stop = ss::defer([&t] { t.stop().get(); }); + auto client = echo::echo_client_protocol(t); + + BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v1); + + for (int i = 0; i < 10; i++) { + const auto payload = random_generators::gen_alphanum_string(100); + auto f = client.echo_adl_only( + echo::echo_req_adl_only{.str = payload}, + rpc::client_opts(rpc::no_timeout)); + auto ret = f.get(); + BOOST_REQUIRE(ret.has_value()); + BOOST_REQUIRE_EQUAL( + ret.value().data.str, payload + "_to_aao_from_aao_to_aao_from_aao"); + + // no upgrade + BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v1); + } +} + +/* + * new client, old server + * client has initial transport version v1 + * [sends adl+serde message at (adl,v1)] * N + * client transport version is not upgraded + */ +FIXTURE_TEST(nc_os_adl_serde_no_client_upgrade, rpc_integration_fixture) { + configure_server(); + register_services_v0(); + start_server(); + + rpc::transport t(client_config()); + t.connect(model::no_timeout).get(); + auto stop = ss::defer([&t] { t.stop().get(); }); + auto client = echo::echo_client_protocol(t); + + // client initially at v1 + BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v1); + + for (int i = 0; i < 10; i++) { + const auto payload = random_generators::gen_alphanum_string(100); + auto f = client.echo_adl_serde( + echo::echo_req_adl_serde{.str = payload}, + rpc::client_opts(rpc::no_timeout)); + auto ret = f.get(); + BOOST_REQUIRE(ret.has_value()); + BOOST_REQUIRE_EQUAL( + ret.value().data.str, payload + "_to_aas_from_aas_to_aas_from_aas"); + + // client stays at v1 without upgrade to v2 + BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v1); + } +} + +/* + * new client, old server + * client has initial transport version v1 + * [sends 
adl-only message at (adl,v1)] * N + * client transport version is not upgraded + */ +FIXTURE_TEST(nc_os_adl_only_no_client_upgrade, rpc_integration_fixture) { + configure_server(); + register_services_v0(); + start_server(); + + rpc::transport t(client_config()); + t.connect(model::no_timeout).get(); + auto stop = ss::defer([&t] { t.stop().get(); }); + auto client = echo::echo_client_protocol(t); + + // client initially at v1 + BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v1); + + for (int i = 0; i < 10; i++) { + const auto payload = random_generators::gen_alphanum_string(100); + auto f = client.echo_adl_only( + echo::echo_req_adl_only{.str = payload}, + rpc::client_opts(rpc::no_timeout)); + auto ret = f.get(); + BOOST_REQUIRE(ret.has_value()); + BOOST_REQUIRE_EQUAL( + ret.value().data.str, payload + "_to_aao_from_aao_to_aao_from_aao"); + + // client stays at v1 without upgrade to v2 + BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v1); + } +} + +/* + * old client, new server + * sends an adl encoded message which the server understands but also has serde + * support for. communication should continue to use adl.
+ */ +FIXTURE_TEST(oc_ns_adl_serde_no_upgrade, rpc_integration_fixture) { + configure_server(); + register_services(); + start_server(); + + rpc::transport t(client_config()); + t.set_version(rpc::transport_version::v0); + t.connect(model::no_timeout).get(); + auto stop = ss::defer([&t] { t.stop().get(); }); + auto client = echo::echo_client_protocol(t); + + BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v0); + + for (int i = 0; i < 10; i++) { + const auto payload = random_generators::gen_alphanum_string(100); + auto f = client.echo_adl_serde( + echo::echo_req_adl_serde{.str = payload}, + rpc::client_opts(rpc::no_timeout)); + auto ret = f.get(); + BOOST_REQUIRE(ret.has_value()); + BOOST_REQUIRE_EQUAL( + ret.value().data.str, payload + "_to_aas_from_aas_to_aas_from_aas"); + BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v0); + } +} + +/* + * old client, new server + * adl-only. verifies behavior for intermediate state when we support adl-only + * messages. + */ +FIXTURE_TEST(oc_ns_adl_only_no_upgrade, rpc_integration_fixture) { + configure_server(); + register_services(); + start_server(); + + rpc::transport t(client_config()); + t.set_version(rpc::transport_version::v0); + t.connect(model::no_timeout).get(); + auto stop = ss::defer([&t] { t.stop().get(); }); + auto client = echo::echo_client_protocol(t); + + BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v0); + + for (int i = 0; i < 10; i++) { + const auto payload = random_generators::gen_alphanum_string(100); + auto f = client.echo_adl_only( + echo::echo_req_adl_only{.str = payload}, + rpc::client_opts(rpc::no_timeout)); + auto ret = f.get(); + BOOST_REQUIRE(ret.has_value()); + BOOST_REQUIRE_EQUAL( + ret.value().data.str, payload + "_to_aao_from_aao_to_aao_from_aao"); + BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v0); + } +} diff --git a/src/v/rpc/test/rpc_gen_types.h b/src/v/rpc/test/rpc_gen_types.h index 7527d8e9c2f87..bdd61b46315b5 100644 --- 
a/src/v/rpc/test/rpc_gen_types.h +++ b/src/v/rpc/test/rpc_gen_types.h @@ -11,7 +11,11 @@ #pragma once +#include "reflection/adl.h" +#include "rpc/parse_utils.h" #include "seastarx.h" +#include "serde/envelope.h" +#include "serde/serde.h" #include @@ -19,33 +23,41 @@ namespace cycling { struct ultimate_cf_slx { + using rpc_serde_exempt = std::true_type; int x = 42; }; struct nairo_quintana { + using rpc_serde_exempt = std::true_type; int x = 43; }; struct san_francisco { + using rpc_serde_exempt = std::true_type; int x = 44; }; struct mount_tamalpais { + using rpc_serde_exempt = std::true_type; int x = 45; }; } // namespace cycling namespace echo { struct echo_req { + using rpc_serde_exempt = std::true_type; ss::sstring str; }; struct echo_resp { + using rpc_serde_exempt = std::true_type; ss::sstring str; }; struct cnt_req { + using rpc_serde_exempt = std::true_type; uint64_t expected; }; struct cnt_resp { + using rpc_serde_exempt = std::true_type; uint64_t expected; uint64_t current; }; @@ -55,7 +67,177 @@ enum class failure_type { throw_exception, exceptional_future, none }; using throw_req = failure_type; struct throw_resp { + using rpc_serde_exempt = std::true_type; ss::sstring reply; }; +/* + * echo methods with req/resp that support encodings: + * - adl only + * - serde only + * - serde and adl + */ +struct echo_req_adl_only { + using rpc_serde_exempt = std::true_type; + ss::sstring str; +}; + +struct echo_resp_adl_only { + using rpc_serde_exempt = std::true_type; + ss::sstring str; +}; + +// an adl-only type should not have serde support +static_assert(!serde::is_serde_compatible_v); +static_assert(!serde::is_serde_compatible_v); + +// an adl-only type should not be exempt from adl support +static_assert(!rpc::is_rpc_adl_exempt); +static_assert(!rpc::is_rpc_adl_exempt); + +struct echo_req_adl_serde + : serde::envelope> { + ss::sstring str; + + void serde_write(iobuf& out) const { + // serialize with serde an adl-serde type + using serde::write; + write(out, 
str + "_to_sas"); + } + + void serde_read(iobuf_parser& in, const serde::header& h) { + // deserialize with serde an adl-serde type + using serde::read_nested; + str = read_nested(in, h._bytes_left_limit); + str += "_from_sas"; + } +}; + +struct echo_resp_adl_serde + : serde::envelope> { + ss::sstring str; + + void serde_write(iobuf& out) const { + // serialize with serde an adl-serde type + using serde::write; + write(out, str + "_to_sas"); + } + + void serde_read(iobuf_parser& in, const serde::header& h) { + // deserialize with serde an adl-serde type + using serde::read_nested; + str = read_nested(in, h._bytes_left_limit); + str += "_from_sas"; + } +}; + +static_assert(serde::is_serde_compatible_v); +static_assert(serde::is_serde_compatible_v); +static_assert(!rpc::is_rpc_adl_exempt); +static_assert(!rpc::is_rpc_adl_exempt); + +struct echo_req_serde_only + : serde::envelope> { + using rpc_adl_exempt = std::true_type; + ss::sstring str; + + void serde_write(iobuf& out) const { + // serialize with serde a serde-only type + using serde::write; + write(out, str + "_to_sso"); + } + + void serde_read(iobuf_parser& in, const serde::header& h) { + // deserialize with serde a serde-only type + using serde::read_nested; + str = read_nested(in, h._bytes_left_limit); + str += "_from_sso"; + } +}; + +struct echo_resp_serde_only + : serde::envelope> { + using rpc_adl_exempt = std::true_type; + ss::sstring str; + + void serde_write(iobuf& out) const { + // serialize with serde a serde-only type + using serde::write; + write(out, str + "_to_sso"); + } + + void serde_read(iobuf_parser& in, const serde::header& h) { + // deserialize with serde a serde-only type + using serde::read_nested; + str = read_nested(in, h._bytes_left_limit); + str += "_from_sso"; + } +}; + +// serde-only type needs to have serde support +static_assert(serde::is_serde_compatible_v); +static_assert(serde::is_serde_compatible_v); + +// serde-only type needs to be exempt from adl
+static_assert(rpc::is_rpc_adl_exempt); +static_assert(rpc::is_rpc_adl_exempt); + } // namespace echo + +namespace reflection { +template<> +struct adl { + void to(iobuf& out, echo::echo_req_adl_only&& r) { + // serialize with adl an adl-only type + reflection::serialize(out, r.str + "_to_aao"); + } + echo::echo_req_adl_only from(iobuf_parser& in) { + // deserialize with adl an adl-only type + return echo::echo_req_adl_only{ + .str = adl{}.from(in) + "_from_aao", + }; + } +}; + +template<> +struct adl { + void to(iobuf& out, echo::echo_resp_adl_only&& r) { + // serialize with adl an adl-only type + reflection::serialize(out, r.str + "_to_aao"); + } + echo::echo_resp_adl_only from(iobuf_parser& in) { + // deserialize with adl an adl-only type + return echo::echo_resp_adl_only{ + .str = adl{}.from(in) + "_from_aao", + }; + } +}; + +template<> +struct adl { + void to(iobuf& out, echo::echo_req_adl_serde&& r) { + // serialize with adl an adl-serde type + reflection::serialize(out, r.str + "_to_aas"); + } + echo::echo_req_adl_serde from(iobuf_parser& in) { + // deserialize with adl an adl-serde type + return echo::echo_req_adl_serde{ + .str = adl{}.from(in) + "_from_aas", + }; + } +}; + +template<> +struct adl { + void to(iobuf& out, echo::echo_resp_adl_serde&& r) { + // serialize with adl an adl-serde type + reflection::serialize(out, r.str + "_to_aas"); + } + echo::echo_resp_adl_serde from(iobuf_parser& in) { + // deserialize with adl an adl-serde type + return echo::echo_resp_adl_serde{ + .str = adl{}.from(in) + "_from_aas", + }; + } +}; +} // namespace reflection diff --git a/src/v/rpc/test/rpc_integration_fixture.h b/src/v/rpc/test/rpc_integration_fixture.h index 40bd817d3bafc..81d3c4976e5ec 100644 --- a/src/v/rpc/test/rpc_integration_fixture.h +++ b/src/v/rpc/test/rpc_integration_fixture.h @@ -140,6 +140,7 @@ class rpc_sharded_integration_fixture : public rpc_base_integration_fixture { ss::tls::reload_callback&& cb = {}) override { net::server_configuration 
scfg("unit_test_rpc_sharded"); scfg.disable_metrics = net::metrics_disabled::yes; + scfg.disable_public_metrics = net::public_metrics_disabled::yes; auto resolved = net::resolve_dns(_listen_address).get(); scfg.addrs.emplace_back( resolved, diff --git a/src/v/rpc/test/test_types.h b/src/v/rpc/test/test_types.h index a6e08af0ac290..2c4169bd03296 100644 --- a/src/v/rpc/test/test_types.h +++ b/src/v/rpc/test/test_types.h @@ -21,6 +21,7 @@ #include struct pod { + using rpc_serde_exempt = std::true_type; int16_t x = 1; int32_t y = 2; int64_t z = 3; diff --git a/src/v/rpc/transport.h b/src/v/rpc/transport.h index 7a5304ee19d97..abbdf6d078cd0 100644 --- a/src/v/rpc/transport.h +++ b/src/v/rpc/transport.h @@ -38,6 +38,9 @@ #include #include +class rpc_integration_fixture_oc_ns_adl_serde_no_upgrade; +class rpc_integration_fixture_oc_ns_adl_only_no_upgrade; + namespace rpc { struct client_context_impl; @@ -68,6 +71,8 @@ class transport final : public net::base_transport { void reset_state() final; + transport_version version() const { return _version; } + private: using sequence_t = named_type; struct entry { @@ -102,6 +107,18 @@ class transport final : public net::base_transport { requests_queue_t _requests_queue; sequence_t _seq; sequence_t _last_seq; + + /* + * version level used when dispatching requests. this value may change + * during the lifetime of the transport. for example the version may be + * upgraded if it is discovered that a server supports a newer version. 
+ */ + transport_version _version{transport_version::v1}; + + friend class ::rpc_integration_fixture_oc_ns_adl_serde_no_upgrade; + friend class ::rpc_integration_fixture_oc_ns_adl_only_no_upgrade; + void set_version(transport_version v) { _version = v; } + friend std::ostream& operator<<(std::ostream&, const transport&); }; @@ -124,38 +141,63 @@ inline errc map_server_error(status status) { template ss::future>> parse_result( - ss::input_stream& in, std::unique_ptr sctx) { + ss::input_stream& in, + std::unique_ptr sctx, + transport_version req_ver) { using ret_t = result>; - // check status first - auto st = static_cast(sctx->get_header().meta); - - // success case - if (st == status::success) { - return parse_type(in, sctx->get_header()) - .then_wrapped([sctx = std::move(sctx)](ss::future data_fut) { - if (data_fut.failed()) { - const auto ex = data_fut.get_exception(); - sctx->body_parse_exception(ex); - /** - * we want to throw an exception when body parsing failed. - * this will invalidate the connection since it may not be - * valid any more. - */ - std::rethrow_exception(ex); - } - sctx->signal_body_parse(); - return ret_t(rpc::client_context( - sctx->get_header(), std::move(data_fut.get()))); - }); - } - /** - * signal that request body is parsed since it is empty when status - * indicates server error. + const auto st = static_cast(sctx->get_header().meta); + const auto rep_ver = sctx->get_header().version; + + /* + * the reply version should always be the same as the request version, + * otherwise this is non-compliant behavior. the exception to this + * rule is a v0 reply to a v1 request (ie talking to old v0 server). 
*/ - sctx->signal_body_parse(); + const auto protocol_violation + = rep_ver != req_ver + && (req_ver != transport_version::v1 || rep_ver != transport_version::v0); + + if (unlikely(st != status::success || protocol_violation)) { + sctx->signal_body_parse(); + if (st == status::version_not_supported) { + /* + * let version_not_supported take precedence over error handling for + * protocol violations because the protocol violation may be due to + * the unsupported version scenario. + */ + return ss::make_ready_future(map_server_error(st)); + } + if (protocol_violation) { + vlog( + rpclog.warn, + "Protocol violation: request version {} incompatible with " + "reply version {}", + req_ver, + rep_ver); + } + if (st == status::success) { + return ss::make_ready_future(errc::service_error); + } + return ss::make_ready_future(map_server_error(st)); + } - return ss::make_ready_future(map_server_error(st)); + return parse_type(in, sctx->get_header()) + .then_wrapped([sctx = std::move(sctx)](ss::future data_fut) { + if (data_fut.failed()) { + const auto ex = data_fut.get_exception(); + sctx->body_parse_exception(ex); + /** + * we want to throw an exception when body parsing failed. + * this will invalidate the connection since it may not be + * valid any more. 
+ */ + std::rethrow_exception(ex); + } + sctx->signal_body_parse(); + return ret_t(rpc::client_context( + sctx->get_header(), std::move(data_fut.get()))); + }); } } // namespace internal @@ -165,7 +207,7 @@ inline ss::future>> transport::send_typed(Input r, uint32_t method_id, rpc::client_opts opts) { using ret_t = result>; return send_typed_versioned( - std::move(r), method_id, std::move(opts), transport_version::v0) + std::move(r), method_id, std::move(opts), _version) .then([](result> res) { if (!res) { return ss::make_ready_future(res.error()); @@ -182,6 +224,7 @@ transport::send_typed_versioned( rpc::client_opts opts, transport_version version) { using ret_t = result>; + using ctx_t = result>; _probe.request(); auto b = std::make_unique(); @@ -189,22 +232,49 @@ transport::send_typed_versioned( b->set_min_compression_bytes(opts.min_compression_bytes); auto raw_b = b.get(); raw_b->set_service_method_id(method_id); - raw_b->set_version(version); auto& target_buffer = raw_b->buffer(); auto seq = ++_seq; - return reflection::async_adl{} - .to(target_buffer, std::move(r)) - .then([this, b = std::move(b), seq, opts = std::move(opts)]() mutable { - return do_send(seq, std::move(*b.get()), std::move(opts)); + return encode_for_version(target_buffer, std::move(r), version) + .then([this, version, b = std::move(b), seq, opts = std::move(opts)]( + transport_version effective_version) mutable { + /* + * enforce the rule that a transport configured as v0 behaves like + * a v0 client transport and sends v0 messages. 
+ */ + vassert( + version != transport_version::v0 + || effective_version == transport_version::v0, + "Request type {} cannot be encoded at version {} (effective {}).", + typeid(Input).name(), + version, + effective_version); + b->set_version(effective_version); + return do_send(seq, std::move(*b.get()), std::move(opts)) + .then([effective_version](ctx_t ctx) { + return std::make_tuple(std::move(ctx), effective_version); + }); }) - .then([this](result> sctx) mutable { + .then_unpack([this](ctx_t sctx, transport_version req_ver) { if (!sctx) { return ss::make_ready_future(sctx.error()); } const auto version = sctx.value()->get_header().version; - return internal::parse_result(_in, std::move(sctx.value())) - .then([version](result> r) { + return internal::parse_result( + _in, std::move(sctx.value()), req_ver) + .then([this, version](result> r) { + /* + * upgrade transport to v2 when: + * - at version v1 (do not upgrade from v0 -- for testing) + * - the response was handled/contains no errors + * - the response is v1,v2 (from a new server) + */ + if ( + _version == transport_version::v1 && r.has_value() + && (version == transport_version::v1 || version == transport_version::v2)) { + vlog(rpclog.debug, "Upgrading connection from v1 to v2"); + _version = transport_version::v2; + } return ret_t(result_context{version, std::move(r)}); }); }); diff --git a/src/v/rpc/types.cc b/src/v/rpc/types.cc index 503a0d5aa0a7c..d048de021b512 100644 --- a/src/v/rpc/types.cc +++ b/src/v/rpc/types.cc @@ -67,12 +67,11 @@ std::ostream& operator<<(std::ostream& o, const status& s) { } std::ostream& operator<<(std::ostream& o, transport_version v) { - switch (v) { - case transport_version::v0: - return o << "rpc::transport_version::v0"; - case transport_version::unsupported: - return o << "rpc::transport_version::unsupported"; - } + fmt::print( + o, + "rpc::transport_version::v{}", + static_cast>(v)); + return o; } } // namespace rpc diff --git a/src/v/rpc/types.h b/src/v/rpc/types.h index 
de447992f5e1c..987cb08b6480d 100644 --- a/src/v/rpc/types.h +++ b/src/v/rpc/types.h @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -72,14 +73,27 @@ enum class status : uint32_t { }; enum class transport_version : uint8_t { + /* + * the first version used by rpc simple protocol. at this version level + * clients and servers (1) assume adl encoding, (2) ignore the version when + * handling a request, and (3) always respond with version 0. + */ v0 = 0, - max_supported = v0, + + /* + * starting with version v1 clients and servers no longer ignore the + * version. v1 indicates adl encoding and v2 indicates serde encoding. + */ + v1 = 1, + v2 = 2, + + max_supported = v2, /* * unsupported is a convenience name used in tests to construct a message * with an unsupported version. the bits should not be considered reserved. */ - unsupported = max_supported + 1, + unsupported = std::numeric_limits::max() }; /// \brief core struct for communications. sent with _each_ payload diff --git a/src/v/security/errc.h b/src/v/security/errc.h index af7ef3e1645d3..0373cfd90de40 100644 --- a/src/v/security/errc.h +++ b/src/v/security/errc.h @@ -8,6 +8,8 @@ * the Business Source License, use of this software will be governed * by the Apache License, Version 2.0 */ +#pragma once + #include "outcome.h" namespace security { diff --git a/src/v/security/mtls.cc b/src/v/security/mtls.cc index 5f5e7c465da5a..708fe612d4741 100644 --- a/src/v/security/mtls.cc +++ b/src/v/security/mtls.cc @@ -26,7 +26,7 @@ namespace detail { static constexpr const char* const rule_pattern{ R"((DEFAULT)|RULE:((\\.|[^\\/])*)\/((\\.|[^\\/])*)\/([LU]?).*?|(.*?))"}; static constexpr const char* const rule_pattern_splitter{ - R"(\s*((DEFAULT)|RULE:((\\.|[^\\/])*)\/((\\.|[^\\/])*)\/([LU]?).*?|(.*?))\s*(,\s*|$))"}; + R"(\s*((DEFAULT)|RULE:((\\.|[^\\/])*)\/((\\.|[^\\/])*)\/([LU]?).*?|(.*?))\s*([,\n]\s*|$))"}; std::regex make_regex(std::string_view sv) { return std::regex{ @@ -68,11 +68,15 @@ 
constexpr std::optional make_sv(const std::csub_match& sm) { : std::optional{std::nullopt}; } -std::vector parse_rules(std::optional unparsed_rules) { +std::vector +parse_rules(std::optional> unparsed_rules) { static const std::regex rule_splitter = make_regex(rule_pattern_splitter); static const std::regex rule_parser = make_regex(rule_pattern); - std::string_view rules{trim(unparsed_rules.value_or("DEFAULT"))}; + std::string rules + = unparsed_rules.has_value() ? fmt::format( + "{}", fmt::join(unparsed_rules->begin(), unparsed_rules->end(), ",")) + : "DEFAULT"; std::vector result; std::cmatch rules_match; @@ -147,6 +151,16 @@ std::optional rule::apply(std::string_view dn) const { return result; } +std::optional +validate_rules(const std::optional>& r) noexcept { + try { + security::tls::detail::parse_rules(r); + } catch (const std::exception& e) { + return e.what(); + } + return std::nullopt; +} + std::ostream& operator<<(std::ostream& os, const rule& r) { fmt::print(os, "{}", r); return os; @@ -157,6 +171,22 @@ std::ostream& operator<<(std::ostream& os, const principal_mapper& p) { return os; } +principal_mapper::principal_mapper( + config::binding>> cb) + : _binding(std::move(cb)) + , _rules{detail::parse_rules(_binding())} { + _binding.watch([this]() { _rules = detail::parse_rules(_binding()); }); +} + +std::optional principal_mapper::apply(std::string_view sv) const { + for (const auto& r : _rules) { + if (auto p = r.apply(sv); p.has_value()) { + return {std::move(p).value()}; + } + } + return std::nullopt; +} + } // namespace security::tls // explicit instantiations so as to avoid bringing in in the diff --git a/src/v/security/mtls.h b/src/v/security/mtls.h index 492f7d30dd719..56be6921aea98 100644 --- a/src/v/security/mtls.h +++ b/src/v/security/mtls.h @@ -11,6 +11,7 @@ #pragma once +#include "config/property.h" #include "seastarx.h" #include @@ -18,7 +19,6 @@ #include -#include #include #include #include @@ -53,25 +53,11 @@ class rule { make_upper 
_to_upper{false}; }; -namespace detail { - -std::vector parse_rules(std::optional unparsed_rules); - -} // namespace detail - class principal_mapper { public: - explicit principal_mapper(std::optional sv) - : _rules{detail::parse_rules(sv)} {} - - std::optional apply(std::string_view sv) const { - for (const auto& r : _rules) { - if (auto p = r.apply(sv); p.has_value()) { - return {std::move(p).value()}; - } - } - return std::nullopt; - } + explicit principal_mapper( + config::binding>> cb); + std::optional apply(std::string_view sv) const; private: friend struct fmt::formatter; @@ -79,9 +65,24 @@ class principal_mapper { friend std::ostream& operator<<(std::ostream& os, const principal_mapper& p); + config::binding>> _binding; std::vector _rules; }; +class mtls_state { +public: + explicit mtls_state(ss::sstring principal) + : _principal{std::move(principal)} {} + + const ss::sstring& principal() { return _principal; } + +private: + ss::sstring _principal; +}; + +std::optional +validate_rules(const std::optional>& r) noexcept; + } // namespace security::tls template<> diff --git a/src/v/security/tests/mtls_test.cc b/src/v/security/tests/mtls_test.cc index c69c544edd6a7..da3d2214fd90a 100644 --- a/src/v/security/tests/mtls_test.cc +++ b/src/v/security/tests/mtls_test.cc @@ -6,6 +6,7 @@ // As of the Change Date specified in that file, in accordance with // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0 +#include "config/property.h" #include "random/generators.h" #include "security/mtls.h" #include "utils/base64.h" @@ -33,7 +34,7 @@ namespace security::tls { namespace bdata = boost::unit_test::data; -std::array mtls_valid_rules{ +std::array mtls_valid_rules{ "DEFAULT", "RULE:^CN=(.*?),OU=ServiceUsers.*$/$1/", "RULE:^CN=(.*?),OU=ServiceUsers.*$/$1/L, DEFAULT", @@ -44,10 +45,11 @@ std::array mtls_valid_rules{ "RULE:^CN=([^,DEFAULT,]+)(,.*|$)/$1/"}; BOOST_DATA_TEST_CASE(test_mtls_valid_rules, 
bdata::make(mtls_valid_rules), c) { - BOOST_REQUIRE_NO_THROW(principal_mapper{c}); + BOOST_REQUIRE_NO_THROW(principal_mapper{ + config::mock_binding(std::optional>{{c}})}); } -std::array mtls_invalid_rules{ +std::array mtls_invalid_rules{ "default", "DEFAUL", "DEFAULT/L", @@ -61,7 +63,10 @@ std::array mtls_invalid_rules{ BOOST_DATA_TEST_CASE( test_mtls_invalid_rules, bdata::make(mtls_invalid_rules), c) { - BOOST_REQUIRE_THROW(principal_mapper{c}, std::runtime_error); + BOOST_REQUIRE_THROW( + principal_mapper{ + config::mock_binding(std::optional>{{c}})}, + std::runtime_error); } struct record { @@ -87,15 +92,16 @@ static std::array mtls_principal_mapper_data{ BOOST_DATA_TEST_CASE( test_mtls_principal_mapper, bdata::make(mtls_principal_mapper_data), c) { security::tls::principal_mapper mapper{ - "RULE:^CN=(.*?),OU=ServiceUsers.*$/$1/L, " - "RULE:^CN=(.*?),OU=(.*?),O=(.*?),L=(.*?),ST=(.*?),C=(.*?)$/$1@$2/L, " - "RULE:^cn=(.*?),ou=(.*?),dc=(.*?),dc=(.*?)$/$1@$2/U, " - "RULE:^.*[Cc][Nn]=([a-zA-Z0-9.]*).*$/$1/U, " - "DEFAULT"}; + config::mock_binding(std::optional>{ + {"RULE:^CN=(.*?),OU=ServiceUsers.*$/$1/L, " + "RULE:^CN=(.*?),OU=(.*?),O=(.*?),L=(.*?),ST=(.*?),C=(.*?)$/$1@$2/L, " + "RULE:^cn=(.*?),ou=(.*?),dc=(.*?),dc=(.*?)$/$1@$2/U, " + "RULE:^.*[Cc][Nn]=([a-zA-Z0-9.]*).*$/$1/U, " + "DEFAULT"}})}; BOOST_REQUIRE_EQUAL(c.expected, *mapper.apply(c.input)); } -static std::array mtls_rule_splitting_data{ +static std::array mtls_rule_splitting_data{ record{"[]", ""}, {"[DEFAULT]", "DEFAULT"}, {"[RULE:/]", "RULE://"}, @@ -118,18 +124,46 @@ static std::array mtls_rule_splitting_data{ "DEFAULT, /DEFAULT, DEFAULT]", "RULE:,RULE:,/,RULE:,\\//U,RULE:,/RULE:,/,RULE:,RULE:,/L,RULE:,/L,RULE:, " "DEFAULT, /DEFAULT/,DEFAULT"}, + {"[RULE:/, DEFAULT]", "RULE://\nDEFAULT"}, }; BOOST_DATA_TEST_CASE( test_mtls_rule_splitting, bdata::make(mtls_rule_splitting_data), c) { - BOOST_CHECK_EQUAL(c.expected, fmt::format("{}", principal_mapper(c.input))); + BOOST_CHECK_EQUAL( + c.expected, + 
fmt::format( + "{}", + principal_mapper(config::mock_binding( + std::optional>{{ss::sstring{c.input}}})))); } BOOST_AUTO_TEST_CASE(test_mtls_comma_with_whitespace) { BOOST_CHECK_EQUAL( "Tkac\\, Adam", - principal_mapper("RULE:^CN=((\\\\, *|\\w)+)(,.*|$)/$1/,DEFAULT") + principal_mapper( + config::mock_binding(std::optional>{ + {"RULE:^CN=((\\\\, *|\\w)+)(,.*|$)/$1/,DEFAULT"}})) .apply("CN=Tkac\\, Adam,OU=ITZ,DC=geodis,DC=cz") .value_or("")); } +BOOST_AUTO_TEST_CASE(test_mtls_parsing_with_multiline) { + BOOST_CHECK_EQUAL( + "test_cn", + principal_mapper( + config::mock_binding(std::optional>{ + {{"RULE:^OU=(.*)/$1/"}, {"RULE:^CN=(.*)/$1/"}}})) + .apply("CN=test_cn") + .value_or("")); +} + +BOOST_AUTO_TEST_CASE(test_mtls_parsing_with_newline) { + BOOST_CHECK_EQUAL( + "test_cn", + principal_mapper( + config::mock_binding(std::optional>{ + {"RULE:^OU=(.*)/$1/\nRULE:^CN=(.*)/$1/"}})) + .apply("CN=test_cn") + .value_or("")); +} + } // namespace security::tls diff --git a/src/v/serde/envelope.h b/src/v/serde/envelope.h index 83c8d24f606df..c4c02a960c04b 100644 --- a/src/v/serde/envelope.h +++ b/src/v/serde/envelope.h @@ -10,6 +10,7 @@ #pragma once #include +#include #include namespace serde { @@ -67,77 +68,17 @@ struct checksum_envelope { static constexpr auto redpanda_serde_build_checksum = true; }; -namespace detail { - -template -struct has_compat_attribute : std::false_type {}; - -template -struct has_compat_attribute< - T, - std::void_t().redpanda_serde_compat_version)>> - : std::true_type {}; - -template -struct has_version_attribute : std::false_type {}; - -template -struct has_version_attribute< - T, - std::void_t().redpanda_serde_version)>> - : std::true_type {}; - -template -struct inherits_from_envelope : std::false_type {}; - -template -struct inherits_from_envelope< - T, - std::void_t().redpanda_inherits_from_envelope)>> - : std::true_type {}; - -template -struct compat_version_has_serde_version_type { - static constexpr auto const value = std::is_same_v< - 
std::decay_t().redpanda_serde_compat_version)>, - version_t>; +template +concept is_envelope = requires { + { T::redpanda_serde_version } -> std::same_as; + { T::redpanda_serde_compat_version } -> std::same_as; }; template -struct version_has_serde_version_type { - static constexpr auto const value = std::is_same_v< - std::decay_t().redpanda_serde_version)>, - version_t>; -}; - -template -struct has_checksum_attribute : std::false_type {}; - -template -struct has_checksum_attribute< - T, - std::void_t().redpanda_serde_build_checksum)>> - : std::true_type {}; - -} // namespace detail - -template -inline constexpr auto const is_envelope_v = std::conjunction_v< - detail::has_compat_attribute, - detail::has_version_attribute, - detail::compat_version_has_serde_version_type, - detail::version_has_serde_version_type>; - -template -inline constexpr auto const is_checksum_envelope_v = std::conjunction_v< - detail::has_compat_attribute, - detail::has_version_attribute, - detail::compat_version_has_serde_version_type, - detail::version_has_serde_version_type, - detail::has_checksum_attribute>; +concept is_checksum_envelope + = is_envelope && T::redpanda_serde_build_checksum; template -inline constexpr auto const inherits_from_envelope_v - = detail::inherits_from_envelope::value; +concept inherits_from_envelope = T::redpanda_inherits_from_envelope; } // namespace serde diff --git a/src/v/serde/envelope_for_each_field.h b/src/v/serde/envelope_for_each_field.h index 772c4f12397f4..2fd696c96e2d2 100644 --- a/src/v/serde/envelope_for_each_field.h +++ b/src/v/serde/envelope_for_each_field.h @@ -19,31 +19,21 @@ namespace serde { namespace detail { -template -struct has_serde_fields : std::false_type {}; - -template -struct has_serde_fields< - T, - std::void_t>().serde_fields())>> - : std::true_type {}; - template -inline constexpr auto const has_serde_fields_v = has_serde_fields::value; +concept has_serde_fields = requires(T t) { + t.serde_fields(); +}; } // namespace detail 
-template< - typename T, - std::enable_if_t, void*> = nullptr> +template constexpr inline auto envelope_to_tuple(T&& t) { return t.serde_fields(); } -template< - typename T, - std::enable_if_t, void*> = nullptr> -constexpr inline auto envelope_to_tuple(T& t) { +template +requires(!detail::has_serde_fields) constexpr inline auto envelope_to_tuple( + T& t) { static_assert(std::is_aggregate_v); static_assert(std::is_standard_layout_v); static_assert(!std::is_polymorphic_v); @@ -212,11 +202,14 @@ constexpr inline auto envelope_to_tuple(T& t) { } } -template -inline auto envelope_for_each_field(T& t, Fn&& fn) -> std::enable_if_t< - !std::is_convertible_v())), bool>> { - static_assert(is_envelope_v>); - if constexpr (inherits_from_envelope_v>) { +template +concept check_for_more_fn = requires(Fn&& fn, int& f) { + { fn(f) } -> std::convertible_to; +}; + +template +inline auto envelope_for_each_field(T& t, Fn&& fn) { + if constexpr (inherits_from_envelope>) { std::apply( [&](auto&&... args) { (fn(args), ...); }, envelope_to_tuple(t)); } else { @@ -225,11 +218,9 @@ inline auto envelope_for_each_field(T& t, Fn&& fn) -> std::enable_if_t< } } -template -inline auto envelope_for_each_field(T& t, Fn&& fn) -> std::enable_if_t< - std::is_convertible_v())), bool>> { - static_assert(is_envelope_v>); - if constexpr (inherits_from_envelope_v>) { +template +inline auto envelope_for_each_field(T& t, Fn&& fn) { + if constexpr (inherits_from_envelope>) { std::apply( [&](auto&&... 
args) { (void)(fn(args) && ...); }, envelope_to_tuple(t)); diff --git a/src/v/serde/serde.h b/src/v/serde/serde.h index 4174ef5a53471..5fb9c76d00e7d 100644 --- a/src/v/serde/serde.h +++ b/src/v/serde/serde.h @@ -25,6 +25,7 @@ #include "utils/named_type.h" #include "vlog.h" +#include #include #include @@ -69,57 +70,26 @@ struct header { checksum_t _checksum; }; -template -struct help_has_serde_read : std::false_type {}; - -template -struct help_has_serde_read< - T, - std::void_t().serde_read( - std::declval>(), - std::declval
()))>> : std::true_type {}; - -template -inline constexpr auto const has_serde_read = help_has_serde_read::value; - -template -struct help_has_serde_write : std::false_type {}; - -template -struct help_has_serde_write< - T, - std::void_t().serde_write( - std::declval>()))>> : std::true_type {}; - template -inline constexpr auto const has_serde_write = help_has_serde_write::value; - -template -struct help_has_serde_async_read : std::false_type {}; - -template -struct help_has_serde_async_read< - T, - std::void_t().serde_async_read( - std::declval>(), - std::declval
()))>> : std::true_type {}; +concept has_serde_read = requires(T t, iobuf_parser& in, const header& h) { + t.serde_read(in, h); +}; template -inline constexpr auto const has_serde_async_read - = help_has_serde_async_read::value; - -template -struct help_has_serde_async_write : std::false_type {}; +concept has_serde_write = requires(T t, iobuf& out) { + t.serde_write(out); +}; template -struct help_has_serde_async_write< - T, - std::void_t().serde_async_write( - std::declval>()))>> : std::true_type {}; +concept has_serde_async_read + = requires(T t, iobuf_parser& in, const header& h) { + { t.serde_async_read(in, h) } -> seastar::Future; +}; template -inline constexpr auto const has_serde_async_write - = help_has_serde_async_write::value; +concept has_serde_async_write = requires(T t, iobuf& out) { + { t.serde_async_write(out) } -> seastar::Future; +}; using serde_enum_serialized_t = int32_t; @@ -159,7 +129,7 @@ inline constexpr bool is_absl_node_hash_map_v = is_absl_node_hash_map::value; template inline constexpr auto const is_serde_compatible_v - = is_envelope_v + = is_envelope || (std::is_scalar_v // && (!std::is_same_v || std::numeric_limits::is_iec559) && (!std::is_same_v || std::numeric_limits::is_iec559) @@ -190,14 +160,14 @@ void write(iobuf& out, T t) { static_assert(are_bytes_and_string_different); static_assert(has_serde_write || is_serde_compatible_v); - if constexpr (is_envelope_v) { + if constexpr (is_envelope) { write(out, Type::redpanda_serde_version); write(out, Type::redpanda_serde_compat_version); auto size_placeholder = out.reserve(sizeof(serde_size_t)); auto checksum_placeholder = iobuf::placeholder{}; - if constexpr (is_checksum_envelope_v) { + if constexpr (is_checksum_envelope) { checksum_placeholder = out.reserve(sizeof(checksum_t)); } @@ -218,7 +188,7 @@ void write(iobuf& out, T t) { size_placeholder.write( reinterpret_cast(&size), sizeof(serde_size_t)); - if constexpr (is_checksum_envelope_v) { + if constexpr (is_checksum_envelope) { auto 
crc = crc::crc32c{}; auto in = iobuf_const_parser{out}; in.skip(size_before); @@ -388,7 +358,7 @@ header read_header(iobuf_parser& in, std::size_t const bytes_left_limit) { auto const size = read_nested(in, bytes_left_limit); auto checksum = checksum_t{}; - if constexpr (is_checksum_envelope_v) { + if constexpr (is_checksum_envelope) { checksum = read_nested(in, bytes_left_limit); } @@ -443,10 +413,10 @@ void read_nested(iobuf_parser& in, T& t, std::size_t const bytes_left_limit) { static_assert(are_bytes_and_string_different); static_assert(has_serde_read || is_serde_compatible_v); - if constexpr (is_envelope_v) { + if constexpr (is_envelope) { auto const h = read_header(in, bytes_left_limit); - if constexpr (is_checksum_envelope_v) { + if constexpr (is_checksum_envelope) { auto const shared = in.share(in.bytes_left() - h._bytes_left_limit); auto read_only_in = iobuf_const_parser{shared}; auto crc = crc::crc32c{}; @@ -679,7 +649,7 @@ template ss::future> read_async(iobuf_parser& in) { return read_async_nested(in, 0).then([&](std::decay_t&& t) { if (likely(in.bytes_left() == 0)) { - return ss::make_ready_future>(t); + return ss::make_ready_future>(std::move(t)); } else { return ss::make_exception_future>( serde_exception{fmt_with_ctx( @@ -693,33 +663,40 @@ ss::future> read_async(iobuf_parser& in) { } template -ss::future<> write_async(iobuf& out, T const& t) { +ss::future<> write_async(iobuf& out, T t) { using Type = std::decay_t; - if constexpr (is_envelope_v && has_serde_async_write) { + if constexpr (is_envelope && has_serde_async_write) { write(out, Type::redpanda_serde_version); write(out, Type::redpanda_serde_compat_version); auto size_placeholder = out.reserve(sizeof(serde_size_t)); auto const size_before = out.size_bytes(); - return t.serde_async_write(out).then( - [&out, - size_before, - size_placeholder = std::move(size_placeholder)]() mutable { - auto const written_size = out.size_bytes() - size_before; - if (unlikely( - written_size > 
std::numeric_limits::max())) { - throw serde_exception{"envelope too big"}; - } - auto const size = ss::cpu_to_le( - static_cast(written_size)); - size_placeholder.write( - reinterpret_cast(&size), sizeof(serde_size_t)); - - return ss::make_ready_future<>(); + return ss::do_with( + std::move(t), + [&out, size_before, size_placeholder = std::move(size_placeholder)]( + T& t) mutable { + return t.serde_async_write(out).then( + [&out, + size_before, + size_placeholder = std::move(size_placeholder)]() mutable { + auto const written_size = out.size_bytes() - size_before; + if (unlikely( + written_size + > std::numeric_limits::max())) { + throw serde_exception{"envelope too big"}; + } + auto const size = ss::cpu_to_le( + static_cast(written_size)); + size_placeholder.write( + reinterpret_cast(&size), + sizeof(serde_size_t)); + + return ss::make_ready_future<>(); + }); }); } else { - write(out, t); + write(out, std::move(t)); return ss::make_ready_future<>(); } } diff --git a/src/v/serde/test/fuzz.cc b/src/v/serde/test/fuzz.cc index 91840047863ae..115dc184a9cf2 100644 --- a/src/v/serde/test/fuzz.cc +++ b/src/v/serde/test/fuzz.cc @@ -18,12 +18,7 @@ bool eq( return ((std::get(a) == std::get(b)) && ...); } -template< - typename T1, - typename T2, - typename std::enable_if_t< - serde::is_envelope_v && serde::is_envelope_v, - void*> = nullptr> +template bool operator==(T1 const& a, T2 const& b) { return eq( envelope_to_tuple(a), @@ -69,7 +64,7 @@ void init( data_gen& gen, std::index_sequence generations, int depth = 0) { - if constexpr (serde::is_envelope_v) { + if constexpr (serde::is_envelope) { ((std::apply( [&](auto&&... 
args) { (init(args, gen, generations, depth + 1), ...); diff --git a/src/v/serde/test/serde_test.cc b/src/v/serde/test/serde_test.cc index a631f6c672ce3..f8ce5abf50708 100644 --- a/src/v/serde/test/serde_test.cc +++ b/src/v/serde/test/serde_test.cc @@ -90,10 +90,10 @@ struct test_msg1_new_manual { }; struct not_an_envelope {}; -static_assert(!serde::is_envelope_v); -static_assert(serde::is_envelope_v); -static_assert(serde::inherits_from_envelope_v); -static_assert(!serde::inherits_from_envelope_v); +static_assert(!serde::is_envelope); +static_assert(serde::is_envelope); +static_assert(serde::inherits_from_envelope); +static_assert(!serde::inherits_from_envelope); static_assert(test_msg1::redpanda_serde_version == 4); static_assert(test_msg1::redpanda_serde_compat_version == 0); @@ -234,7 +234,7 @@ struct complex_msg : serde::envelope> { int32_t _x; }; -static_assert(serde::is_envelope_v); +static_assert(serde::is_envelope); SEASTAR_THREAD_TEST_CASE(complex_msg_test) { auto b = iobuf(); @@ -386,7 +386,7 @@ struct test_snapshot_header int32_t metadata_size; }; -static_assert(serde::is_envelope_v); +static_assert(serde::is_envelope); static_assert(serde::has_serde_async_read); static_assert(serde::has_serde_async_write); diff --git a/src/v/ssx/async-clear.h b/src/v/ssx/async-clear.h index 0ea00807aa1e3..e6d6391c01960 100644 --- a/src/v/ssx/async-clear.h +++ b/src/v/ssx/async-clear.h @@ -11,6 +11,8 @@ #pragma once +#include "seastarx.h" + #include #include @@ -28,7 +30,7 @@ namespace ssx { template class async_clear { public: - async_clear(absl::flat_hash_map& c) + explicit async_clear(absl::flat_hash_map& c) : _container(c) {} /** diff --git a/src/v/ssx/metrics.h b/src/v/ssx/metrics.h index 3858980c00284..4f3afd9ae8db0 100644 --- a/src/v/ssx/metrics.h +++ b/src/v/ssx/metrics.h @@ -34,4 +34,10 @@ inline ss::metrics::histogram report_default_histogram(const hdr_hist& hist) { num_buckets, first_value, log_base, scale); } +constexpr auto label_namespace = "redpanda"; + 
+inline ss::metrics::label make_namespaced_label(const seastar::sstring& name) { + return ss::metrics::label(ssx::sformat("{}_{}", label_namespace, name)); +} + } // namespace ssx::metrics diff --git a/src/v/storage/compacted_index.h b/src/v/storage/compacted_index.h index 6906243899134..05f9325328f6b 100644 --- a/src/v/storage/compacted_index.h +++ b/src/v/storage/compacted_index.h @@ -12,6 +12,7 @@ #pragma once #include "bytes/bytes.h" #include "model/fundamental.h" +#include "model/record_batch_types.h" #include #include @@ -20,6 +21,28 @@ namespace storage { // simple types shared among readers and writers +/** + * Type representing a record key prefixed with batch_type + */ +struct compaction_key : bytes { + explicit compaction_key(bytes b) + : bytes(std::move(b)) {} +}; + +inline compaction_key +prefix_with_batch_type(model::record_batch_type type, bytes_view key) { + auto bt_le = ss::cpu_to_le( + static_cast::type>(type)); + auto enriched_key = ss::uninitialized_string( + sizeof(bt_le) + key.size()); + auto out = enriched_key.begin(); + out = std::copy_n( + reinterpret_cast(&bt_le), sizeof(bt_le), out); + std::copy_n(key.begin(), key.size(), out); + + return compaction_key(std::move(enriched_key)); +} + struct compacted_index { static constexpr const size_t max_entry_size = size_t( std::numeric_limits::max()); @@ -40,12 +63,17 @@ struct compacted_index { self_compaction = 1U << 1U, }; struct footer { + // initial version of footer + static constexpr int8_t base_version = 0; + // introduced a key being a tuple of batch_type and the key content + static constexpr int8_t key_prefixed_with_batch_type = 1; + uint32_t size{0}; uint32_t keys{0}; footer_flags flags{0}; uint32_t crc{0}; // crc32 // version *must* be the last value - int8_t version{0}; + int8_t version{key_prefixed_with_batch_type}; friend std::ostream& operator<<(std::ostream& o, const compacted_index::footer& f) { @@ -55,14 +83,23 @@ struct compacted_index { } }; enum class recovery_state { - // 
happens during a crash - missing, - // needs rebuilding - when user 'touch' a file or during a crash - needsrebuild, - // already recovered - nothing to do - after a reboot - recovered, - // we need to compact next - nonrecovered + /** + * Index may be missing when either was deleted or not stored when + * redpanda crashed + */ + index_missing, + /** + * Index may needs a rebuild when it is corrupted + */ + index_needs_rebuild, + /** + * Segment is already compacted + */ + already_compacted, + /** + * Compaction index is recovered, ready to compaction + */ + index_recovered }; static constexpr size_t footer_size = sizeof(footer::size) + sizeof(footer::keys) @@ -71,18 +108,22 @@ struct compacted_index { + sizeof(footer::version); // for the readers and friends struct entry { - entry(entry_type t, bytes k, model::offset o, int32_t d) noexcept + entry( + entry_type t, compaction_key k, model::offset o, int32_t d) noexcept : type(t) , key(std::move(k)) , offset(o) , delta(d) {} entry_type type; - bytes key; + compaction_key key; model::offset offset; int32_t delta; }; }; + +std::ostream& operator<<(std::ostream&, compacted_index::recovery_state); + [[gnu::always_inline]] inline compacted_index::footer_flags operator|(compacted_index::footer_flags a, compacted_index::footer_flags b) { return compacted_index::footer_flags( diff --git a/src/v/storage/compacted_index_chunk_reader.cc b/src/v/storage/compacted_index_chunk_reader.cc index 0ca818ebe99a6..b113e1c5c9242 100644 --- a/src/v/storage/compacted_index_chunk_reader.cc +++ b/src/v/storage/compacted_index_chunk_reader.cc @@ -212,7 +212,8 @@ compacted_index_chunk_reader::load_slice(model::timeout_clock::time_point t) { auto type = reflection::adl{}.from(p); auto [offset, _1] = p.read_varlong(); auto [delta, _2] = p.read_varlong(); - auto key = p.read_bytes(p.bytes_left()); + auto bytes = p.read_bytes(p.bytes_left()); + auto key = compaction_key(std::move(bytes)); slice.push_back(compacted_index::entry( 
compacted_index::entry_type(type), std::move(key), diff --git a/src/v/storage/compacted_index_writer.h b/src/v/storage/compacted_index_writer.h index 75f54e594daef..73c6090be00e2 100644 --- a/src/v/storage/compacted_index_writer.h +++ b/src/v/storage/compacted_index_writer.h @@ -12,6 +12,7 @@ #pragma once #include "bytes/bytes.h" #include "model/fundamental.h" +#include "model/record_batch_types.h" #include "storage/compacted_index.h" #include "storage/types.h" @@ -56,18 +57,21 @@ class compacted_index_writer { impl& operator=(const impl&) = delete; virtual ss::future<> index( - bytes_view, // convert from bytes which is the key-type in map + const compaction_key&, // convert from bytes which is the key-type in + // map model::offset base_offset, int32_t offset_delta) = 0; virtual ss::future<> index( + model::record_batch_type, const iobuf& key, // default format in record batch model::offset base_offset, int32_t offset_delta) = 0; virtual ss::future<> index( + model::record_batch_type, bytes&& key, // default format in record batch model::offset base_offset, int32_t offset_delta) @@ -92,9 +96,13 @@ class compacted_index_writer { explicit compacted_index_writer(std::unique_ptr i) : _impl(std::move(i)) {} - ss::future<> index(bytes_view, model::offset, int32_t); - ss::future<> index(const iobuf& key, model::offset, int32_t); - ss::future<> index(bytes&&, model::offset, int32_t); + // accepts a compaction_key which is already prefixed with batch_type + ss::future<> index(const compaction_key& b, model::offset, int32_t); + + ss::future<> + index(model::record_batch_type, const iobuf& key, model::offset, int32_t); + ss::future<> + index(model::record_batch_type, bytes&&, model::offset, int32_t); ss::future<> append(compacted_index::entry); @@ -127,16 +135,22 @@ compacted_index_writer::release() && { return std::move(_impl); } inline ss::future<> compacted_index_writer::index( - const iobuf& b, model::offset base_offset, int32_t delta) { - return _impl->index(b, 
base_offset, delta); + model::record_batch_type batch_type, + const iobuf& b, + model::offset base_offset, + int32_t delta) { + return _impl->index(batch_type, b, base_offset, delta); } inline ss::future<> compacted_index_writer::index( - bytes_view b, model::offset base_offset, int32_t delta) { + const compaction_key& b, model::offset base_offset, int32_t delta) { return _impl->index(b, base_offset, delta); } inline ss::future<> compacted_index_writer::index( - bytes&& b, model::offset base_offset, int32_t delta) { - return _impl->index(std::move(b), base_offset, delta); + model::record_batch_type batch_type, + bytes&& b, + model::offset base_offset, + int32_t delta) { + return _impl->index(batch_type, std::move(b), base_offset, delta); } inline ss::future<> compacted_index_writer::truncate(model::offset o) { return _impl->truncate(o); diff --git a/src/v/storage/compaction_reducers.cc b/src/v/storage/compaction_reducers.cc index 6ac4eb8836528..391fb21204083 100644 --- a/src/v/storage/compaction_reducers.cc +++ b/src/v/storage/compaction_reducers.cc @@ -97,8 +97,7 @@ index_filtered_copy_reducer::operator()(compacted_index::entry&& e) { const bool should_add = _bm.contains(_natural_index); ++_natural_index; if (should_add) { - bytes_view bv = e.key; - return _writer->index(bv, e.offset, e.delta) + return _writer->index(e.key, e.offset, e.delta) .then([k = std::move(e.key)] { return ss::make_ready_future(stop_t::no); }); @@ -118,12 +117,7 @@ std::optional copy_data_segment_reducer::filter(model::record_batch&& batch) { // do not compact raft configuration and archival metadata as they shift // offset translation - if ( - batch.header().type == model::record_batch_type::raft_configuration - || batch.header().type == model::record_batch_type::archival_metadata - || batch.header().type == model::record_batch_type::group_abort_tx - || batch.header().type == model::record_batch_type::group_commit_tx - || batch.header().type == model::record_batch_type::group_prepare_tx) { 
+ if (!is_compactible(batch)) { return std::move(batch); } @@ -299,8 +293,9 @@ index_rebuilder_reducer::operator()(model::record_batch&& b) { ss::future<> index_rebuilder_reducer::do_index(model::record_batch&& b) { return ss::do_with(std::move(b), [this](model::record_batch& b) { return model::for_each_record( - b, [this, o = b.base_offset()](model::record& r) { - return _w->index(r.key(), o, r.offset_delta()); + b, + [this, bt = b.header().type, o = b.base_offset()](model::record& r) { + return _w->index(bt, r.key(), o, r.offset_delta()); }); }); } diff --git a/src/v/storage/ntp_config.h b/src/v/storage/ntp_config.h index 4a0e6113a0570..b9e205840e366 100644 --- a/src/v/storage/ntp_config.h +++ b/src/v/storage/ntp_config.h @@ -47,6 +47,8 @@ class ntp_config { model::shadow_indexing_mode shadow_indexing_mode = model::shadow_indexing_mode::disabled; + std::optional read_replica; + friend std::ostream& operator<<(std::ostream&, const default_overrides&); }; @@ -150,6 +152,11 @@ class ntp_config { && model::is_fetch_enabled(_overrides->shadow_indexing_mode); } + bool is_read_replica_mode_enabled() const { + return _overrides != nullptr && _overrides->read_replica + && _overrides->read_replica.value(); + } + private: model::ntp _ntp; /// \brief currently this is the basedir. In the future diff --git a/src/v/storage/segment.cc b/src/v/storage/segment.cc index 21f5f63e5be49..28cd1290bcaea 100644 --- a/src/v/storage/segment.cc +++ b/src/v/storage/segment.cc @@ -179,7 +179,7 @@ ss::future<> segment::release_appender(readers_cache* readers_cache) { * An exception safe variant of try write lock is simulated since seastar * does not have such primitives available on the semaphore. The fast path * of try_write_lock is combined with immediately releasing the lock (which - * will not also not signal any waiters--there cannot be any!) to guarnatee + * will not also not signal any waiters--there cannot be any!) 
to guarantee * that the blocking get_units version will find the lock uncontested. * * TODO: we should upstream get_units try-variants for semaphore and rwlock. @@ -357,14 +357,21 @@ ss::future<> segment::do_compaction_index_batch(const model::record_batch& b) { vassert(!b.compressed(), "wrong method. Call compact_index_batch. {}", b); auto& w = compaction_index(); return model::for_each_record( - b, [o = b.base_offset(), &w](const model::record& r) { - return w.index(r.key(), o, r.offset_delta()); + b, + [o = b.base_offset(), batch_type = b.header().type, &w]( + const model::record& r) { + return w.index(batch_type, r.key(), o, r.offset_delta()); }); } ss::future<> segment::compaction_index_batch(const model::record_batch& b) { if (!has_compaction_index()) { return ss::now(); } + // do not index not compactible batches + if (!internal::is_compactible(b)) { + return ss::now(); + } + if (!b.compressed()) { return do_compaction_index_batch(b); } @@ -447,7 +454,7 @@ ss::future segment::append(const model::record_batch& b) { auto index_err = std::move(index_fut).get_exception(); vlog( stlog.error, - "segment::append index: {}. ignorning append: {}", + "segment::append index: {}. 
ignoring append: {}", index_err, ret); return ss::make_exception_future(index_err); diff --git a/src/v/storage/segment_utils.cc b/src/v/storage/segment_utils.cc index df6eec938b835..d7056f9eab77e 100644 --- a/src/v/storage/segment_utils.cc +++ b/src/v/storage/segment_utils.cc @@ -54,31 +54,6 @@ #include #include -template<> -struct fmt::formatter { - using recovery_state = storage::compacted_index::recovery_state; - constexpr auto parse(format_parse_context& ctx) { return ctx.end(); } - template - auto format(const recovery_state& s, FormatContext& ctx) const { - std::string_view str = "unknown"; - switch (s) { - case recovery_state::missing: - str = "missing"; - break; - case recovery_state::needsrebuild: - str = "needsrebuild"; - break; - case recovery_state::recovered: - str = "recovered"; - break; - case recovery_state::nonrecovered: - str = "nonrecovered"; - break; - } - return format_to(ctx.out(), "{}", str); - } -}; - namespace storage::internal { using namespace storage; // NOLINT @@ -306,9 +281,16 @@ ss::future do_detect_compaction_index_state( .then([reader]() mutable { return reader.load_footer(); }) .then([](compacted_index::footer footer) { if (bool(footer.flags & flags::self_compaction)) { - return compacted_index::recovery_state::recovered; + return compacted_index::recovery_state::already_compacted; + } + // if we deal with old version of index that is not yet + // compacted request a rebuild + if ( + footer.version + < compacted_index::footer::key_prefixed_with_batch_type) { + return compacted_index::recovery_state::index_needs_rebuild; } - return compacted_index::recovery_state::nonrecovered; + return compacted_index::recovery_state::index_recovered; }) .finally([reader]() mutable { return reader.close(); }); }) @@ -318,7 +300,7 @@ ss::future do_detect_compaction_index_state( "detected error while attempting recovery, {}. marking as 'needs " "rebuild'. 
Common situation during crashes or hard shutdowns.", e); - return compacted_index::recovery_state::needsrebuild; + return compacted_index::recovery_state::index_needs_rebuild; }); } @@ -329,7 +311,7 @@ detect_compaction_index_state(std::filesystem::path p, compaction_config cfg) { return do_detect_compaction_index_state(p, cfg); } return ss::make_ready_future( - compacted_index::recovery_state::missing); + compacted_index::recovery_state::index_missing); }); } @@ -560,20 +542,20 @@ ss::future self_compact_segment( compacted_index::recovery_state state) mutable { vlog(gclog.trace, "segment {} compaction state: {}", idx_path, state); switch (state) { - case compacted_index::recovery_state::recovered: { + case compacted_index::recovery_state::already_compacted: { vlog(gclog.debug, "detected {} is already compacted", idx_path); return ss::make_ready_future(s->size_bytes()); } - case compacted_index::recovery_state::nonrecovered: + case compacted_index::recovery_state::index_recovered: return do_self_compact_segment( s, cfg, pb, readers_cache, resources) .then([before = s->size_bytes(), &pb](size_t sz_after) { pb.segment_compacted(); return compaction_result(before, sz_after); }); - case compacted_index::recovery_state::missing: + case compacted_index::recovery_state::index_missing: [[fallthrough]]; - case compacted_index::recovery_state::needsrebuild: { + case compacted_index::recovery_state::index_needs_rebuild: { vlog(gclog.info, "Rebuilding index file... 
({})", idx_path); pb.corrupted_compaction_index(); return s->read_lock() diff --git a/src/v/storage/segment_utils.h b/src/v/storage/segment_utils.h index 711af52bda1a1..233d343fd3db8 100644 --- a/src/v/storage/segment_utils.h +++ b/src/v/storage/segment_utils.h @@ -194,4 +194,13 @@ struct clean_segment_value ss::sstring segment_name; }; +inline bool is_compactible(const model::record_batch& b) { + return !( + b.header().type == model::record_batch_type::raft_configuration + || b.header().type == model::record_batch_type::archival_metadata + || b.header().type == model::record_batch_type::group_abort_tx + || b.header().type == model::record_batch_type::group_commit_tx + || b.header().type == model::record_batch_type::group_prepare_tx); +} + } // namespace storage::internal diff --git a/src/v/storage/spill_key_index.cc b/src/v/storage/spill_key_index.cc index 5cd1b8536019a..0feaf59bfdbf6 100644 --- a/src/v/storage/spill_key_index.cc +++ b/src/v/storage/spill_key_index.cc @@ -12,6 +12,7 @@ #include "bytes/bytes.h" #include "random/generators.h" #include "reflection/adl.h" +#include "storage/compacted_index.h" #include "storage/compacted_index_writer.h" #include "storage/logger.h" #include "storage/segment_utils.h" @@ -68,8 +69,8 @@ spill_key_index::~spill_key_index() { _midx.size()); } -ss::future<> -spill_key_index::index(bytes_view v, model::offset base_offset, int32_t delta) { +ss::future<> spill_key_index::index( + const compaction_key& v, model::offset base_offset, int32_t delta) { if (auto it = _midx.find(v); it != _midx.end()) { auto& pair = it->second; if (base_offset > pair.base_offset) { @@ -79,10 +80,10 @@ spill_key_index::index(bytes_view v, model::offset base_offset, int32_t delta) { return ss::now(); } // not found - return add_key(bytes(v), value_type{base_offset, delta}); + return add_key(v, value_type{base_offset, delta}); } -ss::future<> spill_key_index::add_key(bytes b, value_type v) { +ss::future<> spill_key_index::add_key(compaction_key b, 
value_type v) { auto f = ss::now(); auto const key_size = b.size(); auto const expected_size = idx_mem_usage() + _keys_mem_usage + key_size; @@ -121,9 +122,13 @@ ss::future<> spill_key_index::add_key(bytes b, value_type v) { }); } -ss::future<> -spill_key_index::index(bytes&& b, model::offset base_offset, int32_t delta) { - if (auto it = _midx.find(b); it != _midx.end()) { +ss::future<> spill_key_index::index( + model::record_batch_type batch_type, + bytes&& b, + model::offset base_offset, + int32_t delta) { + auto key = prefix_with_batch_type(batch_type, b); + if (auto it = _midx.find(key); it != _midx.end()) { auto& pair = it->second; // must use both base+delta, since we only want to keep the latest // which might be inserted into the batch multiple times by client @@ -136,11 +141,15 @@ spill_key_index::index(bytes&& b, model::offset base_offset, int32_t delta) { return ss::now(); } // not found - return add_key(std::move(b), value_type{base_offset, delta}); + return add_key(std::move(key), value_type{base_offset, delta}); } ss::future<> spill_key_index::index( - const iobuf& key, model::offset base_offset, int32_t delta) { + model::record_batch_type batch_type, + const iobuf& key, + model::offset base_offset, + int32_t delta) { return index( + batch_type, iobuf_to_bytes(key), // makes a copy, but we need deterministic keys base_offset, delta); diff --git a/src/v/storage/spill_key_index.h b/src/v/storage/spill_key_index.h index d4939d04e4448..20f64a213a45c 100644 --- a/src/v/storage/spill_key_index.h +++ b/src/v/storage/spill_key_index.h @@ -14,6 +14,7 @@ #include "hashing/crc32c.h" #include "hashing/xx.h" #include "model/fundamental.h" +#include "model/record_batch_types.h" #include "storage/compacted_index.h" #include "storage/compacted_index_writer.h" #include "storage/segment_appender.h" @@ -38,7 +39,7 @@ class spill_key_index final : public compacted_index_writer::impl { static constexpr size_t max_key_size = compacted_index::max_entry_size - (2 * 
vint::max_length); using underlying_t = absl::node_hash_map< - bytes, + compaction_key, value_type, bytes_hasher, bytes_type_eq>; @@ -66,9 +67,11 @@ class spill_key_index final : public compacted_index_writer::impl { ss::future<> maybe_open(); ss::future<> open(); - ss::future<> index(const iobuf& key, model::offset, int32_t) final; - ss::future<> index(bytes_view, model::offset, int32_t) final; - ss::future<> index(bytes&&, model::offset, int32_t) final; + ss::future<> index( + model::record_batch_type, const iobuf& key, model::offset, int32_t) final; + ss::future<> index(const compaction_key& b, model::offset, int32_t) final; + ss::future<> + index(model::record_batch_type, bytes&&, model::offset, int32_t) final; ss::future<> truncate(model::offset) final; ss::future<> append(compacted_index::entry) final; ss::future<> close() final; @@ -88,7 +91,7 @@ class spill_key_index final : public compacted_index_writer::impl { return debug::AllocatedByteSize(_midx); } ss::future<> drain_all_keys(); - ss::future<> add_key(bytes b, value_type); + ss::future<> add_key(compaction_key, value_type); ss::future<> spill(compacted_index::entry_type, bytes_view, value_type); storage::debug_sanitize_files _debug; diff --git a/src/v/storage/tests/compaction_idx_bench.cc b/src/v/storage/tests/compaction_idx_bench.cc index 6afeae1cf6569..950613db78faf 100644 --- a/src/v/storage/tests/compaction_idx_bench.cc +++ b/src/v/storage/tests/compaction_idx_bench.cc @@ -30,7 +30,10 @@ PERF_TEST_F(reducer_bench, compaction_key_reducer_test) { auto key = random_generators::get_bytes(20); storage::compacted_index::entry entry( - storage::compacted_index::entry_type::key, std::move(key), o, 0); + storage::compacted_index::entry_type::key, + storage::compaction_key(std::move(key)), + o, + 0); perf_tests::start_measuring_time(); return reducer(std::move(entry)).discard_result().finally([] { diff --git a/src/v/storage/tests/compaction_index_format_tests.cc 
b/src/v/storage/tests/compaction_index_format_tests.cc index 16c931a662fc3..594c720e30362 100644 --- a/src/v/storage/tests/compaction_index_format_tests.cc +++ b/src/v/storage/tests/compaction_index_format_tests.cc @@ -24,13 +24,10 @@ #include -class - - storage::compacted_index_writer - make_dummy_compacted_index( - tmpbuf_file::store_t& index_data, - size_t max_mem, - storage::storage_resources& resources) { +storage::compacted_index_writer make_dummy_compacted_index( + tmpbuf_file::store_t& index_data, + size_t max_mem, + storage::storage_resources& resources) { auto f = ss::file(ss::make_shared(tmpbuf_file(index_data))); return storage::compacted_index_writer( std::make_unique( @@ -41,16 +38,52 @@ struct compacted_topic_fixture { storage::storage_resources resources; }; +model::record_batch_type random_batch_type() { + return random_generators::random_choice( + std::vector{ + model::record_batch_type::raft_data, + model::record_batch_type::raft_configuration, + model::record_batch_type::controller, + model::record_batch_type::kvstore, + model::record_batch_type::checkpoint, + model::record_batch_type::topic_management_cmd, + model::record_batch_type::ghost_batch, + model::record_batch_type::id_allocator, + model::record_batch_type::tx_prepare, + model::record_batch_type::tx_fence, + model::record_batch_type::tm_update, + model::record_batch_type::user_management_cmd, + model::record_batch_type::acl_management_cmd, + model::record_batch_type::group_prepare_tx, + model::record_batch_type::group_commit_tx, + model::record_batch_type::group_abort_tx, + model::record_batch_type::node_management_cmd, + model::record_batch_type::data_policy_management_cmd, + model::record_batch_type::archival_metadata, + model::record_batch_type::cluster_config_cmd, + model::record_batch_type::feature_update, + }); +} + +bytes extract_record_key(bytes prefixed_key) { + size_t sz = prefixed_key.size() - 1; + auto read_key = ss::uninitialized_string(sz); + + 
std::copy_n(prefixed_key.begin() + 1, sz, read_key.begin()); + return read_key; +} + FIXTURE_TEST(format_verification, compacted_topic_fixture) { tmpbuf_file::store_t index_data; auto idx = make_dummy_compacted_index(index_data, 1_KiB, resources); const auto key = random_generators::get_bytes(1024); - idx.index(key, model::offset(42), 66).get(); + auto bt = random_batch_type(); + idx.index(bt, bytes(key), model::offset(42), 66).get(); idx.close().get(); info("{}", idx); iobuf data = std::move(index_data).release_iobuf(); - BOOST_REQUIRE_EQUAL(data.size_bytes(), 1047); + BOOST_REQUIRE_EQUAL(data.size_bytes(), 1048); iobuf_parser p(data.share(0, data.size_bytes())); (void)p.consume_type(); // SIZE (void)p.consume_type(); // TYPE @@ -58,30 +91,36 @@ FIXTURE_TEST(format_verification, compacted_topic_fixture) { BOOST_REQUIRE_EQUAL(model::offset(offset), model::offset(42)); auto [delta, _2] = p.read_varlong(); BOOST_REQUIRE_EQUAL(delta, 66); - const auto key_result = p.read_bytes(1024); - BOOST_REQUIRE_EQUAL(key, key_result); + const auto key_result = p.read_bytes(1025); + + auto read_key = extract_record_key(key_result); + BOOST_REQUIRE_EQUAL(key, read_key); auto footer = reflection::adl{}.from(p); info("{}", footer); BOOST_REQUIRE_EQUAL(footer.keys, 1); BOOST_REQUIRE_EQUAL( footer.size, - sizeof(uint16_t) - + 1 /*type*/ + 1 /*offset*/ + 2 /*delta*/ + 1024 /*key*/); - BOOST_REQUIRE_EQUAL(footer.version, 0); + sizeof(uint16_t) + 1 /*type*/ + 1 /*offset*/ + 2 /*delta*/ + + 1 /*batch_type*/ + 1024 /*key*/); + BOOST_REQUIRE_EQUAL( + footer.version, + storage::compacted_index::footer::key_prefixed_with_batch_type); BOOST_REQUIRE(footer.crc != 0); } FIXTURE_TEST(format_verification_max_key, compacted_topic_fixture) { tmpbuf_file::store_t index_data; auto idx = make_dummy_compacted_index(index_data, 1_MiB, resources); const auto key = random_generators::get_bytes(1_MiB); - idx.index(key, model::offset(42), 66).get(); + auto bt = random_batch_type(); + idx.index(bt, bytes(key), 
model::offset(42), 66).get(); idx.close().get(); info("{}", idx); /** * Length of an entry is equal to * - * max_key_size + sizeof(uint8_t) + sizeof(uint16_t) + vint(42) + vint(66) + * max_key_size + sizeof(uint8_t) + sizeof(uint16_t) + vint(42) + + * vint(66) */ iobuf data = std::move(index_data).release_iobuf(); @@ -104,7 +143,8 @@ FIXTURE_TEST(format_verification_roundtrip, compacted_topic_fixture) { tmpbuf_file::store_t index_data; auto idx = make_dummy_compacted_index(index_data, 1_MiB, resources); const auto key = random_generators::get_bytes(20); - idx.index(key, model::offset(42), 66).get(); + auto bt = random_batch_type(); + idx.index(bt, bytes(key), model::offset(42), 66).get(); idx.close().get(); info("{}", idx); @@ -115,20 +155,23 @@ FIXTURE_TEST(format_verification_roundtrip, compacted_topic_fixture) { 32_KiB); auto footer = rdr.load_footer().get0(); BOOST_REQUIRE_EQUAL(footer.keys, 1); - BOOST_REQUIRE_EQUAL(footer.version, 0); + BOOST_REQUIRE_EQUAL( + footer.version, + storage::compacted_index::footer::key_prefixed_with_batch_type); BOOST_REQUIRE(footer.crc != 0); auto vec = compaction_index_reader_to_memory(std::move(rdr)).get0(); BOOST_REQUIRE_EQUAL(vec.size(), 1); BOOST_REQUIRE_EQUAL(vec[0].offset, model::offset(42)); BOOST_REQUIRE_EQUAL(vec[0].delta, 66); - BOOST_REQUIRE_EQUAL(vec[0].key, key); + BOOST_REQUIRE_EQUAL(extract_record_key(vec[0].key), key); } FIXTURE_TEST( format_verification_roundtrip_exceeds_capacity, compacted_topic_fixture) { tmpbuf_file::store_t index_data; auto idx = make_dummy_compacted_index(index_data, 1_MiB, resources); const auto key = random_generators::get_bytes(1_MiB); - idx.index(key, model::offset(42), 66).get(); + auto bt = random_batch_type(); + idx.index(bt, bytes(key), model::offset(42), 66).get(); idx.close().get(); info("{}", idx); @@ -139,7 +182,9 @@ FIXTURE_TEST( 32_KiB); auto footer = rdr.load_footer().get0(); BOOST_REQUIRE_EQUAL(footer.keys, 1); - BOOST_REQUIRE_EQUAL(footer.version, 0); + BOOST_REQUIRE_EQUAL( 
+ footer.version, + storage::compacted_index::footer::key_prefixed_with_batch_type); BOOST_REQUIRE(footer.crc != 0); auto vec = compaction_index_reader_to_memory(std::move(rdr)).get0(); BOOST_REQUIRE_EQUAL(vec.size(), 1); @@ -147,7 +192,8 @@ FIXTURE_TEST( BOOST_REQUIRE_EQUAL(vec[0].delta, 66); auto max_sz = storage::internal::spill_key_index::max_key_size; BOOST_REQUIRE_EQUAL(vec[0].key.size(), max_sz); - BOOST_REQUIRE_EQUAL(vec[0].key, bytes_view(key.data(), max_sz)); + BOOST_REQUIRE_EQUAL( + extract_record_key(vec[0].key), bytes_view(key.data(), max_sz - 1)); } FIXTURE_TEST(key_reducer_no_truncate_filter, compacted_topic_fixture) { @@ -157,6 +203,7 @@ FIXTURE_TEST(key_reducer_no_truncate_filter, compacted_topic_fixture) { const auto key1 = random_generators::get_bytes(1_KiB); const auto key2 = random_generators::get_bytes(1_KiB); + auto bt = random_batch_type(); for (auto i = 0; i < 100; ++i) { bytes_view put_key; if (i % 2) { @@ -164,7 +211,7 @@ FIXTURE_TEST(key_reducer_no_truncate_filter, compacted_topic_fixture) { } else { put_key = key2; } - idx.index(put_key, model::offset(i), 0).get(); + idx.index(bt, bytes(put_key), model::offset(i), 0).get(); } idx.close().get(); info("{}", idx); @@ -197,6 +244,7 @@ FIXTURE_TEST(key_reducer_max_mem, compacted_topic_fixture) { const auto key1 = random_generators::get_bytes(1_KiB); const auto key2 = random_generators::get_bytes(1_KiB); + auto bt = random_batch_type(); for (auto i = 0; i < 100; ++i) { bytes_view put_key; if (i % 2) { @@ -204,7 +252,7 @@ FIXTURE_TEST(key_reducer_max_mem, compacted_topic_fixture) { } else { put_key = key2; } - idx.index(put_key, model::offset(i), 0).get(); + idx.index(bt, bytes(put_key), model::offset(i), 0).get(); } idx.close().get(); info("{}", idx); @@ -262,6 +310,7 @@ FIXTURE_TEST(index_filtered_copy_tests, compacted_topic_fixture) { const auto key1 = random_generators::get_bytes(128_KiB); const auto key2 = random_generators::get_bytes(1_KiB); + auto bt = random_batch_type(); for (auto i = 
0; i < 100; ++i) { bytes_view put_key; if (i % 2) { @@ -269,7 +318,7 @@ FIXTURE_TEST(index_filtered_copy_tests, compacted_topic_fixture) { } else { put_key = key2; } - idx.index(put_key, model::offset(i), 0).get(); + idx.index(bt, bytes(put_key), model::offset(i), 0).get(); } idx.close().get(); info("{}", idx); diff --git a/src/v/storage/tests/storage_e2e_test.cc b/src/v/storage/tests/storage_e2e_test.cc index b7b9237449f19..71a540dcde785 100644 --- a/src/v/storage/tests/storage_e2e_test.cc +++ b/src/v/storage/tests/storage_e2e_test.cc @@ -2086,3 +2086,108 @@ FIXTURE_TEST(test_querying_term_last_offset, storage_test_fixture) { BOOST_REQUIRE(!log.get_term_last_offset(model::term_id(0)).has_value()); } + +void write_batch( + storage::log log, + ss::sstring key, + int value, + model::record_batch_type batch_type) { + storage::record_batch_builder builder(batch_type, model::offset(0)); + + builder.add_raw_kv(serde::to_iobuf(std::move(key)), serde::to_iobuf(value)); + + auto batch = std::move(builder).build(); + batch.set_term(model::term_id(0)); + auto reader = model::make_memory_record_batch_reader({std::move(batch)}); + storage::log_append_config cfg{ + .should_fsync = storage::log_append_config::fsync::no, + .io_priority = ss::default_priority_class(), + .timeout = model::no_timeout, + }; + + std::move(reader).for_each_ref(log.make_appender(cfg), cfg.timeout).get0(); +} + +absl::flat_hash_map, int> +compact_in_memory(storage::log log) { + auto rdr = log + .make_reader(storage::log_reader_config( + model::offset(0), + model::offset::max(), + ss::default_priority_class())) + .get(); + + absl::flat_hash_map, int> + ret; + auto batches = model::consume_reader_to_memory( + std::move(rdr), model::no_timeout) + .get(); + + for (auto& b : batches) { + b.for_each_record([&ret, bt = b.header().type](model::record r) { + auto k = std::make_pair( + bt, serde::from_iobuf(r.key().copy())); + ret.insert_or_assign(k, serde::from_iobuf(r.value().copy())); + }); + } + + return ret; 
+} + +FIXTURE_TEST(test_compacting_batches_of_different_types, storage_test_fixture) { + auto cfg = default_log_config(test_dir); + cfg.max_compacted_segment_size = config::mock_binding(100_MiB); + cfg.stype = storage::log_config::storage_type::disk; + cfg.cache = storage::with_cache::no; + storage::ntp_config::default_overrides overrides; + overrides.cleanup_policy_bitflags + = model::cleanup_policy_bitflags::compaction; + + ss::abort_source as; + storage::log_manager mgr = make_log_manager(cfg); + auto deferred = ss::defer([&mgr]() mutable { mgr.stop().get0(); }); + auto ntp = model::ntp("default", "test", 0); + auto log = mgr + .manage(storage::ntp_config( + ntp, + mgr.config().base_dir, + std::make_unique( + overrides))) + .get0(); + + auto disk_log = get_disk_log(log); + + // the same key but three different batch types + write_batch(log, "key_1", 1, model::record_batch_type::raft_data); + write_batch(log, "key_1", 10, model::record_batch_type::tm_update); + write_batch(log, "key_1", 100, model::record_batch_type::tx_fence); + + write_batch(log, "key_1", 2, model::record_batch_type::raft_data); + write_batch(log, "key_1", 3, model::record_batch_type::raft_data); + write_batch(log, "key_1", 4, model::record_batch_type::raft_data); + + write_batch(log, "key_1", 20, model::record_batch_type::tm_update); + write_batch(log, "key_1", 30, model::record_batch_type::tm_update); + write_batch(log, "key_1", 40, model::record_batch_type::tm_update); + + write_batch(log, "key_1", 200, model::record_batch_type::tm_update); + write_batch(log, "key_1", 300, model::record_batch_type::tm_update); + write_batch(log, "key_1", 400, model::record_batch_type::tm_update); + + disk_log->force_roll(ss::default_priority_class()).get(); + + log.flush().get0(); + + BOOST_REQUIRE_EQUAL(disk_log->segment_count(), 2); + + storage::compaction_config c_cfg( + model::timestamp::min(), std::nullopt, ss::default_priority_class(), as); + auto before_compaction = compact_in_memory(log); + + 
BOOST_REQUIRE_EQUAL(before_compaction.size(), 3); + // compact + log.compact(c_cfg).get0(); + auto after_compaction = compact_in_memory(log); + + BOOST_REQUIRE(before_compaction == after_compaction); +} diff --git a/src/v/storage/types.cc b/src/v/storage/types.cc index 65bc010f929be..b3d76868da19a 100644 --- a/src/v/storage/types.cc +++ b/src/v/storage/types.cc @@ -9,6 +9,7 @@ #include "storage/types.h" +#include "storage/compacted_index.h" #include "storage/ntp_config.h" #include "utils/human.h" #include "utils/to_string.h" @@ -169,4 +170,18 @@ std::ostream& operator<<(std::ostream& o, const compaction_result& r) { return o; } +std::ostream& +operator<<(std::ostream& o, compacted_index::recovery_state state) { + switch (state) { + case compacted_index::recovery_state::index_missing: + return o << "index_missing"; + case compacted_index::recovery_state::already_compacted: + return o << "already_compacted"; + case compacted_index::recovery_state::index_needs_rebuild: + return o << "index_needs_rebuild"; + case compacted_index::recovery_state::index_recovered: + return o << "index_recovered"; + } + __builtin_unreachable(); +} } // namespace storage diff --git a/tests/docker/docker-compose.yml b/tests/docker/docker-compose.yml index c4687c13e74d7..14de928b1ba02 100644 --- a/tests/docker/docker-compose.yml +++ b/tests/docker/docker-compose.yml @@ -42,5 +42,6 @@ services: - minio volumes: - '${BUILD_ROOT}:${BUILD_ROOT}' + - '${BUILD_ROOT}/redpanda_installs:/opt/redpanda_installs' networks: - redpanda-test diff --git a/tests/rptest/clients/rpk.py b/tests/rptest/clients/rpk.py index feb0be56fde81..c6c266ac89041 100644 --- a/tests/rptest/clients/rpk.py +++ b/tests/rptest/clients/rpk.py @@ -713,3 +713,26 @@ def cluster_metadata_id(self): return None else: return lines[2] + + def license_set(self, path, license=""): + cmd = [ + self._rpk_binary(), "--api-urls", + self._admin_host(), "cluster", "license", "set" + ] + + if license: + cmd += [license] + if path: + cmd += 
["--path", path] + + return self._execute(cmd) + + def license_info(self): + + cmd = [ + self._rpk_binary(), "--api-urls", + self._admin_host(), "cluster", "license", "info", "--format", + "json" + ] + + return self._execute(cmd) diff --git a/tests/rptest/clients/rpk_remote.py b/tests/rptest/clients/rpk_remote.py index 6b93906e8a2e2..c124374cc93ce 100644 --- a/tests/rptest/clients/rpk_remote.py +++ b/tests/rptest/clients/rpk_remote.py @@ -47,6 +47,12 @@ def cluster_config_force_reset(self, property_name): def cluster_config_lint(self): return self._execute([self._rpk_binary(), 'cluster', 'config', 'lint']) + def tune(self, tuner): + return self._execute([self._rpk_binary(), 'redpanda', 'tune', tuner]) + + def mode_set(self, mode): + return self._execute([self._rpk_binary(), 'redpanda', 'mode', mode]) + def _run_config(self, cmd, path=None, timeout=30): cmd = [self._rpk_binary(), 'redpanda', 'config'] + cmd diff --git a/tests/rptest/services/admin.py b/tests/rptest/services/admin.py index 9d36a385acc3e..23ca749d8b205 100644 --- a/tests/rptest/services/admin.py +++ b/tests/rptest/services/admin.py @@ -439,6 +439,14 @@ def decommission_broker(self, id, node=None): self.redpanda.logger.debug(f"decommissioning {path}") return self._request('put', path, node=node) + def recommission_broker(self, id, node=None): + """ + Recommission broker i.e. 
abort ongoing decommissioning + """ + path = f"brokers/{id}/recommission" + self.redpanda.logger.debug(f"recommissioning {id}") + return self._request('put', path, node=node) + def list_reconfigurations(self, node=None): """ List pending reconfigurations @@ -565,6 +573,18 @@ def delete_user(self, username): self._request("delete", path) + def update_user(self, username, password, algorithm): + self.redpanda.logger.info( + f"Updating user {username}:{password}:{algorithm}") + + self._request("PUT", + f"security/users/{username}", + json=dict( + username=username, + password=password, + algorithm=algorithm, + )) + def list_users(self, node=None): return self._request("get", "security/users", node=node).json() diff --git a/tests/rptest/services/redpanda.py b/tests/rptest/services/redpanda.py index a17e1c27bee7d..df39db9514567 100644 --- a/tests/rptest/services/redpanda.py +++ b/tests/rptest/services/redpanda.py @@ -222,23 +222,24 @@ class SISettings: GLOBAL_S3_REGION_KEY = "s3_region" def __init__( - self, - *, - log_segment_size: int = 16 * 1000000, - cloud_storage_access_key: str = 'panda-user', - cloud_storage_secret_key: str = 'panda-secret', - cloud_storage_region: str = 'panda-region', - cloud_storage_bucket: Optional[str] = None, - cloud_storage_api_endpoint: str = 'minio-s3', - cloud_storage_api_endpoint_port: int = 9000, - cloud_storage_cache_size: int = 160 * 1000000, - cloud_storage_enable_remote_read: bool = True, - cloud_storage_enable_remote_write: bool = True, - cloud_storage_reconciliation_interval_ms: Optional[int] = None, - cloud_storage_max_connections: Optional[int] = None, - cloud_storage_disable_tls: bool = True, - cloud_storage_segment_max_upload_interval_sec: Optional[int] = None - ): + self, + *, + log_segment_size: int = 16 * 1000000, + cloud_storage_access_key: str = 'panda-user', + cloud_storage_secret_key: str = 'panda-secret', + cloud_storage_region: str = 'panda-region', + cloud_storage_bucket: Optional[str] = None, + 
cloud_storage_api_endpoint: str = 'minio-s3', + cloud_storage_api_endpoint_port: int = 9000, + cloud_storage_cache_size: int = 160 * 1000000, + cloud_storage_enable_remote_read: bool = True, + cloud_storage_enable_remote_write: bool = True, + cloud_storage_reconciliation_interval_ms: Optional[int] = None, + cloud_storage_max_connections: Optional[int] = None, + cloud_storage_disable_tls: bool = True, + cloud_storage_segment_max_upload_interval_sec: Optional[int] = None, + cloud_storage_readreplica_manifest_sync_timeout_ms: Optional[ + int] = None): self.log_segment_size = log_segment_size self.cloud_storage_access_key = cloud_storage_access_key self.cloud_storage_secret_key = cloud_storage_secret_key @@ -253,6 +254,7 @@ def __init__( self.cloud_storage_max_connections = cloud_storage_max_connections self.cloud_storage_disable_tls = cloud_storage_disable_tls self.cloud_storage_segment_max_upload_interval_sec = cloud_storage_segment_max_upload_interval_sec + self.cloud_storage_readreplica_manifest_sync_timeout_ms = cloud_storage_readreplica_manifest_sync_timeout_ms self.endpoint_url = f'http://{self.cloud_storage_api_endpoint}:{self.cloud_storage_api_endpoint_port}' def load_context(self, logger, test_context): @@ -309,6 +311,9 @@ def update_rp_conf(self, conf) -> dict[str, Any]: if self.cloud_storage_max_connections: conf[ 'cloud_storage_max_connections'] = self.cloud_storage_max_connections + if self.cloud_storage_readreplica_manifest_sync_timeout_ms: + conf[ + 'cloud_storage_readreplica_manifest_sync_timeout_ms'] = self.cloud_storage_readreplica_manifest_sync_timeout_ms if self.cloud_storage_segment_max_upload_interval_sec: conf[ 'cloud_storage_segment_max_upload_interval_sec'] = self.cloud_storage_segment_max_upload_interval_sec @@ -348,14 +353,25 @@ class SecurityConfig: # the rules, so instead we use a fixed mapping and arrange for certs to use # a similar format. this will change when we get closer to GA and the # configuration becomes more general. 
- PRINCIPAL_MAPPING_RULES = "RULE:^O=Redpanda,CN=(.*?)$/$1/L, DEFAULT" + __DEFAULT_PRINCIPAL_MAPPING_RULES = "RULE:^O=Redpanda,CN=(.*?)$/$1/L, DEFAULT" def __init__(self): self.enable_sasl = False + self.kafka_enable_authorization: Optional[bool] = None + self.endpoint_authn_method: Optional[str] = None self.tls_provider: Optional[TLSProvider] = None - # extract principal from mtls distinguished name - self.enable_mtls_identity = False + # The rules to extract principal from mtls + self.principal_mapping_rules = self.__DEFAULT_PRINCIPAL_MAPPING_RULES + + # sasl is required + def sasl_enabled(self): + return (self.kafka_enable_authorization is None + and self.enable_sasl) or self.endpoint_authn_method == "sasl" + + # principal is extracted from mtls distinguished name + def mtls_identity_enabled(self): + return self.endpoint_authn_method == "mtls_identity" class RedpandaService(Service): @@ -402,7 +418,6 @@ class RedpandaService(Service): 'default_topic_partitions': 4, 'enable_metrics_reporter': False, 'superusers': [SUPERUSER_CREDENTIALS[0]], - 'partition_autobalancing_mode': 'node_add_remove' } logs = { @@ -443,7 +458,7 @@ def __init__(self, environment: Optional[dict[str, str]] = None, security: SecurityConfig = SecurityConfig(), node_ready_timeout_s=None, - enable_installer=False): + superuser: Optional[SaslCredentials] = None): super(RedpandaService, self).__init__(context, num_nodes=num_brokers) self._context = context self._enable_rp = enable_rp @@ -451,9 +466,17 @@ def __init__(self, self._enable_pp = enable_pp self._enable_sr = enable_sr self._security = security - self._installer: Optional[RedpandaInstaller] = None - if enable_installer: - self._installer = RedpandaInstaller(self) + self._installer: RedpandaInstaller = RedpandaInstaller(self) + + if superuser is None: + superuser = self.SUPERUSER_CREDENTIALS + self._skip_create_superuser = False + else: + # When we are passed explicit superuser credentials, presume that the caller + # is taking care of 
user creation themselves (e.g. when testing credential bootstrap) + self._skip_create_superuser = True + + self._superuser = superuser if node_ready_timeout_s is None: node_ready_timeout_s = RedpandaService.DEFAULT_NODE_READY_TIMEOUT_SEC @@ -469,10 +492,9 @@ def __init__(self, else: self._log_level = log_level - self._admin = Admin(self) self._admin = Admin(self, - auth=(self.SUPERUSER_CREDENTIALS.username, - self.SUPERUSER_CREDENTIALS.password)) + auth=(self._superuser.username, + self._superuser.password)) self._started = [] self._security_config = dict() @@ -514,6 +536,9 @@ def set_resource_settings(self, rs): def set_extra_rp_conf(self, conf): self._extra_rp_conf = conf + def add_extra_rp_conf(self, conf): + self._extra_rp_conf = {**self._extra_rp_conf, **conf} + def set_extra_node_conf(self, node, conf): assert node in self.nodes self._extra_node_conf[node] = conf @@ -532,7 +557,13 @@ def _init_tls(self): self, "redpanda.service.admin") def sasl_enabled(self): - return self._security.enable_sasl + return self._security.sasl_enabled() + + def mtls_identity_enabled(self): + return self._security.mtls_identity_enabled() + + def endpoint_authn_method(self): + return self._security.endpoint_authn_method @property def dedicated_nodes(self): @@ -563,7 +594,7 @@ def get_node_memory_mb(self): memory_kb = int(line.strip().split()[1]) return memory_kb / 1024 - def start(self, nodes=None, clean_nodes=True): + def start(self, nodes=None, clean_nodes=True, start_si=True): """Start the service on all nodes.""" to_start = nodes if nodes is not None else self.nodes assert all((node in self.nodes for node in to_start)) @@ -587,7 +618,7 @@ def start(self, nodes=None, clean_nodes=True): # Expected usage is that we may install new binaries before # starting the cluster, and installation-cleaning happened # when we started the installer. 
- self.clean_node(node, clean_installs=False) + self.clean_node(node, preserve_current_install=True) else: self.logger.debug("%s: skip cleaning node" % self.who_am_i(node)) @@ -607,7 +638,8 @@ def start(self, nodes=None, clean_nodes=True): if self._start_duration_seconds < 0: self._start_duration_seconds = time.time() - self._start_time - self._admin.create_user(*self.SUPERUSER_CREDENTIALS) + if not self._skip_create_superuser: + self._admin.create_user(*self._superuser) self.logger.info("Waiting for all brokers to join cluster") expected = set(self._started) @@ -629,7 +661,7 @@ def start(self, nodes=None, clean_nodes=True): raise RuntimeError("Unexpected files in data directory") if self.sasl_enabled(): - username, password, algorithm = self.SUPERUSER_CREDENTIALS + username, password, algorithm = self._superuser self._security_config = dict(security_protocol='SASL_PLAINTEXT', sasl_mechanism=algorithm, sasl_plain_username=username, @@ -637,7 +669,7 @@ def start(self, nodes=None, clean_nodes=True): request_timeout_ms=30000, api_version_auto_timeout_ms=3000) - if self._si_settings is not None: + if start_si and self._si_settings is not None: self.start_si() def write_tls_certs(self): @@ -915,6 +947,8 @@ def start_si(self): logger=self.logger, ) + self.logger.debug( + f"Creating S3 bucket: {self._si_settings.cloud_storage_bucket}") self._s3client.create_bucket(self._si_settings.cloud_storage_bucket) def list_buckets(self) -> dict[str, Union[list, dict]]: @@ -1117,7 +1151,7 @@ def decode_backtraces(self): self.logger.exception("Failed to run seastar-addr2line") def rp_install_path(self): - if self._installer and self._installer._started: + if self._installer._started: # The installer sets up binaries to always use /opt/redpanda. 
return "/opt/redpanda" return self._context.globals.get("rp_install_path_root", None) @@ -1190,9 +1224,20 @@ def clean(self, **kwargs): if self._s3client: self.delete_bucket_from_si() - def clean_node(self, node, preserve_logs=False, clean_installs=True): - node.account.kill_process("redpanda", clean_shutdown=False) - node.account.kill_process("bin/node", clean_shutdown=False) + def clean_node(self, + node, + preserve_logs=False, + preserve_current_install=False): + # These are allow_fail=True to allow for a race where kill_process finds + # the PID, but then the process has died before it sends the SIGKILL. This + # should be safe against actual failures to of the process to stop, because + # we're using SIGKILL which does not require the process's cooperation. + node.account.kill_process("redpanda", + clean_shutdown=False, + allow_fail=True) + node.account.kill_process("bin/node", + clean_shutdown=False, + allow_fail=True) if node.account.exists(RedpandaService.PERSISTENT_ROOT): if node.account.sftp_client.listdir( RedpandaService.PERSISTENT_ROOT): @@ -1210,9 +1255,11 @@ def clean_node(self, node, preserve_logs=False, clean_installs=True): self.EXECUTABLE_SAVE_PATH): node.account.remove(self.EXECUTABLE_SAVE_PATH) - if clean_installs and self._installer is not None: - # Get rid of any installed packages. - self._installer.clean(node) + if not preserve_current_install or not self._installer._started: + # Reset the binaries to use the original binaries. + # NOTE: if the installer hasn't been started, there is no + # installation to preserve! 
+ self._installer.reset_current_install([node]) def remove_local_data(self, node): node.account.remove(f"{RedpandaService.PERSISTENT_ROOT}/data/*") @@ -1268,8 +1315,9 @@ def write_node_conf_file(self, node, override_cfg_params=None): enable_rp=self._enable_rp, enable_pp=self._enable_pp, enable_sr=self._enable_sr, - superuser=self.SUPERUSER_CREDENTIALS, - sasl_enabled=self.sasl_enabled()) + superuser=self._superuser, + sasl_enabled=self.sasl_enabled(), + endpoint_authn_method=self.endpoint_authn_method()) if override_cfg_params or self._extra_node_conf[node]: doc = yaml.full_load(conf) @@ -1292,10 +1340,6 @@ def write_node_conf_file(self, node, override_cfg_params=None): cert_file=RedpandaService.TLS_SERVER_CRT_FILE, truststore_file=RedpandaService.TLS_CA_CRT_FILE, ) - if self._security.enable_mtls_identity: - tls_config.update( - dict(principal_mapping_rules=SecurityConfig. - PRINCIPAL_MAPPING_RULES, )) doc = yaml.full_load(conf) doc["redpanda"].update(dict(kafka_api_tls=tls_config)) conf = yaml.dump(doc) @@ -1316,6 +1360,13 @@ def write_bootstrap_cluster_config(self): if self._security.enable_sasl: self.logger.debug("Enabling SASL in cluster configuration") conf.update(dict(enable_sasl=True)) + if self._security.kafka_enable_authorization is not None: + self.logger.debug( + f"Setting kafka_enable_authorization: {self._security.kafka_enable_authorization} in cluster configuration" + ) + conf.update( + dict(kafka_enable_authorization=self._security. + kafka_enable_authorization)) conf_yaml = yaml.dump(conf) for node in self.nodes: @@ -1355,10 +1406,9 @@ def registered(self, node): # the node is stored in raft0 AND has been replayed on all nodes. Otherwise # a kafka metadata request to the last node to join could return incomplete # metadata and cause strange issues within a test. 
- admin = Admin(self) for peer in self._started: try: - admin_brokers = admin.get_brokers(node=peer) + admin_brokers = self._admin.get_brokers(node=peer) except requests.exceptions.RequestException as e: # We run during startup, when admin API may not even be listening yet: tolerate # API errors but presume that if some APIs are not up yet, then node registration @@ -1388,7 +1438,16 @@ def registered(self, node): f"registered: node {node.name} now visible in peer {peer.name}'s broker list ({admin_brokers})" ) - client = PythonLibrdkafka(self, tls_cert=self._tls_cert) + auth_args = {} + if self.sasl_enabled(): + auth_args = { + 'username': self._superuser.username, + 'password': self._superuser.password, + 'algorithm': self._superuser.algorithm + } + + client = PythonLibrdkafka(self, tls_cert=self._tls_cert, **auth_args) + brokers = client.brokers() broker = brokers.get(idx, None) if broker is None: @@ -1695,7 +1754,7 @@ def save_executable(self): # Any node will do. Even in a mixed-version upgrade test, we should # still have the original binaries available. node = self.nodes[0] - if self._installer and self._installer._started: + if self._installer._started: head_root_path = self._installer.path_for_version( RedpandaInstaller.HEAD) binary = f"{head_root_path}/libexec/redpanda" diff --git a/tests/rptest/services/redpanda_installer.py b/tests/rptest/services/redpanda_installer.py index fe4cfd1bdc462..445d6e9fe8f61 100644 --- a/tests/rptest/services/redpanda_installer.py +++ b/tests/rptest/services/redpanda_installer.py @@ -7,15 +7,37 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0 +import errno +import os import re import requests +from ducktape.utils.util import wait_until + # Match any version that may result from a redpanda binary, which may not be a # released version. # E.g. "v22.1.1-rc1-1373-g77f868..." 
VERSION_RE = re.compile(".*v(\\d+)\\.(\\d+)\\.(\\d+).*") +def wait_for_num_versions(redpanda, num_versions): + def get_unique_versions(): + node = redpanda.nodes[0] + brokers_list = \ + str(node.account.ssh_output(f"{redpanda.find_binary('rpk')} redpanda admin brokers list")) + redpanda.logger.debug(brokers_list) + version_re = re.compile("v\\d+\\.\\d+\\.\\d+") + return set(version_re.findall(brokers_list)) + + # NOTE: allow retries, as the version may not be available immediately + # following a restart. + wait_until(lambda: len(get_unique_versions()) == num_versions, + timeout_sec=30) + unique_versions = get_unique_versions() + assert len(unique_versions) == num_versions, unique_versions + return unique_versions + + class RedpandaInstaller: """ Provides mechanisms to install multiple Redpanda binaries on a cluster. @@ -30,9 +52,19 @@ class RedpandaInstaller: # Represents the binaries installed at the time of the call to start(). It # is expected that this is identical across all nodes initially. HEAD = "head" + + # Directory to which binaries are downloaded. + # + # In local deployments it is expected that this is shared by all nodes in a + # cluster, and that directories therein are only ever created (never + # deleted) during the lifetime of the RedpandaInstaller. INSTALLER_ROOT = "/opt/redpanda_installs" TGZ_URL_TEMPLATE = "https://packages.vectorized.io/qSZR7V26sJx7tCXe/redpanda/raw/names/redpanda-{arch}/versions/{version}/redpanda-{version}-{arch}.tar.gz" + # File path to be used for locking to prevent multiple local test processes + # from operating on the same volume mounts. + INSTALLER_LOCK_PATH = f"{INSTALLER_ROOT}/install_lock" + @staticmethod def root_for_version(version): """ @@ -59,16 +91,96 @@ def __init__(self, redpanda): """ self._started = False self._redpanda = redpanda - self._installed_per_node = dict() - # Keep track if the original install path is /opt/redpanda is used, as - # is the case for package-deployed clusters. 
Since the installer uses - # this directory, we'll need to be mindful not to mess with the - # original binaries. + # Keep track if the original install path is /opt/redpanda, as is the + # case for package-deployed clusters. Since the installer uses this + # directory, we'll need to be mindful not to mess with the original + # binaries. rp_install_path_root = self._redpanda._context.globals.get( "rp_install_path_root", None) self._head_backed_up = rp_install_path_root == "/opt/redpanda" + # Whether the nodes are expected to share a single mounted volume for + # their installs. If so, care should be taken to coordinate operations + # on the installer root. + self._nodes_share_installs = rp_install_path_root != "/opt/redpanda" + + # File descriptor used to coordinate access to the installer root when + # multiple test processes are running on the same machine. + # Must be acquire when operating on the contents of the installer root + # (i.e. root_for_version(), etc). + self._install_lock_fd = None + + def _acquire_install_lock(self, timeout_sec=600): + """ + Attempt to take the install lock, preventing other test processes from + operating an installer. + + Serves to prevent concurrent operations to the same local mountpoint. + """ + if not self._nodes_share_installs: + self._redpanda.logger.debug( + "Nodes don't share installs; no locking needed") + return + + def _lock(): + try: + self._redpanda.logger.debug( + f"Acquiring install lock {self.INSTALLER_LOCK_PATH}") + fd = os.open(self.INSTALLER_LOCK_PATH, + os.O_CREAT | os.O_EXCL | os.O_RDWR) + self._install_lock_fd = fd + except OSError as e: + if e.errno != errno.EEXIST: + raise + # Another process holds the lock. 
+ return False + return True + + wait_until(lambda: _lock(), timeout_sec=timeout_sec) + self._redpanda.logger.debug( + f"Acquired install lock {self.INSTALLER_LOCK_PATH}") + + def _release_install_lock(self): + """ + Releases the install lock, allowing other test processes running + locally to perform downloads. + """ + if not self._nodes_share_installs: + self._redpanda.logger.debug( + "Nodes don't share installs; no locking needed") + return + + if not self._install_lock_fd: + self._redpanda.logger.debug("Installer lock not held") + return True + os.close(self._install_lock_fd) + os.unlink(self.INSTALLER_LOCK_PATH) + self._redpanda.logger.debug("Released install lock") + + def _setup_head_roots_unlocked(self): + """ + Sets up the head roots on each node such that they contain or point to + the original binaries installed at 'rp_install_path_root'. + + Expects that the install lock has been acquired before calling. + """ + nodes = self._redpanda.nodes + head_root_path = RedpandaInstaller.root_for_version( + RedpandaInstaller.HEAD) + rp_install_path_root = self._redpanda._context.globals.get( + "rp_install_path_root", None) + for node in nodes: + # Always end up with binaries at 'head_root_path', so we can + # continue to use root_for_version() to reference the head root. + cmd = None + if self._head_backed_up: + cmd = f"mv /opt/redpanda {head_root_path}" + elif not node.account.exists(head_root_path): + cmd = f"ln -s {rp_install_path_root} {head_root_path}" + if cmd: + node.account.ssh_output(cmd) + def start(self): """ Validates that all nodes in the service have installed the same @@ -78,6 +190,9 @@ def start(self): if self._started: return + # In case a previous test was aborted, do some cleanup. 
+ self.reset_current_install(self._redpanda.nodes) + initial_version = None nodes = self._redpanda.nodes @@ -88,37 +203,23 @@ def start(self): initial_version = vers assert initial_version == vers, \ f"Mismatch version {node.account.hostname} has {vers}, {nodes[0].account.hostname} has {initial_version}" + node.account.ssh_output(f"mkdir -p {self.INSTALLER_ROOT}") - # Clean up the installer root directory so we start out clean. - for node in nodes: - if node.account.exists(RedpandaInstaller.INSTALLER_ROOT): - node.account.remove(f"{RedpandaInstaller.INSTALLER_ROOT}/*", - allow_fail=True) - else: - node.account.mkdir(RedpandaInstaller.INSTALLER_ROOT) + try: + self._acquire_install_lock() + self._setup_head_roots_unlocked() + finally: + self._release_install_lock() - # Now that we're at a sane starting point, set up our install path for - # ease of jumping between versions. + # Start out pointing /opt/redpanda at the current installation. ssh_setup_head_per_node = dict() - head_root_path = RedpandaInstaller.root_for_version( - RedpandaInstaller.HEAD) - rp_install_path_root = self._redpanda._context.globals.get( - "rp_install_path_root", None) + head_root_path = self.root_for_version(RedpandaInstaller.HEAD) for node in nodes: - # For simplicity's sake, always end up with binaries at - # 'head_root_path', so we can continue to use root_for_version() to - # reference the head root. 
- head_cmd = "" - if self._head_backed_up: - head_cmd = f"mv /opt/redpanda {head_root_path}" - else: - head_cmd = f"ln -s {rp_install_path_root} {head_root_path}" - - cmd = f"{head_cmd} && ln -s {head_root_path} /opt/redpanda" - ssh_setup_head_per_node[node] = node.account.ssh_capture(cmd) - self._installed_per_node[node] = set() + if not node.account.exists("/opt/redpanda"): + cmd = f"ln -s {head_root_path} /opt/redpanda" + ssh_setup_head_per_node[node] = node.account.ssh_capture(cmd) self.wait_for_async_ssh(self._redpanda.logger, ssh_setup_head_per_node, - "Setting up head binaries") + "Setting up /opt/redpanda") def int_tuple(str_tuple): return (int(str_tuple[0]), int(str_tuple[1]), int(str_tuple[2])) @@ -159,8 +260,8 @@ def highest_from_prior_feature_version(self, version): def install(self, nodes, version): """ - Installs the release on the given node such that the next time the node - is restarted, it will use the newly installed bits. + Installs the release on the given nodes such that the next time the + nodes are restarted, they will use the newly installed bits. TODO: abstract 'version' into a more generic installation that doesn't necessarily correspond to a released version. E.g. a custom build @@ -168,64 +269,99 @@ def install(self, nodes, version): """ if not self._started: self.start() + + try: + self._acquire_install_lock() + self._install_unlocked(nodes, version) + finally: + self._release_install_lock() + + def _install_unlocked(self, nodes, version): + """ + Like above but expects the install lock to have been taken before + calling. 
+ """ assert version == RedpandaInstaller.HEAD or version in self._released_versions, \ f"Can't find installation for {version}" - ssh_install_per_node = dict() + version_root = self.root_for_version(version) + + nodes_to_download = nodes + if self._nodes_share_installs: + nodes_to_download = [nodes[0]] + + ssh_download_per_node = dict() + for node in nodes_to_download: + if not version == RedpandaInstaller.HEAD and not node.account.exists( + version_root): + ssh_download_per_node[ + node] = self._async_download_on_node_unlocked( + node, version) + self.wait_for_async_ssh(self._redpanda.logger, ssh_download_per_node, + "Finished downloading binaries") + + # Regardless of whether we downloaded anything, adjust the + # /opt/redpanda link to point to the appropriate version on all nodes. + relink_cmd = f"unlink /opt/redpanda && ln -s {version_root} /opt/redpanda" for node in nodes: - # If we already have this version installed, just adjust the - # symlinks. - version_root = self.root_for_version(version) - relink_cmd = f"unlink /opt/redpanda && ln -s {version_root} /opt/redpanda" - if version == RedpandaInstaller.HEAD or version in self._installed_per_node[ - node]: - ssh_install_per_node[node] = node.account.ssh_capture( - relink_cmd) - continue - - arch = "amd64" - uname = str(node.account.ssh_output("uname -m")) - if "aarch" in uname or "arm" in uname: - arch = "arm64" - self._redpanda.logger.debug( - f"{node.account.hostname} uname output: {uname}") + node.account.ssh_output(relink_cmd) + + def _async_download_on_node_unlocked(self, node, version): + """ + Asynchonously downloads Redpanda of the given version on the given + node. Returns an iterator to the results. 
- self._installed_per_node[node].add(version) - url = RedpandaInstaller.TGZ_URL_TEMPLATE.format( \ - arch=arch, version=f"{version[0]}.{version[1]}.{version[2]}") - tgz = "redpanda.tar.gz" - cmd = f"curl -fsSL {url} --create-dir --output-dir {version_root} -o {tgz} && gunzip -c {version_root}/{tgz} | tar -xf - -C {version_root} && rm {version_root}/{tgz} && {relink_cmd}" - ssh_install_per_node[node] = node.account.ssh_capture(cmd) + Expects the install lock to have been taken before calling. + """ + version_root = self.root_for_version(version) + arch = "amd64" + uname = str(node.account.ssh_output("uname -m")) + if "aarch" in uname or "arm" in uname: + arch = "arm64" + self._redpanda.logger.debug( + f"{node.account.hostname} uname output: {uname}") - self.wait_for_async_ssh(self._redpanda.logger, ssh_install_per_node, - "Finished installing binaries") + url = RedpandaInstaller.TGZ_URL_TEMPLATE.format( \ + arch=arch, version=f"{version[0]}.{version[1]}.{version[2]}") + tgz = "redpanda.tar.gz" + cmd = f"curl -fsSL {url} --create-dir --output-dir {version_root} -o {tgz} && gunzip -c {version_root}/{tgz} | tar -xf - -C {version_root} && rm {version_root}/{tgz}" + return node.account.ssh_capture(cmd) - def clean(self, node): + def reset_current_install(self, nodes): """ - Cleans the node such that only the original installation remains. + WARNING: should not be used to upgrade to the originally installed + binaries; use 'install(RedpandaInstaller.HEAD)' for that. This should + only be used to clean up a node to its expected starting state (the + state of the world before the first call to 'start()'). + + Resets any /opt/redpanda symlink to instead be real binaries if they + exist. This is a best attempt effort to revert the installs to their + original state (i.e. the state before installing other versions). - This should only be called once there is no longer a need to run the - RedpandaService. 
+ Upon returning, either: + - this is a packaged deployment (CDT) and we are left with a real + /opt/redpanda directory (not a symlink) if possible, or + - this is a local deployment and we are left with no links to head + binaries """ - if not self._started: - self._redpanda.logger.debug( - "Ignoring cleanup, installer not started") - return + head_root_path = self.root_for_version(RedpandaInstaller.HEAD) + for node in nodes: + host = node.account.hostname + if self._head_backed_up: + assert not self._nodes_share_installs + # NOTE: no locking required since installs aren't shared. + head_root_path_exists = node.account.exists(head_root_path) + opt_redpanda_exists = node.account.exists("/opt/redpanda") + if opt_redpanda_exists: + if not node.account.islink("/opt/redpanda"): + assert not head_root_path_exists, \ + f"{host}: {head_root_path} exists and /opt/redpanda exists but is not a link; unclear which to use" + continue + node.account.ssh_output("unlink /opt/redpanda", + allow_fail=True) - # Allow failures so the entire cleanup can proceed even on failure. - head_root_path = RedpandaInstaller.root_for_version( - RedpandaInstaller.HEAD) - if self._head_backed_up: - cmd = f"unlink /opt/redpanda && mv {head_root_path} /opt/redpanda" - node.account.ssh(cmd, allow_fail=True) - else: - cmd = f"unlink /opt/redpanda && unlink {head_root_path}" - node.account.ssh(cmd, allow_fail=True) - - # Also clean up all the downloaded published binaries. 
- roots_to_rm = [ - RedpandaInstaller.root_for_version(v) - for v in self._installed_per_node[node] - ] - if len(roots_to_rm) == 0: - return - node.account.remove(' '.join(roots_to_rm), allow_fail=True) + assert head_root_path_exists, f"{host}: neither {head_root_path} nor /opt/redpanda exists" + node.account.ssh_output(f"mv {head_root_path} /opt/redpanda", + allow_fail=True) + else: + node.account.ssh_output("unlink /opt/redpanda", + allow_fail=True) diff --git a/tests/rptest/services/templates/redpanda.yaml b/tests/rptest/services/templates/redpanda.yaml index 18ec6f1d3e13a..c68605ae3f710 100644 --- a/tests/rptest/services/templates/redpanda.yaml +++ b/tests/rptest/services/templates/redpanda.yaml @@ -21,9 +21,15 @@ redpanda: - name: dnslistener address: "{{node.account.hostname}}" port: 9092 + {% if endpoint_authn_method %} + authentication_method: {{ endpoint_authn_method }} + {% endif %} - name: iplistener address: "{{node_ip}}" port: {{kafka_alternate_port}} + {% if endpoint_authn_method %} + authentication_method: {{ endpoint_authn_method }} + {% endif %} admin: - address: 127.0.0.1 port: 9644 diff --git a/tests/rptest/test_suite_quick.yml b/tests/rptest/test_suite_quick.yml index 51ff6beb922c3..f8fcdd94938fe 100644 --- a/tests/rptest/test_suite_quick.yml +++ b/tests/rptest/test_suite_quick.yml @@ -17,3 +17,4 @@ quick: - tests/wasm_identity_test.py - tests/wasm_partition_movement_test.py - tests/wasm_redpanda_failure_recovery_test.py + - tests/rpk_tuner_test.py diff --git a/tests/rptest/test_suite_rpk.yml b/tests/rptest/test_suite_rpk.yml index 07041744ba826..4a2e5159ed6a7 100644 --- a/tests/rptest/test_suite_rpk.yml +++ b/tests/rptest/test_suite_rpk.yml @@ -12,3 +12,4 @@ quick: - tests/rpk_topic_test.py - tests/rpk_cluster_test.py - tests/rpk_config_test.py + - tests/rpk_tuner_test.py diff --git a/tests/rptest/tests/acls_test.py b/tests/rptest/tests/acls_test.py index 4d839a9676e25..52fe67170c7e6 100644 --- a/tests/rptest/tests/acls_test.py +++ 
b/tests/rptest/tests/acls_test.py @@ -31,7 +31,9 @@ def create_broker_cert(self, redpanda, node): return self.tls.create_cert(node.name) def create_service_client_cert(self, _, name): - return self.tls.create_cert(socket.gethostname(), name=name) + return self.tls.create_cert(socket.gethostname(), + name=name, + common_name=name) class AccessControlListTest(RedpandaTest): @@ -46,10 +48,16 @@ def setUp(self): # it with custom security settings return - def prepare_cluster(self, use_tls, use_sasl): + def prepare_cluster(self, + use_tls, + use_sasl, + enable_authz=None, + authn_method=None, + principal_mapping_rules=None): self.security = SecurityConfig() self.security.enable_sasl = use_sasl - self.security.enable_mtls_identity = use_tls and not use_sasl + self.security.kafka_enable_authorization = enable_authz + self.security.endpoint_authn_method = authn_method if use_tls: self.tls = tls.TLSCertManager(self.logger) @@ -73,12 +81,20 @@ def prepare_cluster(self, use_tls, use_sasl): self.security.tls_provider = MTLSProvider(self.tls) + if self.security.mtls_identity_enabled(): + if principal_mapping_rules is not None: + self.security.principal_mapping_rules = principal_mapping_rules + self.redpanda.add_extra_rp_conf({ + 'kafka_mtls_principal_mapping_rules': + [self.security.principal_mapping_rules] + }) + self.redpanda.set_security_settings(self.security) self.redpanda.start() admin = Admin(self.redpanda) - if self.security.enable_mtls_identity: + if self.security.mtls_identity_enabled(): feature_name = "mtls_authentication" admin.put_feature(feature_name, {"state": "active"}) @@ -92,11 +108,11 @@ def check_feature_active(): wait_until(check_feature_active, timeout_sec=10, backoff_sec=1) # base case user is not a superuser and has no configured ACLs - if use_sasl: + if use_sasl or enable_authz: admin.create_user("base", self.password, self.algorithm) # only grant cluster describe permission to user cluster_describe - if use_sasl: + if use_sasl or enable_authz: 
admin.create_user("cluster_describe", self.password, self.algorithm) client = self.get_super_client() @@ -105,7 +121,7 @@ def check_feature_active(): # there is not a convenient interface for waiting for acls to propogate # to all nodes so when we are using mtls only for identity we inject a # sleep here to try to avoid any acl propogation races. - if self.security.enable_mtls_identity: + if self.security.mtls_identity_enabled(): time.sleep(5) return @@ -120,7 +136,8 @@ def users_propogated(): wait_until(users_propogated, timeout_sec=10, backoff_sec=1) def get_client(self, username): - if self.security.enable_mtls_identity: + if self.security.mtls_identity_enabled( + ) or not self.security.sasl_enabled(): if username == "base": cert = self.base_user_cert elif username == "cluster_describe": @@ -140,7 +157,8 @@ def get_client(self, username): tls_cert=cert) def get_super_client(self): - if self.security.enable_mtls_identity: + if self.security.mtls_identity_enabled( + ) or not self.security.sasl_enabled(): return RpkTool(self.redpanda, tls_cert=self.admin_user_cert) username, password, _ = self.redpanda.SUPERUSER_CREDENTIALS @@ -154,24 +172,111 @@ def get_super_client(self): sasl_mechanism=self.algorithm, tls_cert=cert) + # The old config style has use_sasl at the top level, which enables + # authorization. New config style has kafka_enable_authorization at the + # top-level, with authentication_method on the listener. 
@cluster(num_nodes=3) + # plaintext conn + sasl for authn (global sasl config) @parametrize(use_tls=False, - use_sasl=True) # plaintext conn + sasl for authn - @parametrize(use_tls=True, use_sasl=True) # ssl/tls conn + sasl for authn - @parametrize(use_tls=True, use_sasl=False) # ssl/tls conn + mtls for authn - def test_describe_acls(self, use_tls, use_sasl): + use_sasl=True, + enable_authz=None, + authn_method=None) + # ssl/tls conn + sasl for authn (global sasl config) + @parametrize(use_tls=True, + use_sasl=True, + enable_authz=None, + authn_method=None) + # ssl/tls conn + sasl for authn (listener sasl config) + @parametrize(use_tls=True, + use_sasl=False, + enable_authz=True, + authn_method="sasl") + # ssl/tls conn + mtls for authn (listener mtls config) + @parametrize(use_tls=True, + use_sasl=False, + enable_authz=True, + authn_method="mtls_identity") + # Disable authz + @parametrize(use_tls=True, + use_sasl=True, + enable_authz=False, + authn_method=None, + always_succeed=True) + # Disable authz + @parametrize(use_tls=True, + use_sasl=True, + enable_authz=False, + authn_method="sasl", + always_succeed=True) + # Disable authz + @parametrize(use_tls=True, + use_sasl=True, + enable_authz=False, + authn_method="mtls_identity", + always_succeed=True) + def test_describe_acls(self, + use_tls, + use_sasl, + enable_authz, + authn_method, + always_succeed=False): """ security::acl_operation::describe, security::default_cluster_name """ - self.prepare_cluster(use_tls, use_sasl) + self.prepare_cluster(use_tls, use_sasl, enable_authz, authn_method) # run a few times for good health for _ in range(5): try: self.get_client("base").acl_list() - assert False, "list acls should have failed" + assert always_succeed, "list acls should have failed" except ClusterAuthorizationError: - pass + assert not always_succeed self.get_client("cluster_describe").acl_list() self.get_super_client().acl_list() + + # Test mtls identity + # Principals in use: + # * redpanda.service.admin: the 
default admin client + # * admin: used for acl bootstrap + # * cluster_describe: the principal under test + @cluster(num_nodes=3) + # DEFAULT: The whole SAN + @parametrize(rules="DEFAULT", fail=True) + # Match admin, or O (Redpanda) + @parametrize( + rules= + "RULE:^O=Redpanda,CN=(redpanda.service.admin|admin)$/$1/, RULE:^O=([^,]+),CN=(.*?)$/$1/", + fail=True) + # Wrong Case + @parametrize(rules="RULE:^O=Redpanda,CN=(.*?)$/$1/U", fail=True) + # Match CN + @parametrize(rules="RULE:^O=Redpanda,CN=(.*?)$/$1/L", fail=False) + # Full Match + @parametrize( + rules= + "RULE:^O=Redpanda,CN=(cluster_describe|redpanda.service.admin|admin)$/$1/", + fail=False) + # Match admin or empty + @parametrize( + rules= + "RULE:^O=Redpanda,CN=(admin|redpanda.service.admin)$/$1/, RULE:^O=Redpanda,CN=()$/$1/L", + fail=True) + def test_mtls_principal(self, rules=None, fail=False): + """ + security::acl_operation::describe, security::default_cluster_name + """ + self.prepare_cluster(use_tls=True, + use_sasl=False, + enable_authz=True, + authn_method="mtls_identity", + principal_mapping_rules=rules) + + # run a few times for good health + for _ in range(5): + try: + self.get_client("cluster_describe").acl_list() + assert not fail, "list acls should have failed" + except ClusterAuthorizationError: + assert fail, "list acls should have succeeded" diff --git a/tests/rptest/tests/cluster_config_test.py b/tests/rptest/tests/cluster_config_test.py index feef159957568..56bd89c0331b0 100644 --- a/tests/rptest/tests/cluster_config_test.py +++ b/tests/rptest/tests/cluster_config_test.py @@ -484,7 +484,10 @@ def test_valid_settings(self): # Don't change these settings, they prevent the test from subsequently # using the cluster - exclude_settings = {'enable_sasl'} + exclude_settings = { + 'enable_sasl', 'kafka_enable_authorization', + 'kafka_mtls_principal_mapping_rules' + } # Don't enable coproc: it generates log errors if its companion service isn't running exclude_settings.add('enable_coproc') diff 
--git a/tests/rptest/tests/cluster_features_test.py b/tests/rptest/tests/cluster_features_test.py index 471b576cba87d..ab7c91c43df89 100644 --- a/tests/rptest/tests/cluster_features_test.py +++ b/tests/rptest/tests/cluster_features_test.py @@ -42,7 +42,7 @@ def _assert_default_features(self): # This assertion will break each time we increment the value # of `latest_version` in the redpanda source. Update it when # that happens. - assert features_response['cluster_version'] == 4 + assert features_response['cluster_version'] == 5 assert self._get_features_map( features_response)['central_config']['state'] == 'active' diff --git a/tests/rptest/tests/end_to_end.py b/tests/rptest/tests/end_to_end.py index ecdda96790149..594be6b244d3b 100644 --- a/tests/rptest/tests/end_to_end.py +++ b/tests/rptest/tests/end_to_end.py @@ -86,7 +86,8 @@ def start_redpanda(self, self.redpanda = RedpandaService(self.test_context, num_nodes, extra_rp_conf=self._extra_rp_conf, - extra_node_conf=self._extra_node_conf) + extra_node_conf=self._extra_node_conf, + si_settings=self.si_settings) self.redpanda.start() self._client = DefaultClient(self.redpanda) @@ -153,8 +154,8 @@ def has_finished_consuming(): wait_until(has_finished_consuming, timeout_sec=timeout_sec, - err_msg="Consumer failed to consume up to offsets %s after waiting %ds." %\ - (str(last_acked_offsets), timeout_sec)) + err_msg="Consumer failed to consume up to offsets %s after waiting %ds, last consumed offsets: %s." %\ + (str(last_acked_offsets), timeout_sec, list(self.last_consumed_offsets))) def _collect_all_logs(self): for s in self.test_context.services: diff --git a/tests/rptest/tests/fix_5355_upgrade_test.py b/tests/rptest/tests/fix_5355_upgrade_test.py new file mode 100644 index 0000000000000..d7776299ba7df --- /dev/null +++ b/tests/rptest/tests/fix_5355_upgrade_test.py @@ -0,0 +1,117 @@ +# Copyright 2022 Redpanda Data, Inc. 
+# +# Use of this software is governed by the Business Source License +# included in the file licenses/BSL.md +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0 + +import re + +from rptest.clients.types import TopicSpec +from rptest.tests.redpanda_test import RedpandaTest +from rptest.services.cluster import cluster +from rptest.services.redpanda import RESTART_LOG_ALLOW_LIST +from rptest.services.redpanda_installer import RedpandaInstaller, wait_for_num_versions +from rptest.services.redpanda import RedpandaService + +from confluent_kafka import (Producer, KafkaException) +from random import choice +from string import ascii_uppercase + + +def on_delivery(err, msg): + if err is not None: + raise KafkaException(err) + + +class Fix5355UpgradeTest(RedpandaTest): + topics = [TopicSpec(name="topic1")] + """ + Basic test that upgrading software works as expected. + """ + def __init__(self, test_context): + extra_rp_conf = { + "default_topic_replications": 3, + "default_topic_partitions": 1, + "log_segment_size": 1048576 + } + super(Fix5355UpgradeTest, self).__init__(test_context=test_context, + num_brokers=3, + extra_rp_conf=extra_rp_conf) + self.installer = self.redpanda._installer + + def setUp(self): + # NOTE: `rpk redpanda admin brokers list` requires versions v22.1.x and + # above. 
+ self.installer.install(self.redpanda.nodes, (22, 1, 3)) + super(Fix5355UpgradeTest, self).setUp() + + def fill_segment(self): + payload_1kb = ''.join(choice(ascii_uppercase) for i in range(1024)) + p = Producer({ + "bootstrap.servers": self.redpanda.brokers(), + "enable.idempotence": True, + "retries": 5 + }) + for i in range(0, 2 * 1024): + p.produce("topic1", + key="key1".encode('utf-8'), + value=payload_1kb.encode('utf-8'), + callback=on_delivery) + p.flush() + + def check_snapshot_exist(self): + for node in self.redpanda.nodes: + cmd = f"find {RedpandaService.DATA_DIR}" + out_iter = node.account.ssh_capture(cmd) + has_snapshot = False + for line in out_iter: + has_snapshot = has_snapshot or re.match( + f"{RedpandaService.DATA_DIR}/kafka/topic1/\\d+_\\d+/tx.snapshot", + line) + assert has_snapshot + + @cluster(num_nodes=3, log_allow_list=RESTART_LOG_ALLOW_LIST) + def test_rollback(self): + """ + the test checks than a mid upgrade rollback isn't broken + """ + first_node = self.redpanda.nodes[0] + + unique_versions = wait_for_num_versions(self.redpanda, 1) + assert "v22.1.3" in unique_versions, unique_versions + + # Upgrade one node to the head version. + self.installer.install([first_node], RedpandaInstaller.HEAD) + self.redpanda.restart_nodes([first_node]) + unique_versions = wait_for_num_versions(self.redpanda, 2) + assert "v22.1.3" in unique_versions, unique_versions + + self.fill_segment() + self.check_snapshot_exist() + + # Rollback the partial upgrade and ensure we go back to the original + # state. 
+ self.installer.install([first_node], (22, 1, 3)) + self.redpanda.restart_nodes([first_node]) + unique_versions = wait_for_num_versions(self.redpanda, 1) + assert "v22.1.3" in unique_versions, unique_versions + + @cluster(num_nodes=3, log_allow_list=RESTART_LOG_ALLOW_LIST) + def test_upgrade(self): + """ + the test checks than upgrade isn't broken + """ + unique_versions = wait_for_num_versions(self.redpanda, 1) + assert "v22.1.3" in unique_versions, unique_versions + + self.fill_segment() + self.check_snapshot_exist() + + # Upgrade one node to the head version. + self.installer.install(self.redpanda.nodes, RedpandaInstaller.HEAD) + self.redpanda.restart_nodes(self.redpanda.nodes) + unique_versions = wait_for_num_versions(self.redpanda, 1) + assert "v22.1.3" not in unique_versions, unique_versions diff --git a/tests/rptest/tests/group_membership_test.py b/tests/rptest/tests/group_membership_test.py index 389bc1cf3a6b8..d57fce734e79c 100644 --- a/tests/rptest/tests/group_membership_test.py +++ b/tests/rptest/tests/group_membership_test.py @@ -127,7 +127,8 @@ def __init__(self, ctx, *args, **kwargs): # Require internal_kafka topic to have an increased replication factor extra_rp_conf = dict(default_topic_replications=3, - enable_leader_balancer=False) + enable_leader_balancer=False, + group_topic_partitions=1) super(GroupMetricsTest, self).__init__(test_context=ctx, num_brokers=3, extra_rp_conf=extra_rp_conf) @@ -374,6 +375,9 @@ def select_next_leader(): timeout_sec=30, backoff_sec=5) + self.logger.debug( + f"Waiting for metrics from the single node: {new_leader.account.hostname}" + ) wait_until(lambda: metrics_from_single_node(new_leader), timeout_sec=30, backoff_sec=5) diff --git a/tests/rptest/tests/nodes_decommissioning_test.py b/tests/rptest/tests/nodes_decommissioning_test.py index 2256f2cbf70c6..d38d0d451100d 100644 --- a/tests/rptest/tests/nodes_decommissioning_test.py +++ b/tests/rptest/tests/nodes_decommissioning_test.py @@ -44,7 +44,7 @@ def 
_partitions_moving(self): def _partitions_not_moving(self): admin = Admin(self.redpanda) reconfigurations = admin.list_reconfigurations() - return len(reconfigurations) > 0 + return len(reconfigurations) == 0 def _partition_to_move(self, predicate): rpk = RpkTool(self.redpanda) @@ -69,6 +69,38 @@ def _node_removed(self, removed_id, node_to_query): return False return True + def _find_replacement(self, current_replicas, to_remove): + new_replicas = [] + unique_node_ids = set() + for r in current_replicas: + if r['node_id'] != to_remove: + unique_node_ids.add(r['node_id']) + new_replicas.append(r) + + admin = Admin(self.redpanda) + brokers = admin.get_brokers() + + to_add = None + while len(unique_node_ids) < len(current_replicas): + id = random.choice(brokers)['node_id'] + if id == to_remove: + continue + to_add = id + unique_node_ids.add(to_add) + + new_replicas.append({"node_id": to_add, "core": 0}) + return new_replicas + + def _wait_until_status(self, node_id, status, timeout_sec=15): + def requested_status(): + brokers = Admin(self.redpanda).get_brokers() + for broker in brokers: + if broker['node_id'] == node_id: + return broker['membership_status'] == status + return False + + wait_until(requested_status, timeout_sec=timeout_sec, backoff_sec=1) + @cluster( num_nodes=6, # A decom can look like a restart in terms of logs from peers dropping @@ -174,18 +206,6 @@ def test_decommissioning_cancel_ongoing_movements(self): self.logger.info(f"decommissioning node: {to_decommission}", ) admin.decommission_broker(to_decommission) - def check_status(node_id, status): - brokers = admin.get_brokers() - for broker in brokers: - if broker['node_id'] == node_id: - return broker['membership_status'] == status - - return False - - wait_until(lambda: check_status(to_decommission, 'draining'), - timeout_sec=15, - backoff_sec=1) - survivor_node = self._not_decommissioned_node(to_decommission) # adjust recovery throttle to make sure moves will finish 
rpk.cluster_config_set("raft_learner_recovery_rate", str(2 << 30)) @@ -198,3 +218,141 @@ def check_status(node_id, status): self.redpanda.stop_node(self.redpanda.get_node(to_decommission)) self.run_validation(enable_idempotence=False, consumer_timeout_sec=90) + + @cluster(num_nodes=6, log_allow_list=RESTART_LOG_ALLOW_LIST) + def test_recommissioning_node(self): + self.start_redpanda(num_nodes=4) + self._create_topics() + + self.start_producer(1) + self.start_consumer(1) + self.await_startup() + admin = Admin(self.redpanda) + + brokers = admin.get_brokers() + to_decommission = random.choice(brokers)['node_id'] + + # throttle recovery + rpk = RpkTool(self.redpanda) + rpk.cluster_config_set("raft_learner_recovery_rate", str(1)) + + self.logger.info(f"decommissioning node: {to_decommission}", ) + admin.decommission_broker(to_decommission) + + self._wait_until_status(to_decommission, 'draining') + + wait_until(lambda: self._partitions_moving(), + timeout_sec=15, + backoff_sec=1) + + # recommission broker + admin.recommission_broker(to_decommission) + self._wait_until_status(to_decommission, 'active') + + wait_until(lambda: self._partitions_not_moving(), + timeout_sec=15, + backoff_sec=1) + + @cluster(num_nodes=6, log_allow_list=RESTART_LOG_ALLOW_LIST) + def test_recommissioning_do_not_stop_all_moves_node(self): + self.start_redpanda(num_nodes=4) + self._create_topics() + + self.start_producer(1) + self.start_consumer(1) + self.await_startup() + admin = Admin(self.redpanda) + + brokers = admin.get_brokers() + to_decommission = random.choice(brokers)['node_id'] + + # throttle recovery + rpk = RpkTool(self.redpanda) + rpk.cluster_config_set("raft_learner_recovery_rate", str(1)) + + # schedule partition move from the node being decommissioned before actually calling decommission + + to_move_tp, to_move_p, _ = self._partition_to_move( + lambda p: to_decommission in p.replicas) + details = admin.get_partitions(topic=to_move_tp, partition=to_move_p) + + new_replicas = 
self._find_replacement(details['replicas'], + to_decommission) + self.logger.info( + f"moving partition {to_move_tp}/{to_move_p} - {details['replicas']} -> {new_replicas}" + ) + + admin.set_partition_replicas(topic=to_move_tp, + partition=to_move_p, + replicas=new_replicas) + # moving partition should be present in moving list + wait_until(lambda: self._partitions_moving(), + timeout_sec=15, + backoff_sec=1) + + self.logger.info(f"decommissioning node: {to_decommission}", ) + admin.decommission_broker(to_decommission) + + self._wait_until_status(to_decommission, 'draining') + + wait_until(lambda: self._partitions_moving(), + timeout_sec=15, + backoff_sec=1) + + # recommission broker + admin.recommission_broker(to_decommission) + self._wait_until_status(to_decommission, 'active') + + def one_left_moving(): + reconfigurations = admin.list_reconfigurations() + return len(reconfigurations) == 1 + + wait_until(one_left_moving, timeout_sec=15, backoff_sec=1) + + @cluster(num_nodes=7, log_allow_list=RESTART_LOG_ALLOW_LIST) + def test_recommissioning_one_of_decommissioned_nodes(self): + self.start_redpanda(num_nodes=5) + self._create_topics() + + self.start_producer(1) + self.start_consumer(1) + self.await_startup() + admin = Admin(self.redpanda) + + brokers = admin.get_brokers() + to_decommission_1 = random.choice(brokers)['node_id'] + to_decommission_2 = to_decommission_1 + + while to_decommission_1 == to_decommission_2: + to_decommission_2 = random.choice(brokers)['node_id'] + + # throttle recovery + rpk = RpkTool(self.redpanda) + rpk.cluster_config_set("raft_learner_recovery_rate", str(1)) + + self.logger.info(f"decommissioning node: {to_decommission_1}", ) + admin.decommission_broker(to_decommission_1) + self.logger.info(f"decommissioning node: {to_decommission_2}", ) + admin.decommission_broker(to_decommission_2) + + self._wait_until_status(to_decommission_1, 'draining') + self._wait_until_status(to_decommission_2, 'draining') + + wait_until(lambda: 
self._partitions_moving(), + timeout_sec=15, + backoff_sec=1) + + # recommission broker that was decommissioned first + admin.recommission_broker(to_decommission_1) + self._wait_until_status(to_decommission_1, 'active') + + rpk.cluster_config_set("raft_learner_recovery_rate", str(2 << 30)) + + def node_removed(): + brokers = admin.get_brokers() + for broker in brokers: + if broker['node_id'] == to_decommission_2: + return False + return True + + wait_until(node_removed, 60, 2) diff --git a/tests/rptest/tests/partition_balancer_test.py b/tests/rptest/tests/partition_balancer_test.py index 6e47c29517c27..5b2d0e9596266 100644 --- a/tests/rptest/tests/partition_balancer_test.py +++ b/tests/rptest/tests/partition_balancer_test.py @@ -47,12 +47,11 @@ def all_partitions_ready(): return False return (len(partitions) == num_partitions, partitions) - return wait_until_result( - all_partitions_ready, - timeout_sec=30, - backoff_sec=1, - err_msg="failed to wait until all partitions have leaders", - ) + partitions = wait_until_result( + all_partitions_ready, + timeout_sec=120, + backoff_sec=1, + err_msg="failed to wait until all partitions have leaders") def node2partition_count(self): topics = [self.topic] @@ -65,7 +64,7 @@ def node2partition_count(self): return ret - def wait_until_status(self, predicate, timeout_sec=60): + def wait_until_status(self, predicate, timeout_sec=120): admin = Admin(self.redpanda) start = time.time() @@ -90,7 +89,7 @@ def check(): err_msg="failed to wait until status condition", ) - def wait_until_ready(self, timeout_sec=60): + def wait_until_ready(self, timeout_sec=120): return self.wait_until_status( lambda status: status["status"] == "ready", timeout_sec=timeout_sec ) diff --git a/tests/rptest/tests/read_replica_e2e_test.py b/tests/rptest/tests/read_replica_e2e_test.py new file mode 100644 index 0000000000000..81afc0e3896f5 --- /dev/null +++ b/tests/rptest/tests/read_replica_e2e_test.py @@ -0,0 +1,132 @@ +# Copyright 2022 Redpanda Data, Inc. 
+# +# Use of this software is governed by the Business Source License +# included in the file licenses/BSL.md +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0 +from rptest.services.cluster import cluster + +from rptest.clients.default import DefaultClient +from rptest.services.redpanda import SISettings +from rptest.clients.rpk import RpkTool +from rptest.clients.types import TopicSpec +from ducktape.mark import matrix + +import json + +from rptest.services.redpanda import RedpandaService +from rptest.tests.end_to_end import EndToEndTest +from rptest.services.verifiable_producer import VerifiableProducer, is_int_with_prefix +from rptest.services.verifiable_consumer import VerifiableConsumer +from rptest.util import ( + wait_until, ) + + +class TestReadReplicaService(EndToEndTest): + log_segment_size = 1048576 # 5MB + topic_name = "panda-topic" + s3_bucket_name = "panda-bucket" + si_settings = SISettings( + cloud_storage_bucket=s3_bucket_name, + cloud_storage_reconciliation_interval_ms=500, + cloud_storage_max_connections=5, + log_segment_size=log_segment_size, + cloud_storage_readreplica_manifest_sync_timeout_ms=500, + cloud_storage_segment_max_upload_interval_sec=5) + + def __init__(self, test_context): + super(TestReadReplicaService, self).__init__(test_context=test_context) + self.second_cluster = None + + def create_read_replica_topic(self): + self.second_cluster = RedpandaService(self.test_context, + num_brokers=3, + si_settings=self.si_settings) + self.second_cluster.start(start_si=False) + + rpk_second_cluster = RpkTool(self.second_cluster) + conf = { + 'redpanda.remote.readreplica': self.s3_bucket_name, + } + rpk_second_cluster.create_topic(self.topic_name, config=conf) + + def start_consumer(self): + self.consumer = VerifiableConsumer( + self.test_context, + num_nodes=1, + redpanda=self.second_cluster, + topic=self.topic_name, + 
group_id='consumer_test_group', + on_record_consumed=self.on_record_consumed) + self.consumer.start() + + def start_producer(self): + self.producer = VerifiableProducer( + self.test_context, + num_nodes=1, + redpanda=self.redpanda, + topic=self.topic_name, + throughput=1000, + message_validator=is_int_with_prefix) + self.producer.start() + + @cluster(num_nodes=8) + @matrix(partition_count=[10], min_records=[10000]) + def test_simple_end_to_end(self, partition_count, min_records): + # Create original topic, produce data to it + self.start_redpanda(3, si_settings=self.si_settings) + spec = TopicSpec(name=self.topic_name, + partition_count=partition_count, + replication_factor=3) + + DefaultClient(self.redpanda).create_topic(spec) + + self.start_producer() + wait_until(lambda: self.producer.num_acked > min_records, + timeout_sec=30, + err_msg="Producer failed to produce messages for %ds." %\ + 30) + self.logger.info("Stopping producer after writing up to offsets %s" %\ + str(self.producer.last_acked_offsets)) + self.producer.stop() + + # Make original topic upload data to S3 + rpk = RpkTool(self.redpanda) + rpk.alter_topic_config(spec.name, 'redpanda.remote.write', 'true') + + # Make sure all produced data is uploaded to S3 + def s3_has_all_data(): + objects = list( + self.redpanda._s3client.list_objects(self.s3_bucket_name)) + total_uploaded = 0 + for o in objects: + if o.Key.endswith( + "/manifest.json") and self.topic_name in o.Key: + data = self.redpanda._s3client.get_object_data( + self.s3_bucket_name, o.Key) + manifest = json.loads(data) + last_upl_offset = manifest['last_offset'] + total_uploaded += last_upl_offset + self.logger.info( + f"Found manifest at {o.Key}, last_offset is {last_upl_offset}" + ) + self.logger.info( + f"Total uploaded: {total_uploaded}, num_acked: {self.producer.num_acked}" + ) + return total_uploaded >= self.producer.num_acked + + wait_until( + s3_has_all_data, + timeout_sec= + 30, #should be uploaded since 
cloud_storage_segment_max_upload_interval_sec=5 + backoff_sec=5, + err_msg= + f"Not all data is uploaded to S3 bucket, is S3 bucket: {list(self.redpanda._s3client.list_objects(self.s3_bucket_name))}" + ) + + # Create read replica topic, consume from it and validate + self.create_read_replica_topic() + self.start_consumer() + self.run_validation() diff --git a/tests/rptest/tests/redpanda_test.py b/tests/rptest/tests/redpanda_test.py index 3b69e5b44bb3a..f81f7c75b1a22 100644 --- a/tests/rptest/tests/redpanda_test.py +++ b/tests/rptest/tests/redpanda_test.py @@ -34,7 +34,6 @@ def __init__(self, enable_pp=False, enable_sr=False, si_settings=None, - enable_installer=False, **kwargs): """ Any trailing keyword arguments are passed through to the @@ -43,7 +42,6 @@ def __init__(self, super(RedpandaTest, self).__init__(test_context) self.scale = Scale(test_context) self.si_settings = si_settings - self.enable_installer = enable_installer if num_brokers is None: # Default to a 3 node cluster if sufficient nodes are available, else @@ -65,7 +63,6 @@ def __init__(self, enable_pp=enable_pp, enable_sr=enable_sr, si_settings=self.si_settings, - enable_installer=enable_installer, **kwargs) self._client = DefaultClient(self.redpanda) diff --git a/tests/rptest/tests/rpk_cluster_test.py b/tests/rptest/tests/rpk_cluster_test.py index 4df7b9be69cca..a2d6f2ea30a8b 100644 --- a/tests/rptest/tests/rpk_cluster_test.py +++ b/tests/rptest/tests/rpk_cluster_test.py @@ -9,11 +9,14 @@ import os import re +import datetime +import tempfile import zipfile import json from rptest.services.cluster import cluster from rptest.services.redpanda import RESTART_LOG_ALLOW_LIST +from rptest.util import expect_exception, get_cluster_license from ducktape.utils.util import wait_until from rptest.tests.redpanda_test import RedpandaTest @@ -184,3 +187,76 @@ def test_cluster_down(self): pass else: assert False, f"Unexpected success: '{r}'" + + @cluster(num_nodes=3) + def 
test_upload_and_query_cluster_license_rpk(self): + """ + Test uploading and retrieval of license via rpk + using --path option + """ + license = get_cluster_license() + if license is None: + self.logger.info( + "Skipping test, REDPANDA_SAMPLE_LICENSE env var not found") + return + + with tempfile.NamedTemporaryFile() as tf: + tf.write(bytes(license, 'UTF-8')) + tf.seek(0) + output = self._rpk.license_set(tf.name) + assert "Successfully uploaded license" in output + + def get_license(): + output = self._rpk.license_info() + resp = json.loads(output) + if resp['org'] == "redpanda-testing": + return True + + return False + + wait_until(get_license, + timeout_sec=10, + backoff_sec=1, + retry_on_exc=True, + err_msg="unable to retrieve license information") + + expected_license = { + 'expires': + (datetime.date(2122, 6, 6) - datetime.date.today()).days, + 'format_version': 0, + 'org': 'redpanda-testing', + 'type': 'enterprise' + } + output = self._rpk.license_info() + assert expected_license == json.loads(output) + + @cluster(num_nodes=3) + def test_upload_cluster_license_rpk(self): + """ + Test uploading of license via rpk + using inline license option + """ + license = get_cluster_license() + if license is None: + self.logger.info( + "Skipping test, REDPANDA_SAMPLE_LICENSE env var not found") + return + + output = self._rpk.license_set("", license) + assert "Successfully uploaded license" in output + + @cluster(num_nodes=3) + def test_upload_cluster_license_error(self): + with expect_exception(RpkException, + lambda e: "Internal Server Error" in str(e)): + license = get_cluster_license() + if license is None: + self.logger.info( + "Skipping test, REDPANDA_SAMPLE_LICENSE env var not found") + return + + with tempfile.NamedTemporaryFile() as tf: + tf.write(bytes(license + 'r', 'UTF-8')) + tf.seek(0) + + self._rpk.license_set(tf.name) diff --git a/tests/rptest/tests/rpk_config_test.py b/tests/rptest/tests/rpk_config_test.py index 18047df74fa51..ab42007b1431d 100644 --- 
a/tests/rptest/tests/rpk_config_test.py +++ b/tests/rptest/tests/rpk_config_test.py @@ -53,15 +53,9 @@ def test_config_init(self): port: 9644 developer_mode: true rpk: - admin_api: - addresses: - - 127.0.0.1:9644 coredump_dir: /var/lib/redpanda/coredump enable_memory_locking: false enable_usage_stats: false - kafka_api: - brokers: - - 0.0.0.0:9092 overprovisioned: false tune_aio_events: false tune_ballast_file: false @@ -182,9 +176,6 @@ def test_config_set_json(self): rpk.config_set(key, value, format='json') expected_config = yaml.full_load(''' -admin_api: - addresses: - - 127.0.0.1:9644 coredump_dir: /var/lib/redpanda/coredump enable_memory_locking: false enable_usage_stats: false @@ -210,12 +201,6 @@ def test_config_set_json(self): with open(os.path.join(d, 'redpanda.yaml')) as f: actual_config = yaml.full_load(f.read()) - assert actual_config['rpk']['kafka_api'] is not None - - # Delete 'kafka_api' so they can be compared since the - # brokers change depending on the container it's running - del actual_config['rpk']['kafka_api'] - if actual_config['rpk'] != expected_config: self.logger.error("Configs differ") self.logger.error( @@ -234,3 +219,46 @@ def test_config_change_then_restart_node(self): rpk.config_set(key, value) self.redpanda.restart_nodes(node) + + @cluster(num_nodes=1) + def test_config_change_mode_prod(self): + """ + Verify that after running rpk redpanda mode prod, the + configuration values of the tuners change accordingly. 
+ """ + node = self.redpanda.nodes[0] + rpk = RpkRemoteTool(self.redpanda, node) + rpk.mode_set("prod") + expected_config = yaml.full_load(''' + enable_usage_stats: false + tune_network: true + tune_disk_scheduler: true + tune_disk_nomerges: true + tune_disk_write_cache: true + tune_disk_irq: true + tune_fstrim: false + tune_cpu: true + tune_aio_events: true + tune_clocksource: true + tune_swappiness: true + tune_transparent_hugepages: false + enable_memory_locking: false + tune_coredump: false + coredump_dir: /var/lib/redpanda/coredump + tune_ballast_file: true + overprovisioned: false +''') + with tempfile.TemporaryDirectory() as d: + node.account.copy_from(RedpandaService.NODE_CONFIG_FILE, d) + + with open(os.path.join(d, 'redpanda.yaml')) as f: + actual_config = yaml.full_load(f.read()) + + if actual_config['rpk'] != expected_config: + self.logger.error("Configs differ") + self.logger.error( + f"Expected: {yaml.dump(expected_config)}") + self.logger.error( + f"Actual: {yaml.dump(actual_config['rpk'])}") + assert actual_config['rpk'] == expected_config + assert actual_config['redpanda']['developer_mode'] == False diff --git a/tests/rptest/tests/rpk_tuner_test.py b/tests/rptest/tests/rpk_tuner_test.py new file mode 100644 index 0000000000000..b0d40c197eaa1 --- /dev/null +++ b/tests/rptest/tests/rpk_tuner_test.py @@ -0,0 +1,88 @@ +# Copyright 2022 Redpanda Data, Inc. 
+# +# Use of this software is governed by the Business Source License +# included in the file licenses/BSL.md +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0 + +from rptest.services.cluster import cluster +from rptest.tests.redpanda_test import RedpandaTest +from rptest.clients.rpk_remote import RpkRemoteTool + + +class RpkTunerTest(RedpandaTest): + def __init__(self, ctx): + super(RpkTunerTest, self).__init__(test_context=ctx) + self._ctx = ctx + + @cluster(num_nodes=1) + def test_tune_prod_all(self): + """ + Test will set production mode and execute rpk redpanda tune all, + we expect the command to exit with 1 if an error happens. + """ + node = self.redpanda.nodes[0] + rpk = RpkRemoteTool(self.redpanda, node) + rpk.mode_set("prod") + + rpk.tune("all") + + @cluster(num_nodes=1) + def test_tune_fstrim(self): + """ + Validate fstrim tuner execution, + fstrim was disabled in production mode https://github.com/redpanda-data/redpanda/issues/3068 + """ + node = self.redpanda.nodes[0] + rpk = RpkRemoteTool(self.redpanda, node) + rpk.config_set('rpk.tune_fstrim', 'true') + + rpk.tune("fstrim") + + @cluster(num_nodes=1) + def test_tune_transparent_hugepages(self): + """ + Validate transparent hugepage tuner execution. + THP tuner is disabled in production mode + """ + node = self.redpanda.nodes[0] + rpk = RpkRemoteTool(self.redpanda, node) + rpk.config_set('rpk.tune_transparent_hugepages', 'true') + + rpk.tune("transparent_hugepages") + + @cluster(num_nodes=1) + def test_tune_list(self): + """ + Forward compatible test, the purpose is to check if available + tuners match our current setup, if a new tuner gets added we + will catch it here. 
+ """ + node = self.redpanda.nodes[0] + rpk = RpkRemoteTool(self.redpanda, node) + # Set all tuners: + rpk.mode_set("prod") + rpk.config_set('rpk.tune_fstrim', 'true') + rpk.config_set('rpk.tune_transparent_hugepages', 'true') + rpk.config_set('rpk.tune_coredump', 'true') + + expected = '''TUNER ENABLED SUPPORTED UNSUPPORTED-REASON +aio_events true true +ballast_file true true +clocksource true true +coredump true true +cpu true true +disk_irq true true +disk_nomerges true true +disk_scheduler true true +disk_write_cache true false Disk write cache tuner is only supported in GCP +fstrim true true +net true true +swappiness true true +transparent_hugepages true true +''' + output = rpk.tune("list") + + assert output == expected diff --git a/tests/rptest/tests/scaling_up_test.py b/tests/rptest/tests/scaling_up_test.py index d18a02c30d0fb..624843363df2e 100644 --- a/tests/rptest/tests/scaling_up_test.py +++ b/tests/rptest/tests/scaling_up_test.py @@ -23,8 +23,14 @@ class ScalingUpTest(EndToEndTest): """ @cluster(num_nodes=5) def test_adding_nodes_to_cluster(self): - self.redpanda = RedpandaService( - self.test_context, 3, extra_rp_conf={"group_topic_partitions": 1}) + self.redpanda = RedpandaService(self.test_context, + 3, + extra_rp_conf={ + "group_topic_partitions": + 1, + "partition_autobalancing_mode": + "node_add_remove" + }) # start single node cluster self.redpanda.start(nodes=[self.redpanda.nodes[0]]) # create some topics diff --git a/tests/rptest/tests/scram_test.py b/tests/rptest/tests/scram_test.py index 55bd61538acc4..d0b7603b62fe9 100644 --- a/tests/rptest/tests/scram_test.py +++ b/tests/rptest/tests/scram_test.py @@ -10,16 +10,19 @@ import socket import string import requests +from requests.exceptions import HTTPError import time from ducktape.mark import parametrize +from ducktape.utils.util import wait_until from rptest.services.cluster import cluster from rptest.tests.redpanda_test import RedpandaTest from rptest.clients.types import TopicSpec from 
rptest.clients.python_librdkafka import PythonLibrdkafka from rptest.services.admin import Admin -from rptest.services.redpanda import SecurityConfig +from rptest.services.redpanda import SecurityConfig, SaslCredentials, SecurityConfig +from rptest.util import expect_http_error class ScramTest(RedpandaTest): @@ -299,3 +302,90 @@ def test_enable_sasl_live(self): # An unauthenticated client should be accepted again assert len(unauthenticated_client.topics()) == 1 + + +class ScramBootstrapUserTest(RedpandaTest): + BOOTSTRAP_USERNAME = 'bob' + BOOTSTRAP_PASSWORD = 'sekrit' + + def __init__(self, *args, **kwargs): + # Configure the cluster as a user might configure it for secure + # bootstrap: i.e. all auth turned on from moment of creation. + + security_config = SecurityConfig() + security_config.enable_sasl = True + + super().__init__( + *args, + environment={ + 'RP_BOOTSTRAP_USER': + f'{self.BOOTSTRAP_USERNAME}:{self.BOOTSTRAP_PASSWORD}' + }, + extra_rp_conf={ + 'enable_sasl': True, + 'admin_api_require_auth': True, + 'superusers': ['bob'] + }, + security=security_config, + superuser=SaslCredentials(self.BOOTSTRAP_USERNAME, + self.BOOTSTRAP_PASSWORD, + "SCRAM-SHA-256"), + **kwargs) + + def _check_http_status_everywhere(self, expect_status, callable): + """ + Check that the callback results in an HTTP error with the + given status code from all nodes in the cluster. This enables + checking that auth state has propagated as expected. 
+ + :returns: true if all nodes throw an error with the expected status code + """ + + for n in self.redpanda.nodes: + try: + callable(n) + except HTTPError as e: + if e.response.status_code != expect_status: + return False + else: + return False + + return True + + @cluster(num_nodes=3) + def test_bootstrap_user(self): + # Anonymous access should be refused + admin = Admin(self.redpanda) + with expect_http_error(403): + admin.list_users() + + # Access using the bootstrap credentials should succeed + admin = Admin(self.redpanda, + auth=(self.BOOTSTRAP_USERNAME, self.BOOTSTRAP_PASSWORD)) + assert self.BOOTSTRAP_USERNAME in admin.list_users() + + # Modify the bootstrap user's credential + admin.update_user(self.BOOTSTRAP_USERNAME, "newpassword", + "SCRAM-SHA-256") + + # Getting 401 with old credentials everywhere will show that the + # credential update has propagated to all nodes + wait_until(lambda: self._check_http_status_everywhere( + 401, lambda n: admin.list_users(node=n)), + timeout_sec=10, + backoff_sec=0.5) + + # Using old password should fail + with expect_http_error(401): + admin.list_users() + + # Using new credential should succeed + admin = Admin(self.redpanda, + auth=(self.BOOTSTRAP_USERNAME, 'newpassword')) + admin.list_users() + + # Modified credential should survive a restart: this verifies that + # the RP_BOOTSTRAP_USER setting does not fight with changes made + # by other means. 
+ self.redpanda.restart_nodes(self.redpanda.nodes) + admin.list_users() diff --git a/tests/rptest/tests/upgrade_test.py b/tests/rptest/tests/upgrade_test.py index e5da152eeb6fb..5da4b85973638 100644 --- a/tests/rptest/tests/upgrade_test.py +++ b/tests/rptest/tests/upgrade_test.py @@ -14,25 +14,7 @@ from rptest.tests.redpanda_test import RedpandaTest from rptest.services.cluster import cluster from rptest.services.redpanda import RESTART_LOG_ALLOW_LIST -from rptest.services.redpanda_installer import RedpandaInstaller - - -def wait_for_num_versions(redpanda, num_versions): - def get_unique_versions(): - node = redpanda.nodes[0] - brokers_list = \ - str(node.account.ssh_output(f"{redpanda.find_binary('rpk')} redpanda admin brokers list")) - redpanda.logger.debug(brokers_list) - version_re = re.compile("v\\d+\\.\\d+\\.\\d+") - return set(version_re.findall(brokers_list)) - - # NOTE: allow retries, as the version may not be available immediately - # following a restart. - wait_until(lambda: len(get_unique_versions()) == num_versions, - timeout_sec=30) - unique_versions = get_unique_versions() - assert len(unique_versions) == num_versions, unique_versions - return unique_versions +from rptest.services.redpanda_installer import RedpandaInstaller, wait_for_num_versions class UpgradeFromSpecificVersion(RedpandaTest): @@ -41,9 +23,7 @@ class UpgradeFromSpecificVersion(RedpandaTest): """ def __init__(self, test_context): super(UpgradeFromSpecificVersion, - self).__init__(test_context=test_context, - num_brokers=3, - enable_installer=True) + self).__init__(test_context=test_context, num_brokers=3) self.installer = self.redpanda._installer def setUp(self): @@ -87,9 +67,7 @@ class UpgradeFromPriorFeatureVersionTest(RedpandaTest): """ def __init__(self, test_context): super(UpgradeFromPriorFeatureVersionTest, - self).__init__(test_context=test_context, - num_brokers=1, - enable_installer=True) + self).__init__(test_context=test_context, num_brokers=1) self.installer = 
self.redpanda._installer def setUp(self): diff --git a/tests/rptest/util.py b/tests/rptest/util.py index 0049cda0cc86c..5dc2cca9c34f5 100644 --- a/tests/rptest/util.py +++ b/tests/rptest/util.py @@ -215,6 +215,18 @@ def inject_remote_script(node, script_name): return remote_path +def get_cluster_license(): + license = os.environ.get("REDPANDA_SAMPLE_LICENSE", None) + if license is None: + is_ci = os.environ.get("CI", "false") + if is_ci == "true": + raise RuntimeError( + "Expected REDPANDA_SAMPLE_LICENSE variable to be set in this environment" + ) + + return license + + class firewall_blocked: """Temporary firewall barrier that isolates set of redpanda nodes from the ip-address""" diff --git a/tools/rpcgen.py b/tools/rpcgen.py index f932169e2f5cf..e604604e529e0 100755 --- a/tools/rpcgen.py +++ b/tools/rpcgen.py @@ -54,7 +54,8 @@ namespace {{namespace}} { -class {{service_name}}_service : public rpc::service { +template +class {{service_name}}_service_base : public rpc::service { public: class failure_probes; @@ -62,21 +63,21 @@ class failure_probes; static constexpr uint32_t {{method.name}}_method_id = {{method.id}}; {%- endfor %} - {{service_name}}_service(ss::scheduling_group sc, ss::smp_service_group ssg) + {{service_name}}_service_base(ss::scheduling_group sc, ss::smp_service_group ssg) : _sc(sc), _ssg(ssg) {} - {{service_name}}_service({{service_name}}_service&& o) noexcept + {{service_name}}_service_base({{service_name}}_service_base&& o) noexcept : _sc(std::move(o._sc)), _ssg(std::move(o._ssg)), _methods(std::move(o._methods)) {} - {{service_name}}_service& operator=({{service_name}}_service&& o) noexcept { + {{service_name}}_service_base& operator=({{service_name}}_service_base&& o) noexcept { if(this != &o){ - this->~{{service_name}}_service(); - new (this) {{service_name}}_service(std::move(o)); + this->~{{service_name}}_service_base(); + new (this) {{service_name}}_service_base(std::move(o)); } return *this; } - virtual ~{{service_name}}_service() 
noexcept = default; + virtual ~{{service_name}}_service_base() noexcept = default; void setup_metrics() final { namespace sm = ss::metrics; @@ -124,7 +125,8 @@ class failure_probes; virtual ss::future raw_{{method.name}}(ss::input_stream& in, rpc::streaming_context& ctx) { return execution_helper<{{method.input_type}}, - {{method.output_type}}>::exec(in, ctx, {{method.id}}, + {{method.output_type}}, + Codec>::exec(in, ctx, {{method.id}}, [this]( {{method.input_type}}&& t, rpc::streaming_context& ctx) -> ss::future<{{method.output_type}}> { return {{method.name}}(std::move(t), ctx); @@ -166,7 +168,8 @@ class {{service_name}}_client_protocol { rpc::transport& _transport; }; -class {{service_name}}_service::failure_probes final : public finjector::probe { +template +class {{service_name}}_service_base::failure_probes final : public finjector::probe { public: using type = uint32_t; @@ -221,6 +224,8 @@ class {{service_name}}_service::failure_probes final : public finjector::probe { fast_prng _prng; }; +using {{service_name}}_service = {{service_name}}_service_base; + } // namespace """