diff --git a/src/go/rpk/pkg/api/admin/api_cluster.go b/src/go/rpk/pkg/api/admin/api_cluster.go index 130184c231cd1..96f5d660352be 100644 --- a/src/go/rpk/pkg/api/admin/api_cluster.go +++ b/src/go/rpk/pkg/api/admin/api_cluster.go @@ -23,7 +23,45 @@ type ClusterHealthOverview struct { LeaderlessPartitions []string `json:"leaderless_partitions"` } +// PartitionBalancerStatus is the status of the partition auto balancer. +type PartitionBalancerStatus struct { + // Status is either off, ready, starting, in_progress or stalled. + // + // off: The balancer is disabled. + // ready: The balancer is active but there is nothing to do. + // starting: The balancer is starting but has not run yet. + // in_progress: The balancer is active and is in the process of + // scheduling partition movements. + // stalled: There are some violations, but for some reason, the + // balancer cannot make progress in mitigating them. + Status string `json:"status,omitempty"` + // Violations are the partition balancer violations. + Violations PartitionBalancerViolations `json:"violations,omitempty"` + // SecondsSinceLastTick is the last time the partition balancer ran. + SecondsSinceLastTick int `json:"seconds_since_last_tick,omitempty"` + // CurrentReassignmentsCount is the current number of partition + // reassignments in progress. + CurrentReassignmentsCount int `json:"current_reassignments_count,omitempty"` +} + +// PartitionBalancerViolations describe the violations of the partition +// auto balancer. +type PartitionBalancerViolations struct { + // UnavailableNodes are the nodes that have been unavailable after a time + // set by 'partition_autobalancing_node_availability_timeout_sec' property. + UnavailableNodes []int `json:"unavailable_nodes,omitempty"` + // OverDiskLimitNodes are the nodes that surpassed the threshold of used + // disk percentage set by 'partition_autobalancing_max_disk_usage_percent' + // property. + OverDiskLimitNodes []int `json:"over_disk_limit_nodes,omitempty"` +} + func (a *AdminAPI) GetHealthOverview(ctx context.Context) (ClusterHealthOverview, error) { var response ClusterHealthOverview return response, a.sendAny(ctx, http.MethodGet, "/v1/cluster/health_overview", nil, &response) } + +func (a *AdminAPI) GetPartitionStatus(ctx context.Context) (PartitionBalancerStatus, error) { + var response PartitionBalancerStatus + return response, a.sendAny(ctx, http.MethodGet, "/v1/cluster/partition_balancer/status", nil, &response) +} diff --git a/src/go/rpk/pkg/cli/cmd/cluster.go b/src/go/rpk/pkg/cli/cmd/cluster.go index 6f221aa024eba..d4b679cf40a87 100644 --- a/src/go/rpk/pkg/cli/cmd/cluster.go +++ b/src/go/rpk/pkg/cli/cmd/cluster.go @@ -14,6 +14,7 @@ import ( "github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster/config" "github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster/license" "github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster/maintenance" + "github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster/partitions" "github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/common" "github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/group" "github.com/spf13/afero" @@ -61,6 +62,7 @@ func NewClusterCommand(fs afero.Fs) *cobra.Command { config.NewConfigCommand(fs), license.NewLicenseCommand(fs), maintenance.NewMaintenanceCommand(fs), + partitions.NewPartitionsCommand(fs), offsets, ) diff --git a/src/go/rpk/pkg/cli/cmd/cluster/partitions/partitions.go b/src/go/rpk/pkg/cli/cmd/cluster/partitions/partitions.go new file mode 100644 index 0000000000000..e1437a5fb383c --- /dev/null +++ b/src/go/rpk/pkg/cli/cmd/cluster/partitions/partitions.go @@ -0,0 +1,43 @@ +package partitions + +import ( + "github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/common" + "github.com/redpanda-data/redpanda/src/go/rpk/pkg/config" + "github.com/spf13/afero" + "github.com/spf13/cobra" +) + +func NewPartitionsCommand(fs afero.Fs) *cobra.Command { + var ( + adminURL string + adminEnableTLS bool + adminCertFile string + adminKeyFile string + adminCAFile string + ) + + cmd := &cobra.Command{ + Use: "partitions", + Args: cobra.ExactArgs(0), + Short: "Manage cluster partitions", + } + + common.AddAdminAPITLSFlags(cmd, + &adminEnableTLS, + &adminCertFile, + &adminKeyFile, + &adminCAFile, + ) + + cmd.AddCommand( + NewBalancerStatusCommand(fs), + ) + + cmd.PersistentFlags().StringVar( + &adminURL, + config.FlagAdminHosts2, + "", + "Comma-separated list of admin API addresses (:)") + + return cmd +} diff --git a/src/go/rpk/pkg/cli/cmd/cluster/partitions/status.go b/src/go/rpk/pkg/cli/cmd/cluster/partitions/status.go new file mode 100644 index 0000000000000..5fc0fec5dee02 --- /dev/null +++ b/src/go/rpk/pkg/cli/cmd/cluster/partitions/status.go @@ -0,0 +1,88 @@ +package partitions + +import ( + "fmt" + + "github.com/redpanda-data/redpanda/src/go/rpk/pkg/api/admin" + "github.com/redpanda-data/redpanda/src/go/rpk/pkg/config" + "github.com/redpanda-data/redpanda/src/go/rpk/pkg/out" + "github.com/spf13/afero" + "github.com/spf13/cobra" +) + +func NewBalancerStatusCommand(fs afero.Fs) *cobra.Command { + cmd := &cobra.Command{ + Use: "balancer-status", + Short: "Queries cluster for partition balancer status", + Long: `Queries cluster for partition balancer status: + +If continuous partition balancing is enabled, redpanda will continuously +reassign partitions from both unavailable nodes and from nodes using more disk +space than the configured limit. + +This command can be used to monitor the partition balancer status. + +FIELDS + + Status: Either off, ready, starting, in progress, or stalled. + Seconds Since Last Tick: The last time the partition balancer ran. + Current Reassignments Count: Current number of partition reassignments in progress. + Unavailable Nodes: The nodes that have been unavailable after a time set + by the "partition_autobalancing_node_availability_timeout_sec" + cluster property. + Over Disk Limit Nodes: The nodes that surpassed the threshold of used disk + percentage specified in the "partition_autobalancing_max_disk_usage_percent" + cluster property. + +BALANCER STATUS + + off: The balancer is disabled. + ready: The balancer is active but there is nothing to do. + starting: The balancer is starting but has not run yet. + in_progress: The balancer is active and is in the process of scheduling partition movements. + stalled: Violations have been detected and the balancer cannot correct them. Check the following: + +- Are there are enough healthy nodes to which to move partitions? For example, + in a 3-node cluster no movements are possible, so you will see a stall every + time there is a violation. +- Does the cluster have sufficient space? if there are no nodes in the cluster + with less than 80% of used disk space, rebalancing cannot proceed. +- Do all partitions have quorum? If two of the three partition replicas are down, + this partition cannot be moved. +- Are any nodes in maintenance mode? Partitions are not moved if a node is in + maintenance mode. +`, + Args: cobra.ExactArgs(0), + Run: func(cmd *cobra.Command, _ []string) { + p := config.ParamsFromCommand(cmd) + cfg, err := p.Load(fs) + out.MaybeDie(err, "unable to load config: %v", err) + + cl, err := admin.NewClient(fs, cfg) + out.MaybeDie(err, "unable to initialize admin client: %v", err) + + status, err := cl.GetPartitionStatus(cmd.Context()) + out.MaybeDie(err, "unable to request balancer status: %v", err) + + printBalancerStatus(status) + }, + } + + return cmd +} + +func printBalancerStatus(pbs admin.PartitionBalancerStatus) { + const format = `Status: %v +Seconds Since Last Tick: %v +Current Reassignment Count: %v +` + fmt.Printf(format, pbs.Status, pbs.SecondsSinceLastTick, pbs.CurrentReassignmentsCount) + + v := pbs.Violations + if len(v.OverDiskLimitNodes) > 0 || len(v.UnavailableNodes) > 0 { + const vFormat = `Unavailable Nodes: %v +Over Disk Limit Nodes: %v +` + fmt.Printf(vFormat, v.UnavailableNodes, v.OverDiskLimitNodes) + } +} diff --git a/src/go/rpk/pkg/cli/cmd/common/common.go b/src/go/rpk/pkg/cli/cmd/common/common.go index 6c1cbf7964cd0..f7648b50fad26 100644 --- a/src/go/rpk/pkg/cli/cmd/common/common.go +++ b/src/go/rpk/pkg/cli/cmd/common/common.go @@ -43,7 +43,7 @@ func AddKafkaFlags( "brokers", []string{}, "Comma-separated list of broker ip:port pairs (e.g."+ - " --brokers '192.168.78.34:9092,192.168.78.35:9092,192.179.23.54:9092' )."+ + " --brokers '192.168.78.34:9092,192.168.78.35:9092,192.179.23.54:9092')."+ " Alternatively, you may set the REDPANDA_BROKERS environment"+ " variable with the comma-separated list of broker addresses", )