Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rpk: introduce cluster partitions balancer-status #5798

Merged
merged 1 commit into from
Aug 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions src/go/rpk/pkg/api/admin/api_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,45 @@ type ClusterHealthOverview struct {
LeaderlessPartitions []string `json:"leaderless_partitions"`
}

// PartitionBalancerStatus is the status of the partition auto balancer.
type PartitionBalancerStatus struct {
// Status is either off, ready, starting, in_progress or stalled.
//
// off: The balancer is disabled.
// ready: The balancer is active but there is nothing to do.
// starting: The balancer is starting but has not run yet.
// in_progress: The balancer is active and is in the process of
// scheduling partition movements.
// stalled: There are some violations, but for some reason, the
// balancer cannot make progress in mitigating them.
Status string `json:"status,omitempty"`
// Violations are the partition balancer violations.
Violations PartitionBalancerViolations `json:"violations,omitempty"`
// SecondsSinceLastTick is the last time the partition balancer ran.
SecondsSinceLastTick int `json:"seconds_since_last_tick,omitempty"`
// CurrentReassignmentsCount is the current number of partition
// reassignments in progress.
CurrentReassignmentsCount int `json:"current_reassignments_count,omitempty"`
}

// PartitionBalancerViolations describe the violations of the partition
// auto balancer.
type PartitionBalancerViolations struct {
// UnavailableNodes are the nodes that have been unavailable after a time
// set by 'partition_autobalancing_node_availability_timeout_sec' property.
UnavailableNodes []int `json:"unavailable_nodes,omitempty"`
// OverDiskLimitNodes are the nodes that surpassed the threshold of used
// disk percentage set by 'partition_autobalancing_max_disk_usage_percent'
// property.
OverDiskLimitNodes []int `json:"over_disk_limit_nodes,omitempty"`
}

func (a *AdminAPI) GetHealthOverview(ctx context.Context) (ClusterHealthOverview, error) {
var response ClusterHealthOverview
return response, a.sendAny(ctx, http.MethodGet, "/v1/cluster/health_overview", nil, &response)
}

func (a *AdminAPI) GetPartitionStatus(ctx context.Context) (PartitionBalancerStatus, error) {
var response PartitionBalancerStatus
return response, a.sendAny(ctx, http.MethodGet, "/v1/cluster/partition_balancer/status", nil, &response)
}
2 changes: 2 additions & 0 deletions src/go/rpk/pkg/cli/cmd/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster/config"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster/license"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster/maintenance"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster/partitions"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/common"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/group"
"github.com/spf13/afero"
Expand Down Expand Up @@ -61,6 +62,7 @@ func NewClusterCommand(fs afero.Fs) *cobra.Command {
config.NewConfigCommand(fs),
license.NewLicenseCommand(fs),
maintenance.NewMaintenanceCommand(fs),
partitions.NewPartitionsCommand(fs),
offsets,
)

Expand Down
43 changes: 43 additions & 0 deletions src/go/rpk/pkg/cli/cmd/cluster/partitions/partitions.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package partitions

import (
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/common"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/config"
"github.com/spf13/afero"
"github.com/spf13/cobra"
)

func NewPartitionsCommand(fs afero.Fs) *cobra.Command {
var (
adminURL string
adminEnableTLS bool
adminCertFile string
adminKeyFile string
adminCAFile string
)

cmd := &cobra.Command{
Use: "partitions",
Args: cobra.ExactArgs(0),
Short: "Manage cluster partitions",
}

common.AddAdminAPITLSFlags(cmd,
&adminEnableTLS,
&adminCertFile,
&adminKeyFile,
&adminCAFile,
)

cmd.AddCommand(
NewBalancerStatusCommand(fs),
)

cmd.PersistentFlags().StringVar(
&adminURL,
config.FlagAdminHosts2,
"",
"Comma-separated list of admin API addresses (<IP>:<port>)")

return cmd
}
102 changes: 102 additions & 0 deletions src/go/rpk/pkg/cli/cmd/cluster/partitions/status.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package partitions

import (
"fmt"

"github.com/redpanda-data/redpanda/src/go/rpk/pkg/api/admin"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/config"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/out"
"github.com/spf13/afero"
"github.com/spf13/cobra"
)

func NewBalancerStatusCommand(fs afero.Fs) *cobra.Command {
cmd := &cobra.Command{
Use: "balancer-status",
Short: "Queries cluster for partition balancer status",
Long: `Queries cluster for partition balancer status:
If continuous partition balancing is enabled, redpanda will continuously
r-vasquez marked this conversation as resolved.
Show resolved Hide resolved
reassign partitions from both unavailable nodes and from nodes using more disk
space than the configured limit.
This command can be used to monitor the partition balancer status.
FIELDS
Status: Either off, ready, starting, in progress, or
stalled.
Seconds Since Last Tick: The last time the partition balancer ran.
Current Reassignments Count: Current number of partition reassignments in
progress.
Unavailable Nodes: The nodes that have been unavailable after a
time set by the
"partition_autobalancing_node_availability_timeout_sec"
cluster property.
Over Disk Limit Nodes: The nodes that surpassed the threshold of
used disk percentage specified in the
"partition_autobalancing_max_disk_usage_percent"
cluster property.
BALANCER STATUS
off: The balancer is disabled.
ready: The balancer is active but there is nothing to do.
starting: The balancer is starting but has not run yet.
in_progress: The balancer is active and is in the process of scheduling
partition movements.
stalled: Violations have been detected and the balancer cannot correct
them.
STALLED BALANCER
A stalled balancer can occur for a few reasons and requires a bit of manual
investigation. A few areas to investigate:
* Are there are enough healthy nodes to which to move partitions? For example,
in a three node cluster, no movements are possible for partitions with three
replicas. You will see a stall every time there is a violation.
* Does the cluster have sufficient space? If all nodes in the cluster are
utilizing more than 80% of their disk space, rebalancing cannot proceed.
* Do all partitions have quorum? If the majority of a partition's replicas are
down, the partition cannot be moved.
* Are any nodes in maintenance mode? Partitions are not moved if any node is in
maintenance mode.
`,
Args: cobra.ExactArgs(0),
Run: func(cmd *cobra.Command, _ []string) {
p := config.ParamsFromCommand(cmd)
cfg, err := p.Load(fs)
out.MaybeDie(err, "unable to load config: %v", err)

cl, err := admin.NewClient(fs, cfg)
out.MaybeDie(err, "unable to initialize admin client: %v", err)

status, err := cl.GetPartitionStatus(cmd.Context())
out.MaybeDie(err, "unable to request balancer status: %v", err)

printBalancerStatus(status)
},
}

return cmd
}

func printBalancerStatus(pbs admin.PartitionBalancerStatus) {
const format = `Status: %v
Seconds Since Last Tick: %v
Current Reassignment Count: %v
`
fmt.Printf(format, pbs.Status, pbs.SecondsSinceLastTick, pbs.CurrentReassignmentsCount)

v := pbs.Violations
if len(v.OverDiskLimitNodes) > 0 || len(v.UnavailableNodes) > 0 {
const vFormat = `Unavailable Nodes: %v
Over Disk Limit Nodes: %v
`
fmt.Printf(vFormat, v.UnavailableNodes, v.OverDiskLimitNodes)
}
}
2 changes: 1 addition & 1 deletion src/go/rpk/pkg/cli/cmd/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ func AddKafkaFlags(
"brokers",
[]string{},
"Comma-separated list of broker ip:port pairs (e.g."+
" --brokers '192.168.78.34:9092,192.168.78.35:9092,192.179.23.54:9092' )."+
" --brokers '192.168.78.34:9092,192.168.78.35:9092,192.179.23.54:9092')."+
" Alternatively, you may set the REDPANDA_BROKERS environment"+
" variable with the comma-separated list of broker addresses",
)
Expand Down