Skip to content

Commit

Permalink
rpk: introduce cluster partitions balancer-status
Browse files Browse the repository at this point in the history
This command will query the cluster via admin api
to retrieve information about the partition auto
balancer.
  • Loading branch information
r-vasquez committed Aug 9, 2022
1 parent 59faad6 commit 7ada01c
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 1 deletion.
38 changes: 38 additions & 0 deletions src/go/rpk/pkg/api/admin/api_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,45 @@ type ClusterHealthOverview struct {
LeaderlessPartitions []string `json:"leaderless_partitions"`
}

// PartitionBalancerStatus is the status of the partition auto balancer.
type PartitionBalancerStatus struct {
// Status is either off, ready, starting, in_progress or stalled.
//
// off: The balancer is disabled.
// ready: The balancer is active but there is nothing to do.
// starting: The balancer is starting but has not run yet.
// in_progress: The balancer is active and is in the process of
// scheduling partition movements.
// stalled: There are some violations, but for some reason, the
// balancer cannot make progress in mitigating them.
Status string `json:"status,omitempty"`
// Violations are the partition balancer violations.
Violations PartitionBalancerViolations `json:"violations,omitempty"`
// SecondsSinceLastTick is the last time the partition balancer ran.
SecondsSinceLastTick int `json:"seconds_since_last_tick,omitempty"`
// CurrentReassignmentsCount is the current number of partition
// reassignments in progress.
CurrentReassignmentsCount int `json:"current_reassignments_count,omitempty"`
}

// PartitionBalancerViolations describe the violations of the partition
// auto balancer.
type PartitionBalancerViolations struct {
// UnavailableNodes are the nodes that have been unavailable after a time
// set by 'partition_autobalancing_node_availability_timeout_sec' property.
UnavailableNodes []int `json:"unavailable_nodes,omitempty"`
// OverDiskLimitNodes are the nodes that surpassed the threshold of used
// disk percentage set by 'partition_autobalancing_max_disk_usage_percent'
// property.
OverDiskLimitNodes []int `json:"over_disk_limit_nodes,omitempty"`
}

func (a *AdminAPI) GetHealthOverview(ctx context.Context) (ClusterHealthOverview, error) {
var response ClusterHealthOverview
return response, a.sendAny(ctx, http.MethodGet, "/v1/cluster/health_overview", nil, &response)
}

func (a *AdminAPI) GetPartitionStatus(ctx context.Context) (PartitionBalancerStatus, error) {
var response PartitionBalancerStatus
return response, a.sendAny(ctx, http.MethodGet, "/v1/cluster/partition_balancer/status", nil, &response)
}
2 changes: 2 additions & 0 deletions src/go/rpk/pkg/cli/cmd/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster/config"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster/license"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster/maintenance"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster/partitions"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/common"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/group"
"github.com/spf13/afero"
Expand Down Expand Up @@ -61,6 +62,7 @@ func NewClusterCommand(fs afero.Fs) *cobra.Command {
config.NewConfigCommand(fs),
license.NewLicenseCommand(fs),
maintenance.NewMaintenanceCommand(fs),
partitions.NewPartitionsCommand(fs),
offsets,
)

Expand Down
43 changes: 43 additions & 0 deletions src/go/rpk/pkg/cli/cmd/cluster/partitions/partitions.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package partitions

import (
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/common"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/config"
"github.com/spf13/afero"
"github.com/spf13/cobra"
)

func NewPartitionsCommand(fs afero.Fs) *cobra.Command {
var (
adminURL string
adminEnableTLS bool
adminCertFile string
adminKeyFile string
adminCAFile string
)

cmd := &cobra.Command{
Use: "partitions",
Args: cobra.ExactArgs(0),
Short: "Manage cluster partitions",
}

common.AddAdminAPITLSFlags(cmd,
&adminEnableTLS,
&adminCertFile,
&adminKeyFile,
&adminCAFile,
)

cmd.AddCommand(
NewBalancerStatusCommand(fs),
)

cmd.PersistentFlags().StringVar(
&adminURL,
config.FlagAdminHosts2,
"",
"Comma-separated list of admin API addresses (<IP>:<port>)")

return cmd
}
102 changes: 102 additions & 0 deletions src/go/rpk/pkg/cli/cmd/cluster/partitions/status.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package partitions

import (
"fmt"

"github.com/redpanda-data/redpanda/src/go/rpk/pkg/api/admin"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/config"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/out"
"github.com/spf13/afero"
"github.com/spf13/cobra"
)

func NewBalancerStatusCommand(fs afero.Fs) *cobra.Command {
cmd := &cobra.Command{
Use: "balancer-status",
Short: "Queries cluster for partition balancer status",
Long: `Queries cluster for partition balancer status:
If continuous partition balancing is enabled, redpanda will continuously
reassign partitions from both unavailable nodes and from nodes using more disk
space than the configured limit.
This command can be used to monitor the partition balancer status.
FIELDS
Status: Either off, ready, starting, in progress, or
stalled.
Seconds Since Last Tick: The last time the partition balancer ran.
Current Reassignments Count: Current number of partition reassignments in
progress.
Unavailable Nodes: The nodes that have been unavailable after a
time set by the
"partition_autobalancing_node_availability_timeout_sec"
cluster property.
Over Disk Limit Nodes: The nodes that surpassed the threshold of
used disk percentage specified in the
"partition_autobalancing_max_disk_usage_percent"
cluster property.
BALANCER STATUS
off: The balancer is disabled.
ready: The balancer is active but there is nothing to do.
starting: The balancer is starting but has not run yet.
in_progress: The balancer is active and is in the process of scheduling
partition movements.
stalled: Violations have been detected and the balancer cannot correct
them.
STALLED BALANCER
A stalled balancer can occur for a few reasons and requires a bit of manual
investigation. A few areas to investigate:
* Are there are enough healthy nodes to which to move partitions? For example,
in a 3-node cluster, no movements are possible, so you will see a stall every
time there is a violation.
* Does the cluster have sufficient space? if there are no nodes in the cluster
with less than 80% of used disk space, rebalancing cannot proceed.
* Do all partitions have quorum? If two of the three partition replicas are
down, this partition cannot be moved.
* Are any nodes in maintenance mode? Partitions are not moved if a node is in
maintenance mode.
`,
Args: cobra.ExactArgs(0),
Run: func(cmd *cobra.Command, _ []string) {
p := config.ParamsFromCommand(cmd)
cfg, err := p.Load(fs)
out.MaybeDie(err, "unable to load config: %v", err)

cl, err := admin.NewClient(fs, cfg)
out.MaybeDie(err, "unable to initialize admin client: %v", err)

status, err := cl.GetPartitionStatus(cmd.Context())
out.MaybeDie(err, "unable to request balancer status: %v", err)

printBalancerStatus(status)
},
}

return cmd
}

func printBalancerStatus(pbs admin.PartitionBalancerStatus) {
const format = `Status: %v
Seconds Since Last Tick: %v
Current Reassignment Count: %v
`
fmt.Printf(format, pbs.Status, pbs.SecondsSinceLastTick, pbs.CurrentReassignmentsCount)

v := pbs.Violations
if len(v.OverDiskLimitNodes) > 0 || len(v.UnavailableNodes) > 0 {
const vFormat = `Unavailable Nodes: %v
Over Disk Limit Nodes: %v
`
fmt.Printf(vFormat, v.UnavailableNodes, v.OverDiskLimitNodes)
}
}
2 changes: 1 addition & 1 deletion src/go/rpk/pkg/cli/cmd/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ func AddKafkaFlags(
"brokers",
[]string{},
"Comma-separated list of broker ip:port pairs (e.g."+
" --brokers '192.168.78.34:9092,192.168.78.35:9092,192.179.23.54:9092' )."+
" --brokers '192.168.78.34:9092,192.168.78.35:9092,192.179.23.54:9092')."+
" Alternatively, you may set the REDPANDA_BROKERS environment"+
" variable with the comma-separated list of broker addresses",
)
Expand Down

0 comments on commit 7ada01c

Please sign in to comment.