From b970b2d61037ea34aed3607cbf48cbc071254262 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 25 May 2022 10:13:31 +0100 Subject: [PATCH 1/2] admin: reject maintenance mode req on 1 node cluster If a single node cluster puts its only node in maintenance mode, then there is no node eligible to become controller leader, and all further progress is stopped. Fixes https://github.com/redpanda-data/redpanda/issues/4338 --- src/v/redpanda/admin_server.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/v/redpanda/admin_server.cc b/src/v/redpanda/admin_server.cc index 68e842dd0447..35fdf232252f 100644 --- a/src/v/redpanda/admin_server.cc +++ b/src/v/redpanda/admin_server.cc @@ -1798,6 +1798,13 @@ void admin_server::register_broker_routes() { throw ss::httpd::bad_request_exception( "Maintenance mode feature not active (upgrade in progress?)"); } + + if ( + _controller->get_members_table().local().all_brokers().size() < 2) { + throw ss::httpd::bad_request_exception( + "Maintenance mode may not be used on a single node cluster"); + } + model::node_id id = parse_broker_id(*req); auto ec = co_await _controller->get_members_frontend() .local() From 20f3597b4fbeb95b0a958f74940e51fe9247bb99 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 25 May 2022 10:20:02 +0100 Subject: [PATCH 2/2] cluster: drop maintenance mode messages on n=1 cluster If a system got into the bad state of issue #4338, then their cluster is broken until we replay the controller log _without_ putting the node into maintenance mode. 
Related https://github.com/redpanda-data/redpanda/issues/4338 --- src/v/cluster/members_table.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/v/cluster/members_table.cc b/src/v/cluster/members_table.cc index fbfdfea12414..dcac0cea9dab 100644 --- a/src/v/cluster/members_table.cc +++ b/src/v/cluster/members_table.cc @@ -159,6 +159,17 @@ members_table::apply(model::offset version, maintenance_mode_cmd cmd) { return errc::success; } + if (_brokers.size() < 2) { + // Maintenance mode is refused on size 1 clusters in the admin API, but + // we might be upgrading from a version that didn't have the validation. + vlog( + clusterlog.info, + "Dropping maintenance mode enable operation on single node cluster"); + + // Return success to enable progress: this is a clean no-op. + return errc::success; + } + if ( target->second->get_maintenance_state() == model::maintenance_state::active) {