diff --git a/src/v/cluster/controller.cc b/src/v/cluster/controller.cc
index 179fe6c5192b..36058f4ef76c 100644
--- a/src/v/cluster/controller.cc
+++ b/src/v/cluster/controller.cc
@@ -305,11 +305,59 @@ ss::future<> controller::start(cluster_discovery& discovery) {
           return _stm.invoke_on(controller_stm_shard, &controller_stm::start);
       })
       .then([this] {
-          return _stm.invoke_on(controller_stm_shard, [](controller_stm& stm) {
-              // we do not have to use timeout in here as all the batches to
-              // apply have to be accesssible
-              return stm.wait(stm.bootstrap_last_applied(), model::no_timeout);
-          });
+          auto disk_dirty_offset = _raft0->log().offsets().dirty_offset;
+
+          return _stm
+            .invoke_on(
+              controller_stm_shard,
+              [disk_dirty_offset](controller_stm& stm) {
+                  // we do not have to use a timeout here as all the batches
+                  // to apply have to be accessible
+                  auto last_applied = stm.bootstrap_last_applied();
+
+                  // Consistency check: on a bug-free system, the last_applied
+                  // value in the kvstore always points to data on disk.
+                  // However, if there is a bug, or someone has removed some log
+                  // segments out of band, we will hang trying to read up to
+                  // last_applied. Mitigate this by clamping it to the top of
+                  // the log on disk.
+                  if (last_applied > disk_dirty_offset) {
+                      vlog(
+                        clusterlog.error,
+                        "Inconsistency detected between KVStore last_applied "
+                        "({}) and actual log size ({}). If the storage "
+                        "directory was not modified intentionally, this is a "
+                        "bug.",
+                        last_applied,
+                        disk_dirty_offset);
+
+                      // Try waiting for replay to reach the disk_dirty_offset,
+                      // ignore last_applied.
+                      return stm
+                        .wait(disk_dirty_offset, model::time_from_now(5s))
+                        .handle_exception_type([](const ss::timed_out_error&) {
+                            // Ignore timeout: it just means the controller
+                            // log replay is done without hitting the disk
+                            // log hwm (truncation happened), or that we were
+                            // slow and controller replay will continue in
+                            // the background while the rest of redpanda
+                            // starts up.
+                        });
+                  }
+
+                  vlog(
+                    clusterlog.info,
+                    "Controller log replay starting (to offset {})",
+                    last_applied);
+
+                  if (last_applied == model::offset{}) {
+                      return ss::now();
+                  } else {
+                      return stm.wait(last_applied, model::no_timeout);
+                  }
+              })
+            .then(
+              [] { vlog(clusterlog.info, "Controller log replay complete."); });
       })
       .then([this, &discovery] { return cluster_creation_hook(discovery); })
       .then(
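For illustration only, here is a minimal Python model of the replay-target decision added to controller::start() above; choose_replay_target, the plain integer offsets, and the wait labels are inventions of this sketch, not Redpanda code:

# Sketch of the branch added above: a kvstore last_applied beyond the on-disk
# dirty offset is treated as an inconsistency, so replay waits only up to the
# dirty offset with a bounded timeout instead of hanging on missing data.
def choose_replay_target(last_applied: int, disk_dirty_offset: int):
    if last_applied > disk_dirty_offset:
        # Inconsistent: clamp to what is actually on disk and tolerate a
        # timeout, since truncation may legitimately stop replay short.
        return disk_dirty_offset, "bounded-wait"
    # Normal case: wait for replay to reach last_applied with no timeout.
    return last_applied, "unbounded-wait"


if __name__ == "__main__":
    # Damaged node: kvstore says 120 but the log on disk only reaches 100.
    print(choose_replay_target(120, 100))  # -> (100, 'bounded-wait')
    # Healthy node: replay to the recorded last_applied.
    print(choose_replay_target(80, 100))   # -> (80, 'unbounded-wait')
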
diff --git a/src/v/redpanda/admin_server.cc b/src/v/redpanda/admin_server.cc
index 2d4c78f883da..ad75a1e81d78 100644
--- a/src/v/redpanda/admin_server.cc
+++ b/src/v/redpanda/admin_server.cc
@@ -782,6 +782,13 @@ ss::future<> admin_server::throw_on_error(
         co_return;
     case raft::errc::not_leader:
         throw co_await redirect_to_leader(req, ntp);
+    case raft::errc::node_does_not_exists:
+    case raft::errc::not_voter:
+        // node_does_not_exist is a 400 rather than a 404, because it
+        // comes up in the context of a destination for leader transfer,
+        // rather than a node ID appearing in a URL path.
+        throw ss::httpd::bad_request_exception(
+          fmt::format("Invalid request: {}", ec.message()));
     default:
         throw ss::httpd::server_error_exception(
           fmt::format("Unexpected raft error: {}", ec.message()));
diff --git a/tests/rptest/services/redpanda.py b/tests/rptest/services/redpanda.py
index b3b111b304be..55640d86fa82 100644
--- a/tests/rptest/services/redpanda.py
+++ b/tests/rptest/services/redpanda.py
@@ -2348,24 +2348,31 @@ def make_partition(p):
     def cov_enabled(self):
         return self._context.globals.get(self.COV_KEY, self.DEFAULT_COV_OPT)
 
+    def search_log_node(self, node: ClusterNode, pattern: str):
+        for line in node.account.ssh_capture(
+                f"grep \"{pattern}\" {RedpandaService.STDOUT_STDERR_CAPTURE} || true"
+        ):
+            # We got a match
+            self.logger.debug(f"Found {pattern} on node {node.name}: {line}")
+            return True
+
+        return False
+
     def search_log_any(self, pattern: str, nodes: list[ClusterNode] = None):
-        # Test helper for grepping the redpanda log.
-        # The design follows python's built-in any() function.
-        # https://docs.python.org/3/library/functions.html#any
+        """
+        Test helper for grepping the redpanda log.
+        The design follows python's built-in any() function.
+        https://docs.python.org/3/library/functions.html#any
 
-        # :param pattern: the string to search for
-        # :param nodes: a list of nodes to run grep on
-        # :return: true if any instances of `pattern` found
+        :param pattern: the string to search for
+        :param nodes: a list of nodes to run grep on
+        :return: true if any instances of `pattern` found
+        """
         if nodes is None:
             nodes = self.nodes
 
         for node in nodes:
-            for line in node.account.ssh_capture(
-                    f"grep \"{pattern}\" {RedpandaService.STDOUT_STDERR_CAPTURE} || true"
-            ):
-                # We got a match
-                self.logger.debug(
-                    f"Found {pattern} on node {node.name}: {line}")
+            if self.search_log_node(node, pattern):
                 return True
 
         # Fall through, no matches
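A rough usage sketch of the refactored log-grep helpers from a ducktape test; the test class, pattern, and node choice are placeholders invented for this sketch, and only search_log_any / search_log_node come from the change above:

from rptest.tests.redpanda_test import RedpandaTest
from rptest.services.cluster import cluster


class LogGrepExampleTest(RedpandaTest):  # hypothetical test, not part of this change
    @cluster(num_nodes=3)
    def test_grep_for_replay_message(self):
        pattern = "Controller log replay complete"
        # Cluster-wide check: true if any node logged the pattern at least once.
        assert self.redpanda.search_log_any(pattern)

        # Per-node check, e.g. to distinguish a damaged node from a bystander.
        node = self.redpanda.nodes[0]
        if self.redpanda.search_log_node(node, pattern):
            self.logger.info(f"{node.name} finished controller log replay")
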
diff --git a/tests/rptest/tests/controller_erase_test.py b/tests/rptest/tests/controller_erase_test.py
new file mode 100644
index 000000000000..b9277caf5ae8
--- /dev/null
+++ b/tests/rptest/tests/controller_erase_test.py
@@ -0,0 +1,86 @@
+# Copyright 2022 Redpanda Data, Inc.
+#
+# Use of this software is governed by the Business Source License
+# included in the file licenses/BSL.md
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0
+
+from rptest.tests.redpanda_test import RedpandaTest
+from rptest.services.redpanda import RESTART_LOG_ALLOW_LIST, RedpandaService
+from rptest.services.cluster import cluster
+from rptest.services.admin import Admin
+from rptest.clients.types import TopicSpec
+from ducktape.mark import parametrize
+
+ERASE_ERROR_MSG = "Inconsistency detected between KVStore last_applied"
+ERASE_LOG_ALLOW_LIST = RESTART_LOG_ALLOW_LIST + [ERASE_ERROR_MSG]
+
+
+class ControllerEraseTest(RedpandaTest):
+    @cluster(num_nodes=3, log_allow_list=ERASE_LOG_ALLOW_LIST)
+    @parametrize(partial=True)
+    @parametrize(partial=False)
+    def test_erase_controller_log(self, partial):
+        """
+        This test covers the case where an administrator intentionally
+        removes the controller log data from a node, but does not wipe
+        the entire data directory for the node.
+
+        In issue https://github.com/redpanda-data/redpanda/issues/4950
+        this failed because the node's kvstore pointed to a higher last_applied
+        offset than existed in the local log, blocking startup: this is a
+        regression test for that behaviour.
+
+        In general Redpanda doesn't handle on-disk corruption (such as
+        selectively deleting segments) gracefully, but in the particular case
+        of wiping the whole controller log, or a contiguous suffix region of
+        the log, we can safely let raft fill in the gaps.
+        """
+
+        admin = Admin(self.redpanda)
+
+        # Do a bunch of metadata operations to put something in the controller log
+        segment_count = 4
+        for i in range(0, segment_count):
+            for j in range(0, 4):
+                spec = TopicSpec(partition_count=1, replication_factor=3)
+                self.client().create_topic(spec)
+
+            # Move a leader to roll a segment
+            leader_node = self.redpanda.controller()
+            next_leader = ((self.redpanda.idx(leader_node) + 1) % 3) + 1
+            admin.partition_transfer_leadership('redpanda', 'controller', 0,
+                                                next_leader)
+
+        # Stop the node we will intentionally damage
+        victim_node = self.redpanda.nodes[1]
+        bystander_node = self.redpanda.nodes[0]
+        self.redpanda.stop_node(victim_node)
+
+        # Erase controller log on the victim node
+        controller_path = f"{RedpandaService.PERSISTENT_ROOT}/data/redpanda/controller/0_0"
+        if partial:
+            # Partial deletion: remove the latest controller log segment
+            storage = self.redpanda.node_storage(victim_node)
+            segments = storage.ns['redpanda'].topics['controller'].partitions[
+                "0_0"].segments.keys()
+            assert len(segments) == segment_count
+            segments = sorted(list(segments))
+            victim_path = f"{controller_path}/{segments[-1]}.log"
+        else:
+            # Full deletion: remove all log segments.
+            victim_path = f"{controller_path}/*"
+        victim_node.account.remove(victim_path)
+
+        # Node should come up cleanly
+        self.redpanda.start_node(victim_node)
+
+        # Node should have logged a complaint during startup
+        assert self.redpanda.search_log_node(victim_node,
+                                             ERASE_ERROR_MSG) is True
+
+        # Control: undamaged node should not have logged the complaint
+        assert self.redpanda.search_log_node(bystander_node,
+                                             ERASE_ERROR_MSG) is False
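The admin_server.cc hunk above means a leadership transfer aimed at a node that is not in the group (or not a voter) now surfaces as a 400 client error rather than a 500. A rough sketch of checking that from a test, assuming the rptest Admin wrapper raises requests.exceptions.HTTPError on non-2xx responses (an assumption of this sketch) and that node ID 100 does not exist in the cluster:

from requests.exceptions import HTTPError

from rptest.services.admin import Admin


def check_transfer_to_unknown_node(redpanda):
    # Hypothetical helper for illustration, to be called from a ducktape test
    # that owns a running `redpanda` service fixture.
    admin = Admin(redpanda)
    try:
        # Same call shape as in controller_erase_test.py above, but targeting
        # a node ID assumed not to exist.
        admin.partition_transfer_leadership('redpanda', 'controller', 0, 100)
    except HTTPError as e:
        # After this change we expect a client error, not a server error.
        assert e.response.status_code == 400
    else:
        raise AssertionError("expected the transfer request to be rejected")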