Merge pull request #5703 from jcsp/controller-replay-after-dataloss

cluster: enable controller replay if last_applied is ahead of log
redpanda-data · Nov 28, 2022 · 79a8e3e · 79a8e3e
2 parents 3a0df25 + 0eda42c
commit 79a8e3e
Show file tree

Hide file tree

Showing 4 changed files with 165 additions and 17 deletions.
diff --git a/src/v/cluster/controller.cc b/src/v/cluster/controller.cc
@@ -305,11 +305,59 @@ ss::future<> controller::start(cluster_discovery& discovery) {
           return _stm.invoke_on(controller_stm_shard, &controller_stm::start);
       })
       .then([this] {
-          return _stm.invoke_on(controller_stm_shard, [](controller_stm& stm) {
-              // we do not have to use timeout in here as all the batches to
-              // apply have to be accesssible
-              return stm.wait(stm.bootstrap_last_applied(), model::no_timeout);
-          });
+          auto disk_dirty_offset = _raft0->log().offsets().dirty_offset;
+
+          return _stm
+            .invoke_on(
+              controller_stm_shard,
+              [disk_dirty_offset](controller_stm& stm) {
+                  // we do not have to use timeout in here as all the batches to
+                  // apply have to be accesssible
+                  auto last_applied = stm.bootstrap_last_applied();
+
+                  // Consistency check: on a bug-free system, the last_applied
+                  // value in the kvstore always points to data on disk.
+                  // However, if there is a bug, or someone has removed some log
+                  // segments out of band, we will hang trying to read up to
+                  // last_applied. Mitigate this by clamping it to the top of
+                  // the log on disk.
+                  if (last_applied > disk_dirty_offset) {
+                      vlog(
+                        clusterlog.error,
+                        "Inconsistency detected between KVStore last_applied "
+                        "({}) and actual log size ({}).  If the storage "
+                        "directory was not modified intentionally, this is a "
+                        "bug.",
+                        last_applied,
+                        disk_dirty_offset);
+
+                      // Try waiting for replay to reach the disk_dirty_offset,
+                      // ignore last_applied.
+                      return stm
+                        .wait(disk_dirty_offset, model::time_from_now(5s))
+                        .handle_exception_type([](const ss::timed_out_error&) {
+                            // Ignore timeout: it just means the controller
+                            // log replay is done without hitting the disk
+                            // log hwm (truncation happened), or that we were
+                            // slow and controller replay will continue in
+                            // the background while the rest of redpanda
+                            // starts up.
+                        });
+                  }
+
+                  vlog(
+                    clusterlog.info,
+                    "Controller log replay starting (to offset {})",
+                    last_applied);
+
+                  if (last_applied == model::offset{}) {
+                      return ss::now();
+                  } else {
+                      return stm.wait(last_applied, model::no_timeout);
+                  }
+              })
+            .then(
+              [] { vlog(clusterlog.info, "Controller log replay complete."); });
       })
       .then([this, &discovery] { return cluster_creation_hook(discovery); })
       .then(

diff --git a/src/v/redpanda/admin_server.cc b/src/v/redpanda/admin_server.cc
@@ -782,6 +782,13 @@ ss::future<> admin_server::throw_on_error(
             co_return;
         case raft::errc::not_leader:
             throw co_await redirect_to_leader(req, ntp);
+        case raft::errc::node_does_not_exists:
+        case raft::errc::not_voter:
+            // node_does_not_exist is a 400 rather than a 404, because it
+            // comes up in the context of a destination for leader transfer,
+            // rather than a node ID appearing in a URL path.
+            throw ss::httpd::bad_request_exception(
+              fmt::format("Invalid request: {}", ec.message()));
         default:
             throw ss::httpd::server_error_exception(
               fmt::format("Unexpected raft error: {}", ec.message()));

diff --git a/tests/rptest/services/redpanda.py b/tests/rptest/services/redpanda.py
@@ -2348,24 +2348,31 @@ def make_partition(p):
     def cov_enabled(self):
         return self._context.globals.get(self.COV_KEY, self.DEFAULT_COV_OPT)
 
+    def search_log_node(self, node: ClusterNode, pattern: str):
+        for line in node.account.ssh_capture(
+                f"grep \"{pattern}\" {RedpandaService.STDOUT_STDERR_CAPTURE} || true"
+        ):
+            # We got a match
+            self.logger.debug(f"Found {pattern} on node {node.name}: {line}")
+            return True
+
+        return False
+
     def search_log_any(self, pattern: str, nodes: list[ClusterNode] = None):
-        # Test helper for grepping the redpanda log.
-        # The design follows python's built-in any() function.
-        # https://docs.python.org/3/library/functions.html#any
+        """
+        Test helper for grepping the redpanda log.
+        The design follows python's built-in any() function.
+        https://docs.python.org/3/library/functions.html#any
 
-        # :param pattern: the string to search for
-        # :param nodes: a list of nodes to run grep on
-        # :return:  true if any instances of `pattern` found
+        :param pattern: the string to search for
+        :param nodes: a list of nodes to run grep on
+        :return:  true if any instances of `pattern` found
+        """
         if nodes is None:
             nodes = self.nodes
 
         for node in nodes:
-            for line in node.account.ssh_capture(
-                    f"grep \"{pattern}\" {RedpandaService.STDOUT_STDERR_CAPTURE} || true"
-            ):
-                # We got a match
-                self.logger.debug(
-                    f"Found {pattern} on node {node.name}: {line}")
+            if self.search_log_node(node, pattern):
                 return True
 
         # Fall through, no matches

diff --git a/tests/rptest/tests/controller_erase_test.py b/tests/rptest/tests/controller_erase_test.py
@@ -0,0 +1,86 @@
+# Copyright 2022 Redpanda Data, Inc.
+#
+# Use of this software is governed by the Business Source License
+# included in the file licenses/BSL.md
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0
+
+from rptest.tests.redpanda_test import RedpandaTest
+from rptest.services.redpanda import RESTART_LOG_ALLOW_LIST, RedpandaService
+from rptest.services.cluster import cluster
+from rptest.services.admin import Admin
+from rptest.clients.types import TopicSpec
+from ducktape.mark import parametrize
+
+ERASE_ERROR_MSG = "Inconsistency detected between KVStore last_applied"
+ERASE_LOG_ALLOW_LIST = RESTART_LOG_ALLOW_LIST + [ERASE_ERROR_MSG]
+
+
+class ControllerEraseTest(RedpandaTest):
+    @cluster(num_nodes=3, log_allow_list=ERASE_LOG_ALLOW_LIST)
+    @parametrize(partial=True)
+    @parametrize(partial=False)
+    def test_erase_controller_log(self, partial):
+        """
+        This test covers the case where an administrator intentionally
+        removes the controller log data from a node, but does not wipe
+        the entire data directory for the node.
+
+        In issue https://github.com/redpanda-data/redpanda/issues/4950
+        this fails because the node's kvstore points to a higher last_applied
+        offset than exists in the local log, blocking startup: this is a regression
+        test for that behaviour.
+
+        In general Redpanda doesn't handle on-disk corruption (such as selectively
+        deleting segments) gracefully, but in the particular case of wiping
+        the whole controller log, or a contiguous sufffix region of the log,
+        we can gracefully let raft fill in the gaps.
+        """
+
+        admin = Admin(self.redpanda)
+
+        # Do a bunch of metadata operations to put something in the controller log
+        segment_count = 4
+        for i in range(0, segment_count):
+            for j in range(0, 4):
+                spec = TopicSpec(partition_count=1, replication_factor=3)
+                self.client().create_topic(spec)
+
+            # Move a leader to roll a segment
+            leader_node = self.redpanda.controller()
+            next_leader = ((self.redpanda.idx(leader_node) + 1) % 3) + 1
+            admin.partition_transfer_leadership('redpanda', 'controller', 0,
+                                                next_leader)
+
+        # Stop the node we will intentionally damage
+        victim_node = self.redpanda.nodes[1]
+        bystander_node = self.redpanda.nodes[0]
+        self.redpanda.stop_node(victim_node)
+
+        # Erase controller log on the victim node
+        controller_path = f"{RedpandaService.PERSISTENT_ROOT}/data/redpanda/controller/0_0"
+        if partial:
+            # Partial deletion: remove the latest controller log segment
+            storage = self.redpanda.node_storage(victim_node)
+            segments = storage.ns['redpanda'].topics['controller'].partitions[
+                "0_0"].segments.keys()
+            assert len(segments) == segment_count
+            segments = sorted(list(segments))
+            victim_path = f"{controller_path}/{segments[-1]}.log"
+        else:
+            # Full deletion: remove all log segments.
+            victim_path = f"{controller_path}/*"
+        victim_node.account.remove(victim_path)
+
+        # Node should come up cleanly
+        self.redpanda.start_node(victim_node)
+
+        # Node should have logged a complaint during startup
+        assert self.redpanda.search_log_node(victim_node,
+                                             ERASE_ERROR_MSG) is True
+
+        # Control: undamaged node should not have logged the complaint
+        assert self.redpanda.search_log_node(bystander_node,
+                                             ERASE_ERROR_MSG) is False