From b9015b7766f1d05a31aa9afaed7a4ee2b48a2147 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Tue, 2 Aug 2022 12:59:15 +0100
Subject: [PATCH 1/4] admin: fix 500 errors on unknown node IDs in leader transfer

Noticed this while writing a test with an off-by-one on a node ID.
Internally the error handling is safe, but in the API we're returning
a 500 instead of a 400.
---
 src/v/redpanda/admin_server.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/v/redpanda/admin_server.cc b/src/v/redpanda/admin_server.cc
index 2d4c78f883da..ad75a1e81d78 100644
--- a/src/v/redpanda/admin_server.cc
+++ b/src/v/redpanda/admin_server.cc
@@ -782,6 +782,13 @@ ss::future<> admin_server::throw_on_error(
         co_return;
     case raft::errc::not_leader:
         throw co_await redirect_to_leader(req, ntp);
+    case raft::errc::node_does_not_exists:
+    case raft::errc::not_voter:
+        // node_does_not_exist is a 400 rather than a 404, because it
+        // comes up in the context of a destination for leader transfer,
+        // rather than a node ID appearing in a URL path.
+        throw ss::httpd::bad_request_exception(
+          fmt::format("Invalid request: {}", ec.message()));
    default:
        throw ss::httpd::server_error_exception(
          fmt::format("Unexpected raft error: {}", ec.message()));
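
For illustration, a minimal ducktape-style sketch of the behaviour patch 1
changes. This is hypothetical test code, not part of the series; it assumes
the rptest Admin wrapper (used in patch 4 below) raises requests.HTTPError
for non-2xx responses, so a transfer to a nonexistent node now surfaces as a
clean 400 rather than a 500:

    # Hypothetical sketch, not part of the patch series: assumes a running
    # cluster and that Admin raises requests.HTTPError on non-2xx responses.
    import requests

    from rptest.services.admin import Admin


    def check_bad_transfer_target(redpanda):
        admin = Admin(redpanda)
        try:
            # 9999 is a node ID that does not exist in the cluster, e.g. an
            # off-by-one on a real node ID.
            admin.partition_transfer_leadership('redpanda', 'controller', 0,
                                                9999)
        except requests.exceptions.HTTPError as e:
            # Before patch 1 this surfaced as a 500; afterwards it is a 400.
            assert e.response.status_code == 400
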
From ed4f4641a47f8469e2907d3c44c6503c39481990 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Thu, 28 Jul 2022 13:47:34 +0100
Subject: [PATCH 2/4] cluster: enable controller replay if last_applied is ahead of log

This situation happens if someone deletes controller log segments while
leaving the kvstore in place. Previously, the kvstore last_applied would
cause the node to hang waiting for the controller log to replay to that
offset. Now, we log an error about the apparent inconsistency and
proceed.

In general we do not want to ignore data inconsistency, but this is a
special case: deleting the controller log is something a user might
legitimately do to work around another issue and force redpanda to
rebuild its local copy of the controller log.
---
 src/v/cluster/controller.cc | 58 +++++++++++++++++++++++++++++++++----
 1 file changed, 53 insertions(+), 5 deletions(-)

diff --git a/src/v/cluster/controller.cc b/src/v/cluster/controller.cc
index 179fe6c5192b..36058f4ef76c 100644
--- a/src/v/cluster/controller.cc
+++ b/src/v/cluster/controller.cc
@@ -305,11 +305,59 @@ ss::future<> controller::start(cluster_discovery& discovery) {
           return _stm.invoke_on(controller_stm_shard, &controller_stm::start);
       })
       .then([this] {
-          return _stm.invoke_on(controller_stm_shard, [](controller_stm& stm) {
-              // we do not have to use timeout in here as all the batches to
-              // apply have to be accesssible
-              return stm.wait(stm.bootstrap_last_applied(), model::no_timeout);
-          });
+          auto disk_dirty_offset = _raft0->log().offsets().dirty_offset;
+
+          return _stm
+            .invoke_on(
+              controller_stm_shard,
+              [disk_dirty_offset](controller_stm& stm) {
+                  // we do not have to use a timeout here as all the
+                  // batches to apply have to be accessible
+                  auto last_applied = stm.bootstrap_last_applied();
+
+                  // Consistency check: on a bug-free system, the last_applied
+                  // value in the kvstore always points to data on disk.
+                  // However, if there is a bug, or someone has removed some log
+                  // segments out of band, we will hang trying to read up to
+                  // last_applied. Mitigate this by clamping it to the top of
+                  // the log on disk.
+                  if (last_applied > disk_dirty_offset) {
+                      vlog(
+                        clusterlog.error,
+                        "Inconsistency detected between KVStore last_applied "
+                        "({}) and actual log size ({}). If the storage "
+                        "directory was not modified intentionally, this is a "
+                        "bug.",
+                        last_applied,
+                        disk_dirty_offset);
+
+                      // Try waiting for replay to reach the disk_dirty_offset,
+                      // ignore last_applied.
+                      return stm
+                        .wait(disk_dirty_offset, model::time_from_now(5s))
+                        .handle_exception_type([](const ss::timed_out_error&) {
+                            // Ignore timeout: it just means the controller
+                            // log replay is done without hitting the disk
+                            // log hwm (truncation happened), or that we were
+                            // slow and controller replay will continue in
+                            // the background while the rest of redpanda
+                            // starts up.
+                        });
+                  }
+
+                  vlog(
+                    clusterlog.info,
+                    "Controller log replay starting (to offset {})",
+                    last_applied);
+
+                  if (last_applied == model::offset{}) {
+                      return ss::now();
+                  } else {
+                      return stm.wait(last_applied, model::no_timeout);
+                  }
+              })
+            .then(
+              [] { vlog(clusterlog.info, "Controller log replay complete."); });
       })
       .then([this, &discovery] { return cluster_creation_hook(discovery); })
       .then(
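
The heart of patch 2 is a single clamp rule: never wait for replay beyond
what is actually on disk. A sketch of that rule in Python with hypothetical
names, for clarity only (the real implementation is the C++ above):

    # Hypothetical restatement of the clamp rule from patch 2; names and
    # types are illustrative, not from the Redpanda codebase.
    import logging

    log = logging.getLogger("cluster")


    def replay_target(kvstore_last_applied: int, disk_dirty_offset: int) -> int:
        if kvstore_last_applied > disk_dirty_offset:
            # The kvstore points past the end of the on-disk log: segments
            # were removed out of band (or there is a bug). Complain loudly,
            # then replay only as far as the data that actually exists.
            log.error(
                "Inconsistency detected between KVStore last_applied (%d) "
                "and actual log size (%d)", kvstore_last_applied,
                disk_dirty_offset)
            return disk_dirty_offset
        return kvstore_last_applied
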
From 6a1798c6afef4a91f92e59f9d9c4559ff193cd88 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Tue, 2 Aug 2022 14:05:48 +0100
Subject: [PATCH 3/4] tests: enable searching a single node's log in RedpandaTest

For tests that want to know "did node X log message Y" rather than just
"was message Y logged anywhere".
---
 tests/rptest/services/redpanda.py | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/tests/rptest/services/redpanda.py b/tests/rptest/services/redpanda.py
index b3b111b304be..55640d86fa82 100644
--- a/tests/rptest/services/redpanda.py
+++ b/tests/rptest/services/redpanda.py
@@ -2348,24 +2348,31 @@ def make_partition(p):
     def cov_enabled(self):
         return self._context.globals.get(self.COV_KEY, self.DEFAULT_COV_OPT)

+    def search_log_node(self, node: ClusterNode, pattern: str):
+        for line in node.account.ssh_capture(
+                f"grep \"{pattern}\" {RedpandaService.STDOUT_STDERR_CAPTURE} || true"
+        ):
+            # We got a match
+            self.logger.debug(f"Found {pattern} on node {node.name}: {line}")
+            return True
+
+        return False
+
     def search_log_any(self, pattern: str, nodes: list[ClusterNode] = None):
-        # Test helper for grepping the redpanda log.
-        # The design follows python's built-in any() function.
-        # https://docs.python.org/3/library/functions.html#any
+        """
+        Test helper for grepping the redpanda log.
+        The design follows python's built-in any() function.
+        https://docs.python.org/3/library/functions.html#any

-        # :param pattern: the string to search for
-        # :param nodes: a list of nodes to run grep on
-        # :return: true if any instances of `pattern` found
+        :param pattern: the string to search for
+        :param nodes: a list of nodes to run grep on
+        :return: true if any instances of `pattern` found
+        """
         if nodes is None:
             nodes = self.nodes

         for node in nodes:
-            for line in node.account.ssh_capture(
-                    f"grep \"{pattern}\" {RedpandaService.STDOUT_STDERR_CAPTURE} || true"
-            ):
-                # We got a match
-                self.logger.debug(
-                    f"Found {pattern} on node {node.name}: {line}")
+            if self.search_log_node(node, pattern):
                 return True

         # Fall through, no matches
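
A short usage sketch for the helpers patch 3 introduces, as exercised by the
test in patch 4: search_log_any answers "was this logged anywhere", while
search_log_node pins the assertion to a single node. Hypothetical test body,
with a placeholder pattern:

    # Hypothetical test body; self.redpanda is the RedpandaService instance
    # available in any RedpandaTest, and the pattern is a placeholder.
    def test_log_search(self):
        pattern = "example message"

        # True if the pattern was logged by any node in the cluster...
        anywhere = self.redpanda.search_log_any(pattern)

        # ...True only if this specific node logged it.
        first = self.redpanda.nodes[0]
        on_first = self.redpanda.search_log_node(first, pattern)

        # A hit on one node implies a hit cluster-wide.
        assert anywhere or not on_first
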
From 0eda42ca25ff5120205eb79ce68071e6e88ebf0e Mon Sep 17 00:00:00 2001
From: John Spray
Date: Tue, 2 Aug 2022 14:07:39 +0100
Subject: [PATCH 4/4] tests: add ControllerEraseTest

This test validates that it is possible to reset the controller log on
a single node by removing it: a procedure occasionally used in the
field to recover a cluster that has ended up in a split-brain situation
after interference with a node's storage.
---
 tests/rptest/tests/controller_erase_test.py | 86 +++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 tests/rptest/tests/controller_erase_test.py

diff --git a/tests/rptest/tests/controller_erase_test.py b/tests/rptest/tests/controller_erase_test.py
new file mode 100644
index 000000000000..b9277caf5ae8
--- /dev/null
+++ b/tests/rptest/tests/controller_erase_test.py
@@ -0,0 +1,86 @@
+# Copyright 2022 Redpanda Data, Inc.
+#
+# Use of this software is governed by the Business Source License
+# included in the file licenses/BSL.md
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0
+
+from rptest.tests.redpanda_test import RedpandaTest
+from rptest.services.redpanda import RESTART_LOG_ALLOW_LIST, RedpandaService
+from rptest.services.cluster import cluster
+from rptest.services.admin import Admin
+from rptest.clients.types import TopicSpec
+from ducktape.mark import parametrize
+
+ERASE_ERROR_MSG = "Inconsistency detected between KVStore last_applied"
+ERASE_LOG_ALLOW_LIST = RESTART_LOG_ALLOW_LIST + [ERASE_ERROR_MSG]
+
+
+class ControllerEraseTest(RedpandaTest):
+    @cluster(num_nodes=3, log_allow_list=ERASE_LOG_ALLOW_LIST)
+    @parametrize(partial=True)
+    @parametrize(partial=False)
+    def test_erase_controller_log(self, partial):
+        """
+        This test covers the case where an administrator intentionally
+        removes the controller log data from a node, but does not wipe
+        the entire data directory for the node.
+
+        In issue https://github.com/redpanda-data/redpanda/issues/4950
+        this fails because the node's kvstore points to a higher last_applied
+        offset than exists in the local log, blocking startup; this is a
+        regression test for that behaviour.
+
+        In general Redpanda doesn't handle on-disk corruption (such as
+        selectively deleting segments) gracefully, but in the particular
+        case of wiping the whole controller log, or a contiguous suffix
+        region of the log, we can let raft fill in the gaps.
+        """
+
+        admin = Admin(self.redpanda)
+
+        # Do a bunch of metadata operations to put something in the controller log
+        segment_count = 4
+        for i in range(0, segment_count):
+            for j in range(0, 4):
+                spec = TopicSpec(partition_count=1, replication_factor=3)
+                self.client().create_topic(spec)
+
+            # Move the leader to roll a segment
+            leader_node = self.redpanda.controller()
+            next_leader = ((self.redpanda.idx(leader_node) + 1) % 3) + 1
+            admin.partition_transfer_leadership('redpanda', 'controller', 0,
+                                                next_leader)
+
+        # Stop the node we will intentionally damage
+        victim_node = self.redpanda.nodes[1]
+        bystander_node = self.redpanda.nodes[0]
+        self.redpanda.stop_node(victim_node)
+
+        # Erase the controller log on the victim node
+        controller_path = f"{RedpandaService.PERSISTENT_ROOT}/data/redpanda/controller/0_0"
+        if partial:
+            # Partial deletion: remove the latest controller log segment
+            storage = self.redpanda.node_storage(victim_node)
+            segments = storage.ns['redpanda'].topics['controller'].partitions[
+                "0_0"].segments.keys()
+            assert len(segments) == segment_count
+            segments = sorted(list(segments))
+            victim_path = f"{controller_path}/{segments[-1]}.log"
+        else:
+            # Full deletion: remove all log segments.
+            victim_path = f"{controller_path}/*"
+        victim_node.account.remove(victim_path)
+
+        # Node should come up cleanly
+        self.redpanda.start_node(victim_node)
+
+        # Node should have logged a complaint during startup
+        assert self.redpanda.search_log_node(victim_node,
+                                             ERASE_ERROR_MSG) is True
+
+        # Control: the undamaged node should not have logged the complaint
+        assert self.redpanda.search_log_node(bystander_node,
+                                             ERASE_ERROR_MSG) is False