Skip to content

Commit

Permalink
Merge pull request redpanda-data#5703 from jcsp/controller-replay-aft…
Browse files Browse the repository at this point in the history
…er-dataloss

cluster: enable controller replay if last_applied is ahead of log
  • Loading branch information
jcsp committed Nov 28, 2022
2 parents 3a0df25 + 0eda42c commit 79a8e3e
Show file tree
Hide file tree
Showing 4 changed files with 165 additions and 17 deletions.
58 changes: 53 additions & 5 deletions src/v/cluster/controller.cc
Original file line number Diff line number Diff line change
Expand Up @@ -305,11 +305,59 @@ ss::future<> controller::start(cluster_discovery& discovery) {
return _stm.invoke_on(controller_stm_shard, &controller_stm::start);
})
.then([this] {
return _stm.invoke_on(controller_stm_shard, [](controller_stm& stm) {
// we do not have to use timeout in here as all the batches to
// apply have to be accesssible
return stm.wait(stm.bootstrap_last_applied(), model::no_timeout);
});
auto disk_dirty_offset = _raft0->log().offsets().dirty_offset;

return _stm
.invoke_on(
controller_stm_shard,
[disk_dirty_offset](controller_stm& stm) {
// we do not have to use timeout in here as all the batches to
// apply have to be accesssible
auto last_applied = stm.bootstrap_last_applied();

// Consistency check: on a bug-free system, the last_applied
// value in the kvstore always points to data on disk.
// However, if there is a bug, or someone has removed some log
// segments out of band, we will hang trying to read up to
// last_applied. Mitigate this by clamping it to the top of
// the log on disk.
if (last_applied > disk_dirty_offset) {
vlog(
clusterlog.error,
"Inconsistency detected between KVStore last_applied "
"({}) and actual log size ({}). If the storage "
"directory was not modified intentionally, this is a "
"bug.",
last_applied,
disk_dirty_offset);

// Try waiting for replay to reach the disk_dirty_offset,
// ignore last_applied.
return stm
.wait(disk_dirty_offset, model::time_from_now(5s))
.handle_exception_type([](const ss::timed_out_error&) {
// Ignore timeout: it just means the controller
// log replay is done without hitting the disk
// log hwm (truncation happened), or that we were
// slow and controller replay will continue in
// the background while the rest of redpanda
// starts up.
});
}

vlog(
clusterlog.info,
"Controller log replay starting (to offset {})",
last_applied);

if (last_applied == model::offset{}) {
return ss::now();
} else {
return stm.wait(last_applied, model::no_timeout);
}
})
.then(
[] { vlog(clusterlog.info, "Controller log replay complete."); });
})
.then([this, &discovery] { return cluster_creation_hook(discovery); })
.then(
Expand Down
7 changes: 7 additions & 0 deletions src/v/redpanda/admin_server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -782,6 +782,13 @@ ss::future<> admin_server::throw_on_error(
co_return;
case raft::errc::not_leader:
throw co_await redirect_to_leader(req, ntp);
case raft::errc::node_does_not_exists:
case raft::errc::not_voter:
// node_does_not_exist is a 400 rather than a 404, because it
// comes up in the context of a destination for leader transfer,
// rather than a node ID appearing in a URL path.
throw ss::httpd::bad_request_exception(
fmt::format("Invalid request: {}", ec.message()));
default:
throw ss::httpd::server_error_exception(
fmt::format("Unexpected raft error: {}", ec.message()));
Expand Down
31 changes: 19 additions & 12 deletions tests/rptest/services/redpanda.py
Original file line number Diff line number Diff line change
Expand Up @@ -2348,24 +2348,31 @@ def make_partition(p):
def cov_enabled(self):
return self._context.globals.get(self.COV_KEY, self.DEFAULT_COV_OPT)

def search_log_node(self, node: ClusterNode, pattern: str):
for line in node.account.ssh_capture(
f"grep \"{pattern}\" {RedpandaService.STDOUT_STDERR_CAPTURE} || true"
):
# We got a match
self.logger.debug(f"Found {pattern} on node {node.name}: {line}")
return True

return False

def search_log_any(self, pattern: str, nodes: list[ClusterNode] = None):
# Test helper for grepping the redpanda log.
# The design follows python's built-in any() function.
# https://docs.python.org/3/library/functions.html#any
"""
Test helper for grepping the redpanda log.
The design follows python's built-in any() function.
https://docs.python.org/3/library/functions.html#any
# :param pattern: the string to search for
# :param nodes: a list of nodes to run grep on
# :return: true if any instances of `pattern` found
:param pattern: the string to search for
:param nodes: a list of nodes to run grep on
:return: true if any instances of `pattern` found
"""
if nodes is None:
nodes = self.nodes

for node in nodes:
for line in node.account.ssh_capture(
f"grep \"{pattern}\" {RedpandaService.STDOUT_STDERR_CAPTURE} || true"
):
# We got a match
self.logger.debug(
f"Found {pattern} on node {node.name}: {line}")
if self.search_log_node(node, pattern):
return True

# Fall through, no matches
Expand Down
86 changes: 86 additions & 0 deletions tests/rptest/tests/controller_erase_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Copyright 2022 Redpanda Data, Inc.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.md
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0

from rptest.tests.redpanda_test import RedpandaTest
from rptest.services.redpanda import RESTART_LOG_ALLOW_LIST, RedpandaService
from rptest.services.cluster import cluster
from rptest.services.admin import Admin
from rptest.clients.types import TopicSpec
from ducktape.mark import parametrize

ERASE_ERROR_MSG = "Inconsistency detected between KVStore last_applied"
ERASE_LOG_ALLOW_LIST = RESTART_LOG_ALLOW_LIST + [ERASE_ERROR_MSG]


class ControllerEraseTest(RedpandaTest):
@cluster(num_nodes=3, log_allow_list=ERASE_LOG_ALLOW_LIST)
@parametrize(partial=True)
@parametrize(partial=False)
def test_erase_controller_log(self, partial):
"""
This test covers the case where an administrator intentionally
removes the controller log data from a node, but does not wipe
the entire data directory for the node.
In issue https://github.com/redpanda-data/redpanda/issues/4950
this fails because the node's kvstore points to a higher last_applied
offset than exists in the local log, blocking startup: this is a regression
test for that behaviour.
In general Redpanda doesn't handle on-disk corruption (such as selectively
deleting segments) gracefully, but in the particular case of wiping
the whole controller log, or a contiguous sufffix region of the log,
we can gracefully let raft fill in the gaps.
"""

admin = Admin(self.redpanda)

# Do a bunch of metadata operations to put something in the controller log
segment_count = 4
for i in range(0, segment_count):
for j in range(0, 4):
spec = TopicSpec(partition_count=1, replication_factor=3)
self.client().create_topic(spec)

# Move a leader to roll a segment
leader_node = self.redpanda.controller()
next_leader = ((self.redpanda.idx(leader_node) + 1) % 3) + 1
admin.partition_transfer_leadership('redpanda', 'controller', 0,
next_leader)

# Stop the node we will intentionally damage
victim_node = self.redpanda.nodes[1]
bystander_node = self.redpanda.nodes[0]
self.redpanda.stop_node(victim_node)

# Erase controller log on the victim node
controller_path = f"{RedpandaService.PERSISTENT_ROOT}/data/redpanda/controller/0_0"
if partial:
# Partial deletion: remove the latest controller log segment
storage = self.redpanda.node_storage(victim_node)
segments = storage.ns['redpanda'].topics['controller'].partitions[
"0_0"].segments.keys()
assert len(segments) == segment_count
segments = sorted(list(segments))
victim_path = f"{controller_path}/{segments[-1]}.log"
else:
# Full deletion: remove all log segments.
victim_path = f"{controller_path}/*"
victim_node.account.remove(victim_path)

# Node should come up cleanly
self.redpanda.start_node(victim_node)

# Node should have logged a complaint during startup
assert self.redpanda.search_log_node(victim_node,
ERASE_ERROR_MSG) is True

# Control: undamaged node should not have logged the complaint
assert self.redpanda.search_log_node(bystander_node,
ERASE_ERROR_MSG) is False

0 comments on commit 79a8e3e

Please sign in to comment.