
tests: update offline_log_viewer and use in ducktape tests #5315

Merged
merged 22 commits into from
Dec 16, 2022
Changes from all commits
22 commits
e168164
tools: write offline_log_viewer kvstore/controller output to stdout
jcsp Jul 1, 2022
9f00c02
tools: make offline_log_viewer aware of clean_segment kvstore key
jcsp Jul 1, 2022
963fc3d
tools: offline_log_viewer don't replay pre-snapshot kvstore messages
jcsp Jul 1, 2022
7f666f7
tools/metadata_viewer: handle kvstore deletions
jcsp Jul 1, 2022
b492385
tools: handle short header read in offline_log_viewer
jcsp Jul 1, 2022
37b76a1
tools/offline_log_viewer: added decode_adl_or_serde to dispatch decod…
andijcr Aug 30, 2022
18328b1
tools/offline_log_viewer: fixed variable used to read BatchType (was …
andijcr Aug 30, 2022
4bbd0c5
tools/offline_log_viewer: added peek_int8, read_serde_enum
andijcr Aug 30, 2022
021317a
tools/offline_log_viewer: added serde decoding for topic_commands
andijcr Aug 30, 2022
ce43d16
tools/offline_log_viewer: added decode_user_command for serde
andijcr Aug 30, 2022
8b924de
tools/offline_log_viewer: added decode_acl_command for serde
andijcr Aug 30, 2022
29f9e7e
tools/offline_log_viewer: added decode_config_command for serde for c…
andijcr Aug 30, 2022
98e4df6
tools/offline_log_viewer: decode_feature_command, decode_node_managem…
andijcr Aug 30, 2022
8fe0654
tools/offline_log_viewer: added method to get number of unprocessed b…
andijcr Sep 6, 2022
ba26cd7
tools/offline_log_viewer: added check that all the bytes of a record …
andijcr Sep 6, 2022
d8d85c4
tools/offline_log_viewer: support v=4 group_configuration
jcsp Dec 2, 2022
a9ab9fa
tools: fix kvstore reads in offline_log_viewer
jcsp Dec 2, 2022
8557ac4
tests: include offline_log_viewer in Dockerfile
jcsp Jul 1, 2022
a7e6983
tests: add clients.offline_log_viewer
jcsp Dec 2, 2022
fac7d51
tests: validate kvstore cleanup in topic_delete_test
jcsp Jul 1, 2022
1813135
tests: check controller log decode after multi-version upgrade
jcsp Dec 2, 2022
189e475
tools: disable broken controller command decode
jcsp Dec 2, 2022
3 changes: 3 additions & 0 deletions tests/docker/Dockerfile
@@ -179,4 +179,7 @@ RUN mkdir -p /opt/scripts && \
curl https://raw.githubusercontent.com/redpanda-data/seastar/2a9504b3238cba4150be59353bf8d0b3a01fe39c/scripts/seastar-addr2line -o /opt/scripts/seastar-addr2line && \
chmod +x /opt/scripts/seastar-addr2line

RUN mkdir -p /opt/scripts/offline_log_viewer
COPY --chown=0:0 tools/offline_log_viewer /opt/scripts/offline_log_viewer

CMD service ssh start && tail -f /dev/null
3 changes: 2 additions & 1 deletion tests/docker/Dockerfile.dockerignore
@@ -3,4 +3,5 @@
!tests/docker/ssh
!tests/setup.py
!tests/python
!tests/go
!tests/go
!tools/offline_log_viewer
41 changes: 41 additions & 0 deletions tests/rptest/clients/offline_log_viewer.py
@@ -0,0 +1,41 @@
# Copyright 2022 Redpanda Data, Inc.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.md
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0

import json


class OfflineLogViewer:
    """
    Wrap tools/offline_log_viewer for use in tests: this is for tests that
    want to peek at the structures, but also for validating the tool itself.
    """
    def __init__(self, redpanda):
        self._redpanda = redpanda

    def _cmd(self, suffix):
        viewer_path = "python3 /opt/scripts/offline_log_viewer/viewer.py"
        return f"{viewer_path} --path {self._redpanda.DATA_DIR} {suffix}"

    def read_kvstore(self, node):
        cmd = self._cmd("--type kvstore")
        kvstore_json = node.account.ssh_output(cmd, combine_stderr=False)
        return json.loads(kvstore_json)

    def read_controller(self, node):
        cmd = self._cmd("--type controller")
        controller_json = node.account.ssh_output(cmd, combine_stderr=False)
        try:
            return json.loads(controller_json)
        except json.decoder.JSONDecodeError:
            # Log the bad output before re-raising
            self._redpanda.logger.error(
                f"Invalid JSON output: {controller_json}")
            raise
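For orientation, a minimal sketch of how a test can drive this wrapper (assuming a cluster object like the `redpanda` fixture used in the tests below; the helper name is illustrative, not part of this change):

```python
from rptest.clients.offline_log_viewer import OfflineLogViewer


def log_debug_summary(redpanda):
    # Illustrative only: mirrors how the tests below consume the wrapper.
    viewer = OfflineLogViewer(redpanda)
    for node in redpanda.nodes:
        # Decoded kvstore contents, keyed by shard
        kvstore_data = viewer.read_kvstore(node=node)
        # Decoded controller log, as a list of record dicts
        controller_records = viewer.read_controller(node=node)
        redpanda.logger.info(
            f"{node.name}: {len(controller_records)} controller records, "
            f"{sum(len(items) for items in kvstore_data.values())} kvstore items"
        )
```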
125 changes: 108 additions & 17 deletions tests/rptest/tests/topic_delete_test.py
@@ -7,24 +7,108 @@
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0

from rptest.services.cluster import cluster
from rptest.services.admin import Admin
from ducktape.utils.util import wait_until
import time
import json

from ducktape.utils.util import wait_until
from ducktape.mark import parametrize
from requests.exceptions import HTTPError

from rptest.services.cluster import cluster
from rptest.clients.types import TopicSpec
from rptest.clients.offline_log_viewer import OfflineLogViewer
from rptest.tests.redpanda_test import RedpandaTest
from rptest.clients.kafka_cli_tools import KafkaCliTools
from rptest.services.rpk_producer import RpkProducer
from rptest.services.metrics_check import MetricCheck
from rptest.services.redpanda import SISettings
from rptest.util import wait_for_segments_removal
from ducktape.mark import parametrize
from rptest.services.admin import Admin


def topic_storage_purged(redpanda, topic: str):
def get_kvstore_topic_key_counts(redpanda):
Review comment (Member):

> Partitions have various kvstore items, stored by the raft + storage layers. When the topic is deleted, they should be deleted too.
>
> Use our offline log decoder tool to inspect the kvstore out-of-band and validate the keys go away when the topic is deleted.

This is cool. What do you think the guidance should be on when to validate things on disk vs validating at a higher level? Applied too liberally it could eat up a lot of time and be hard to maintain.

For instance, here, what do we gain from reading the log compared to restarting redpanda and checking the kvstore? The reason being that we know kvstore serves data from memory and its only source of state is the on-disk file.

Reply (Contributor Author):

When something is available via external APIs, I would generally expect tests to get it that way.

The first use case for this (in this PR) is for something we can't validate externally -- cleanup of data that is no longer needed. It's similar to when we would use the existing storage inspection stuff: when we want to validate something beyond client-visible correctness, like storage being reclaimed, flushed up to a certain point, or migrated to/from tiered storage.

There are a bunch of storage tests that do things like "write 5 segments worth of data, wait til we see 5 segments, wait for 3 segments to upload to tiered storage...", which I anticipate being more robust if we convert them to use a decode of the log to instead assert on "wait til we see N batches, don't care how they're segmented". Only for the tests that write very small amounts of data, of course.

I would also like to use this as a sanity check that log-structured things like the controller log are at a length we expect: right now it would be pretty easy to slip in a bug that caused like 1000 extra controller log messages (like some rogue fiber writing messages repeatedly), and nothing would notice/fail. We could use some debug admin APIs for querying that kind of thing: the tradeoff is how much code we bake into redpanda proper vs. how much we can maintain as test-specific stuff. Then again, stuff we bake into redpanda proper is more usable in field debug tools...

Long story short, I mostly anticipate doing direct on-disk reads in the context of storage tests.

Reply (Member):

Thanks for the explanation.

> I would also like to use this as a sanity check that log-structured things like the controller log are at a length we expect

I like it. I recall at some point @mmaslankaprv suggesting that we write a test that verified that all of the controller logs were identical (up to some common prefix) after a test run. I don't know if that was ever done, but this would make that a lot easier!
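A rough sketch of the batch-count style of check discussed in this thread (hypothetical helper, not part of this PR; assumes the ducktape `wait_until` import already used by these tests):

```python
from ducktape.utils.util import wait_until

from rptest.clients.offline_log_viewer import OfflineLogViewer


def wait_for_min_controller_records(redpanda, min_records, timeout_sec=60):
    # Hypothetical: assert on decoded record counts rather than on how the
    # log happens to be segmented on disk.
    viewer = OfflineLogViewer(redpanda)

    def enough_records():
        counts = [len(viewer.read_controller(node=n)) for n in redpanda.nodes]
        redpanda.logger.debug(f"Controller record counts: {counts}")
        return all(c >= min_records for c in counts)

    wait_until(enough_records,
               timeout_sec=timeout_sec,
               backoff_sec=2,
               err_msg="Controller logs shorter than expected")
```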

"""
Count the keys in KVStore that relate to Kafka topics: this excludes all
internal topic items: if no Kafka topics exist, this should be zero for
all nodes.

:returns: dict of Node to integer
"""

viewer = OfflineLogViewer(redpanda)

# Find the raft group IDs of internal topics
admin = Admin(redpanda)
internal_group_ids = set()
for ntp in [
('redpanda', 'controller', 0),
('kafka_internal', 'id_allocator', 0),
]:
namespace, topic, partition = ntp
try:
p = admin.get_partitions(topic=topic,
namespace=namespace,
partition=partition)
except HTTPError as e:
# OK if internal topic doesn't exist (e.g. id_allocator
# doesn't have to exist)
if e.response.status_code != 404:
raise
else:
internal_group_ids.add(p['raft_group_id'])

result = {}
for n in redpanda.nodes:
kvstore_data = viewer.read_kvstore(node=n)

excess_keys = []
for shard, items in kvstore_data.items():
keys = [i['key'] for i in items]

for k in keys:
if k['keyspace'] == "cluster":
# Not a per-partition key
continue

if k['data'].get('group', None) in internal_group_ids:
# One of the internal topics
continue

if k['data'].get('ntp', {}).get('topic', None) == 'controller':
# Controller storage item
continue

excess_keys.append(k)

redpanda.logger.info(
f"{n.name}.{shard} Excess Keys {json.dumps(excess_keys,indent=2)}"
)

key_count = len(excess_keys)
result[n] = key_count

return result


def topic_storage_purged(redpanda, topic_name):
storage = redpanda.storage()
return all(map(lambda n: topic not in n.ns["kafka"].topics, storage.nodes))
logs_removed = all(
map(lambda n: topic_name not in n.ns["kafka"].topics, storage.nodes))

if not logs_removed:
return False

# Once logs are removed, also do more expensive inspection of
# kvstore to check that per-partition kvstore contents are
# gone. The user doesn't care about this, but it is important
# to avoid bugs that would cause kvstore to bloat through
# topic creation/destruction cycles.

topic_key_counts = get_kvstore_topic_key_counts(redpanda)
if any([v > 0 for v in topic_key_counts.values()]):
return False

return True


class TopicDeleteTest(RedpandaTest):
@@ -44,7 +44,9 @@ def __init__(self, test_context):
self.kafka_tools = KafkaCliTools(self.redpanda)

@cluster(num_nodes=3)
def topic_delete_test(self):
@parametrize(with_restart=False)
@parametrize(with_restart=True)
def topic_delete_test(self, with_restart):
def produce_until_partitions():
self.kafka_tools.produce(self.topic, 1024, 1024)
storage = self.redpanda.storage()
@@ -55,6 +55,16 @@ def produce_until_partitions():
backoff_sec=2,
err_msg="Expected partition did not materialize")

if with_restart:
# Do a restart to encourage writes and flushes, especially to
# the kvstore.
self.redpanda.restart_nodes(self.redpanda.nodes)

# Sanity check the kvstore checks: there should be at least one kvstore entry
# per partition while the topic exists.
assert sum(get_kvstore_topic_key_counts(
self.redpanda).values()) >= self.topics[0].partition_count

self.kafka_tools.delete_topic(self.topic)

try:
@@ -294,17 +390,12 @@ def check_compaction():
pass
producer.free()

def topic_storage_purged():
storage = self.redpanda.storage()
return all(
map(lambda n: topic_name not in n.ns["kafka"].topics,
storage.nodes))

try:
wait_until(lambda: topic_storage_purged(),
timeout_sec=60,
backoff_sec=2,
err_msg="Topic storage was not removed")
wait_until(
lambda: topic_storage_purged(self.redpanda, topic_name),
timeout_sec=60,
backoff_sec=2,
err_msg="Topic storage was not removed")

except:
# On errors, dump listing of the storage location
10 changes: 10 additions & 0 deletions tests/rptest/tests/upgrade_test.py
@@ -17,6 +17,7 @@
from rptest.services.admin import Admin
from rptest.clients.rpk import RpkTool
from rptest.clients.types import TopicSpec
from rptest.clients.offline_log_viewer import OfflineLogViewer
from rptest.tests.prealloc_nodes import PreallocNodesTest
from rptest.tests.redpanda_test import RedpandaTest
from rptest.tests.end_to_end import EndToEndTest
@@ -259,6 +260,15 @@ def _consumer_offsets_present():
assert self._rand_consumer.consumer_status.validator.total_reads >= self.RANDOM_READ_COUNT * self.RANDOM_READ_PARALLEL
assert self._cg_consumer.consumer_status.validator.valid_reads >= wrote_at_least

# Validate that the data structures written by a mixture of historical
# versions remain readable by our current debug tools
log_viewer = OfflineLogViewer(self.redpanda)
for node in self.redpanda.nodes:
controller_records = log_viewer.read_controller(node=node)
self.logger.info(
f"Read {len(controller_records)} controller records from node {node.name} successfully"
)


class UpgradeWithWorkloadTest(EndToEndTest):
"""