Skip to content

Commit

Permalink
tests: reintroduce failure on process crash
Browse files Browse the repository at this point in the history
When `cluster` ends, a process crash is reported via an exception. A
flag added in an earlier commit suppressed these errors because
process kills are part of the new action context. This commit
removes that flag because the killed processes are now restarted
when the context exits, so by the time `cluster` exits the state
should be restored and no processes should be in a crashed state.

Also cleans up the end-to-end test for Shadow Indexing (SI) so that the topic
replication setup is easier to understand.
  • Loading branch information
abhijat committed Apr 27, 2022
1 parent 588146b commit 1fa228d
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 38 deletions.
11 changes: 4 additions & 7 deletions tests/rptest/services/action_injector.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,8 @@ def __init__(self, redpanda: RedpandaService, config: ActionConfig,

def max_affected_nodes_reached(self) -> bool:
"""
Checks if the number of affected nodes equals the maximum number of nodes
this action is allowed to affect. If so the next calls to action will return
early.
Checks if the number of affected nodes so far equals the maximum number of nodes
this action is allowed to affect. If so all future calls to action will be no-op.
"""
raise NotImplementedError

Expand All @@ -94,7 +93,7 @@ def target_node(self) -> ClusterNode:

def do_action(self) -> ClusterNode:
"""
Applies the disruptive action, returns node the action was applied on
Applies the disruptive action, returns node or entity the action was applied on
"""
raise NotImplementedError

Expand All @@ -104,7 +103,7 @@ def action(self) -> ClusterNode:

def do_reverse_action(self) -> ClusterNode:
"""
Reverses the last applied action
Reverses the last applied action if applicable.
"""
raise NotImplementedError

Expand All @@ -120,8 +119,6 @@ def restore_state_on_exit(self, action_log: List[ActionLogEntry]):
Optionally restore state when the action injector thread is ending.
Uses the action log to determine what restoration should be done.
"""

self.redpanda.logger.warn(action_log)
all_nodes = {entry.node for entry in action_log}

node_final_state = defaultdict(lambda: False)
Expand Down
5 changes: 2 additions & 3 deletions tests/rptest/services/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from ducktape.mark.resource import ClusterUseMetadata


def cluster(log_allow_list=None, allow_missing_process=False, **kwargs):
def cluster(log_allow_list=None, **kwargs):
"""
Drop-in replacement for Ducktape `cluster` that imposes additional
redpanda-specific checks and defaults.
Expand All @@ -35,8 +35,7 @@ def wrapped(self, *args, **kwargs):
r = f(self, *args, **kwargs)
except:
self.redpanda.decode_backtraces()
self.redpanda.raise_on_crash(
allow_missing_process=allow_missing_process)
self.redpanda.raise_on_crash()
raise
else:
# Only do log inspections on tests that are otherwise
Expand Down
34 changes: 17 additions & 17 deletions tests/rptest/services/redpanda.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,21 +264,21 @@ class SISettings:
GLOBAL_S3_REGION_KEY = "s3_region"

def __init__(
self,
*,
log_segment_size: int = 16 * 1000000,
cloud_storage_access_key: str = 'panda-user',
cloud_storage_secret_key: str = 'panda-secret',
cloud_storage_region: str = 'panda-region',
cloud_storage_bucket: Optional[str] = None,
cloud_storage_api_endpoint: str = 'minio-s3',
cloud_storage_api_endpoint_port: int = 9000,
cloud_storage_cache_size: int = 160 * 1000000,
cloud_storage_enable_remote_read: bool = True,
cloud_storage_enable_remote_write: bool = True,
cloud_storage_reconciliation_interval_ms: Optional[int] = None,
cloud_storage_max_connections: Optional[int] = None,
cloud_storage_disable_tls: bool = True):
self,
*,
log_segment_size: int = 16 * 1000000,
cloud_storage_access_key: str = 'panda-user',
cloud_storage_secret_key: str = 'panda-secret',
cloud_storage_region: str = 'panda-region',
cloud_storage_bucket: Optional[str] = None,
cloud_storage_api_endpoint: str = 'minio-s3',
cloud_storage_api_endpoint_port: int = 9000,
cloud_storage_cache_size: int = 160 * 1000000,
cloud_storage_enable_remote_read: bool = True,
cloud_storage_enable_remote_write: bool = True,
cloud_storage_reconciliation_interval_ms: Optional[int] = None,
cloud_storage_max_connections: Optional[int] = None,
cloud_storage_disable_tls: bool = True):
self.log_segment_size = log_segment_size
self.cloud_storage_access_key = cloud_storage_access_key
self.cloud_storage_secret_key = cloud_storage_secret_key
Expand Down Expand Up @@ -886,7 +886,7 @@ def monitor_log(self, node):
assert node in self._started
return node.account.monitor_log(RedpandaService.STDOUT_STDERR_CAPTURE)

def raise_on_crash(self, allow_missing_process=False):
def raise_on_crash(self):
"""
Check if any redpanda nodes are unexpectedly not running,
or if any logs contain segfaults or assertions.
Expand All @@ -911,7 +911,7 @@ def raise_on_crash(self, allow_missing_process=False):
if crash_log:
crashes.append((node, line))

if not crashes and not allow_missing_process:
if not crashes:
# Even if there is no assertion or segfault, look for unexpectedly
# not-running processes
for node in self._started:
Expand Down
22 changes: 11 additions & 11 deletions tests/rptest/tests/e2e_shadow_indexing_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,17 +102,17 @@ def test_write(self):

class EndToEndShadowIndexingTestWithDisruptions(EndToEndShadowIndexingBase):
def _build_redpanda_instance(self):
return RedpandaService(context=self.test_context,
num_brokers=self.num_brokers,
si_settings=self.si_settings,
extra_rp_conf={
'default_topic_replications':
self.num_brokers,
})

@cluster(num_nodes=5,
log_allow_list=CHAOS_LOG_ALLOW_LIST,
allow_missing_process=True)
return RedpandaService(
context=self.test_context,
num_brokers=self.num_brokers,
si_settings=self.si_settings,
# With node failures we require __consumer_offsets to be replicated
# to survive leader crash
extra_rp_conf={
'default_topic_replications': self.num_brokers,
})

@cluster(num_nodes=5, log_allow_list=CHAOS_LOG_ALLOW_LIST)
def test_write_with_node_failures(self):
self.start_producer()
produce_until_segments(
Expand Down

0 comments on commit 1fa228d

Please sign in to comment.