Skip to content

Commit

Permalink
tests: use admin ready API to wait for cluster
Browse files Browse the repository at this point in the history
The redpanda.healthy API relies on metrics, which are sometimes disabled
(e.g. in franzgo tests), so we use the admin ready API instead.
  • Loading branch information
abhijat committed Apr 27, 2022
1 parent c000795 commit 7a0ccb7
Showing 1 changed file with 25 additions and 3 deletions.
28 changes: 25 additions & 3 deletions tests/rptest/services/action_injector.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,9 @@ def max_affected_nodes_reached(self):


class ProcessKill(DisruptiveAction):
PROCESS_START_WAIT_SEC = 20
PROCESS_START_WAIT_BACKOFF = 2

def __init__(self, redpanda: RedpandaService, config: ActionConfig,
admin: Admin):
super(ProcessKill, self).__init__(redpanda, config, admin)
Expand Down Expand Up @@ -191,7 +194,7 @@ def do_action(self):
self.redpanda.logger.warn(f'no usable node')

def do_reverse_action(self):
self.failure_injector._start(self.last_affected_node)
self._start_rp(node=self.last_affected_node)
self.affected_nodes.remove(self.last_affected_node)
self.redpanda.add_to_started_nodes(self.last_affected_node)

Expand All @@ -203,7 +206,16 @@ def do_restore_nodes(self, nodes_to_restore: Set[ClusterNode]):
Attempt to restore the redpanda process on all nodes where it was stopped.
"""
for node in nodes_to_restore:
self.failure_injector._start(node)
self._start_rp(node)

def _start_rp(self, node):
    """Start redpanda on ``node`` and wait until its process shows up.

    The start itself is delegated to the failure injector. Afterwards
    ``redpanda_pid(node)`` is polled through ``wait_until`` until it
    returns a truthy value, using ``PROCESS_START_WAIT_SEC`` as the
    timeout and ``PROCESS_START_WAIT_BACKOFF`` as the polling interval.
    """
    self.failure_injector._start(node)

    def process_is_running():
        # A truthy pid means the redpanda process exists on the node.
        return self.redpanda.redpanda_pid(node)

    # Handed to wait_until as the message to report if the wait fails.
    failure_text = f'Failed to start redpanda process on {node.account.hostname}'

    wait_until(process_is_running,
               timeout_sec=self.PROCESS_START_WAIT_SEC,
               backoff_sec=self.PROCESS_START_WAIT_BACKOFF,
               err_msg=failure_text)


class ActionInjectorThread(Thread):
Expand All @@ -223,11 +235,21 @@ def __init__(
self.action_log = []

def run(self):
wait_until(lambda: self.redpanda.healthy(),
admin = Admin(self.redpanda)

def all_nodes_started():
statuses = [
admin.ready(node).get("status") for node in self.redpanda.nodes
]
return all(status == 'ready' for status in statuses)

wait_until(all_nodes_started,
timeout_sec=self.config.cluster_start_lead_time_sec,
backoff_sec=2,
err_msg=f'Cluster not ready to begin actions')

self.redpanda.logger.info('cluster is ready, starting action loop')

while not self._stop_requested.is_set():
if self.config.reverse_action_on_next_cycle:
result = self.disruptive_action.reverse()
Expand Down

0 comments on commit 7a0ccb7

Please sign in to comment.