Skip to content

Commit

Permalink
Merge pull request #5343 from jcsp/tests-stop-node-logging
Browse files Browse the repository at this point in the history
tests: log details on failure to stop redpanda
  • Loading branch information
jcsp committed Jul 6, 2022
2 parents 547af6f + ad71a3f commit 26b7d23
Showing 1 changed file with 27 additions and 14 deletions.
41 changes: 27 additions & 14 deletions tests/rptest/services/redpanda.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from ducktape.utils.util import wait_until
from ducktape.cluster.cluster import ClusterNode
from prometheus_client.parser import text_string_to_metric_families
from ducktape.errors import TimeoutError

from rptest.clients.kafka_cat import KafkaCat
from rptest.services.admin import Admin
Expand Down Expand Up @@ -825,16 +826,21 @@ def start_rp():
self.start_service(node, start_rp)
self._started.append(node)

def start_service(self, node, start):
def log_node_stats():
for line in node.account.ssh_capture("ps aux"):
self.logger.debug(line.strip())
for line in node.account.ssh_capture("netstat -ant"):
self.logger.debug(line.strip())
def _log_node_process_state(self, node):
    """
    For debugging issues around starting and stopping processes: log
    which processes are running and which ports are in use.

    Runs `ps aux` and `netstat -ant` on the node via
    `node.account.ssh_capture` and writes each output line, stripped of
    surrounding whitespace, to the debug log.

    Collection is best-effort.  Callers invoke this from exception
    handlers immediately before re-raising, so if gathering diagnostics
    itself fails the failure is logged as a warning instead of being
    raised; otherwise it would replace the error the caller is trying
    to report.  A failure of one command does not prevent the other
    from being attempted.

    :param node: the node to inspect; must expose `account.ssh_capture`
                 and `name`.
    """
    for cmd in ("ps aux", "netstat -ant"):
        try:
            # ssh_capture is consumed lazily, so the iteration has to be
            # inside the try block for remote failures to be caught here.
            for line in node.account.ssh_capture(cmd):
                self.logger.debug(line.strip())
        except Exception as e:
            # Diagnostics only: never let this mask the caller's error.
            self.logger.warning(
                f"Failed to capture `{cmd}` output on {node.name}: {e}")

def start_service(self, node, start):
# Maybe the service collides with something that wasn't cleaned up
# properly: let's peek at what's going on on the node before starting it.
log_node_stats()
self._log_node_process_state(node)

try:
start()
Expand All @@ -844,7 +850,7 @@ def log_node_stats():
self.logger.warn(
f"Failed to start on {node.name}, gathering node ps and netstat..."
)
log_node_stats()
self._log_node_process_state(node)
raise

def coproc_enabled(self):
Expand Down Expand Up @@ -1156,12 +1162,19 @@ def stop_node(self, node, timeout=None):
if timeout is None:
timeout = 30

wait_until(
lambda: len(self.pids(node)) == 0,
timeout_sec=timeout,
err_msg=
f"Redpanda node {node.account.hostname} failed to stop in {timeout} seconds"
)
try:
wait_until(
lambda: len(self.pids(node)) == 0,
timeout_sec=timeout,
err_msg=
f"Redpanda node {node.account.hostname} failed to stop in {timeout} seconds"
)
except TimeoutError:
self.logger.warn(
f"Timed out waiting for stop on {node.name}, status:")
self._log_node_process_state(node)
raise

self.remove_from_started_nodes(node)

def remove_from_started_nodes(self, node):
Expand Down

0 comments on commit 26b7d23

Please sign in to comment.