From dc83a7bdc9b760766b3c88a09ab807b85f149f19 Mon Sep 17 00:00:00 2001
From: Alexey Zatelepin
Date: Sun, 14 Aug 2022 17:31:50 +0300
Subject: [PATCH] tests/partition_balancer: more robust wait_until_status

Previously, when the controller leader node was suspended during the
test, all status requests would fail with a timed-out error. This was
true for requests to any node, not just the suspended one (because the
status request is proxied to the controller leader), so internal
retries in the admin API wrapper didn't help.

We increase the timeout and add 504 to the retriable status codes so
that internal retries can handle this situation.
---
 tests/rptest/tests/partition_balancer_test.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/rptest/tests/partition_balancer_test.py b/tests/rptest/tests/partition_balancer_test.py
index 5c58d670ece5..77e6be691ddc 100644
--- a/tests/rptest/tests/partition_balancer_test.py
+++ b/tests/rptest/tests/partition_balancer_test.py
@@ -77,13 +77,15 @@ def node2partition_count(self):
         return ret
 
     def wait_until_status(self, predicate, timeout_sec=120):
-        admin = Admin(self.redpanda)
+        # We may get a 504 if we proxy a status request to a suspended node.
+        # It is okay to retry (the controller leader will get re-elected in the meantime).
+        admin = Admin(self.redpanda, retry_codes=[503, 504])
         start = time.time()
 
         def check():
             req_start = time.time()
 
-            status = admin.get_partition_balancer_status(timeout=1)
+            status = admin.get_partition_balancer_status(timeout=10)
             self.logger.info(f"partition balancer status: {status}")
 
             if "seconds_since_last_tick" not in status:
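
For context, the retry behavior the commit relies on can be illustrated with a small standalone sketch. This is not the rptest Admin wrapper; the helper name, URL, and parameters below are hypothetical, and it only assumes the requests library plus a set of status codes (such as 503/504) that are worth retrying while the controller leader is unavailable or being re-elected.

    # Minimal sketch, not the actual rptest Admin implementation: retry an
    # HTTP GET while the response status is one of the retriable codes
    # (e.g. 503/504 returned while the controller leader is being re-elected).
    import time
    import requests

    RETRIABLE_CODES = {503, 504}  # hypothetical set, mirrors retry_codes=[503, 504]

    def get_json_with_retries(url, timeout=10, attempts=5, backoff_sec=1):
        """Fetch a JSON document, retrying transient proxy/leader errors."""
        resp = None
        for _ in range(attempts):
            resp = requests.get(url, timeout=timeout)
            if resp.status_code not in RETRIABLE_CODES:
                resp.raise_for_status()   # fail fast on non-retriable errors
                return resp.json()
            time.sleep(backoff_sec)       # give leader election time to finish
        resp.raise_for_status()           # out of attempts: surface the last error

With a wrapper like this, a status poll survives the window where every node answers with a retriable error because the request is proxied to a controller leader that is currently suspended, which is the situation the patch addresses.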