From dc83a7bdc9b760766b3c88a09ab807b85f149f19 Mon Sep 17 00:00:00 2001
From: Alexey Zatelepin
Date: Sun, 14 Aug 2022 17:31:50 +0300
Subject: [PATCH] tests/partition_balancer: more robust wait_until_status

Previously, when the controller leader node was suspended during the
test, all status requests would fail with a timed-out error. This was
true for requests to any node, not just the suspended one (because the
status request is proxied to the controller leader), so internal
retries in the admin API wrapper didn't help.

We increase the timeout and add 504 to the retriable status codes so
that internal retries can handle this situation.
---
 tests/rptest/tests/partition_balancer_test.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/rptest/tests/partition_balancer_test.py b/tests/rptest/tests/partition_balancer_test.py
index 5c58d670ece5..77e6be691ddc 100644
--- a/tests/rptest/tests/partition_balancer_test.py
+++ b/tests/rptest/tests/partition_balancer_test.py
@@ -77,13 +77,15 @@ def node2partition_count(self):
         return ret
 
     def wait_until_status(self, predicate, timeout_sec=120):
-        admin = Admin(self.redpanda)
+        # We may get a 504 if we proxy a status request to a suspended node.
+        # It is okay to retry (the controller leader will get re-elected in the meantime).
+        admin = Admin(self.redpanda, retry_codes=[503, 504])
         start = time.time()
 
         def check():
             req_start = time.time()
 
-            status = admin.get_partition_balancer_status(timeout=1)
+            status = admin.get_partition_balancer_status(timeout=10)
             self.logger.info(f"partition balancer status: {status}")
 
             if "seconds_since_last_tick" not in status:
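
For context, the retry behavior the commit relies on can be illustrated with a small standalone sketch. This is not the rptest Admin wrapper; the helper name, URL, and parameters below are hypothetical, and it only assumes the requests library plus a set of status codes (such as 503/504) that are worth retrying while the controller leader is unavailable or being re-elected.

    # Minimal sketch, not the actual rptest Admin implementation: retry an
    # HTTP GET while the response status is one of the retriable codes
    # (e.g. 503/504 returned while the controller leader is being re-elected).
    import time
    import requests

    RETRIABLE_CODES = {503, 504}  # hypothetical set, mirrors retry_codes=[503, 504]

    def get_json_with_retries(url, timeout=10, attempts=5, backoff_sec=1):
        """Fetch a JSON document, retrying transient proxy/leader errors."""
        resp = None
        for _ in range(attempts):
            resp = requests.get(url, timeout=timeout)
            if resp.status_code not in RETRIABLE_CODES:
                resp.raise_for_status()   # fail fast on non-retriable errors
                return resp.json()
            time.sleep(backoff_sec)       # give leader election time to finish
        resp.raise_for_status()           # out of attempts: surface the last error

With a wrapper like this, a status poll survives the window where every node answers with a retriable error because the request is proxied to a controller leader that is currently suspended, which is the situation the patch addresses.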