From dca8797792dc45f36d61a0acf56d2f6700d3d27e Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 8 Jul 2022 17:35:13 +0100 Subject: [PATCH] tests: mitigate MaintenanceTest failures The real fix will be to make the leader balancer aware of maintenance mode, but the test has become much more unstable since recent leader balancer changes to do more movements concurrently, so it's worth mitigating that. The workaround is to set a short mute timeout so that muting nodes has no real effect, and a short idle timeout so that post-maintenance leader movements happen promptly. Related: https://github.com/redpanda-data/redpanda/issues/4772 --- tests/rptest/tests/maintenance_test.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/rptest/tests/maintenance_test.py b/tests/rptest/tests/maintenance_test.py index 9a66e28ae538e..de35998f86c13 100644 --- a/tests/rptest/tests/maintenance_test.py +++ b/tests/rptest/tests/maintenance_test.py @@ -25,7 +25,22 @@ class MaintenanceTest(RedpandaTest): TopicSpec(partition_count=20, replication_factor=3)) def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + super().__init__( + *args, + extra_rp_conf={ + # Leader balancer configuration changes are a workaround + # to https://github.com/redpanda-data/redpanda/issues/4772 + + # Faster leader balancer iteration to get partitions moved + # back to nodes leaving maintenance mode promptly. + 'leader_balancer_idle_timeout': 5000, + + # Mute timeout shorter than idle timeout: effectively disable + # node muting. This enables nodes leaving maintenance mode + # to get leaderships moved to them promptly. + 'leader_balancer_mute_timeout': 1000, + }, + **kwargs) self.admin = Admin(self.redpanda) self.rpk = RpkTool(self.redpanda) self._use_rpk = True