From 2c0ca272e2f5070005b190440e4e658bfce3659a Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 8 Jul 2022 17:35:13 +0100 Subject: [PATCH] tests: mitigate MaintenanceTest failures The real fix will be to make the leader balancer aware of maintenance mode, but the test has become much more unstable since recent leader balancer changes to do more movements concurrently, so it's worth mitigating that. The workaround is to set a short mute timeout so that muting nodes has no real effect, and a short idle timeout so that post-maintenance leader movements happen promptly. Related: https://github.com/redpanda-data/redpanda/issues/4772 --- tests/rptest/tests/maintenance_test.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/rptest/tests/maintenance_test.py b/tests/rptest/tests/maintenance_test.py index 9a66e28ae538..de35998f86c1 100644 --- a/tests/rptest/tests/maintenance_test.py +++ b/tests/rptest/tests/maintenance_test.py @@ -25,7 +25,22 @@ class MaintenanceTest(RedpandaTest): TopicSpec(partition_count=20, replication_factor=3)) def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + super().__init__( + *args, + extra_rp_conf={ + # Leader balancer configuration changes are a workaround + # to https://github.com/redpanda-data/redpanda/issues/4772 + + # Faster leader balancer iteration to get partitions moved + # back to nodes leaving maintenance mode promptly. + 'leader_balancer_idle_timeout': 5000, + + # Mute timeout shorter than idle timeout: effectively disable + # node muting. This enables nodes leaving maintenance mode + # to get leaderships moved to them promptly. + 'leader_balancer_mute_timeout': 1000, + }, + **kwargs) self.admin = Admin(self.redpanda) self.rpk = RpkTool(self.redpanda) self._use_rpk = True