From 657698698b830e6f7c8028c75801bc7c24f2a84e Mon Sep 17 00:00:00 2001 From: Rob Blafford Date: Wed, 1 Feb 2023 14:07:43 -0500 Subject: [PATCH] rptest: Prevent full upgrade in nodeops_test - The node_operations_fuzzy_test.test_node_operations test was failing due to the cluster eventually fully upgrading and having nodes with older versions of rp restart and attempt to join. - The fix is to have one node never restart so that the cluster will always stay in a partially upgraded state. - Fixes: #8498 --- .../scale_tests/node_operations_fuzzy_test.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/rptest/scale_tests/node_operations_fuzzy_test.py b/tests/rptest/scale_tests/node_operations_fuzzy_test.py index 659d57a29eec2..19306ee1f721b 100644 --- a/tests/rptest/scale_tests/node_operations_fuzzy_test.py +++ b/tests/rptest/scale_tests/node_operations_fuzzy_test.py @@ -110,16 +110,17 @@ def test_node_operations(self, enable_failures, num_to_upgrade, 5, log_config=log_config, extra_rp_conf=extra_rp_conf) + upgraded_nodes = [] if num_to_upgrade > 0: + upgraded_nodes = self.redpanda.nodes[:num_to_upgrade] installer = self.redpanda._installer installer.install( self.redpanda.nodes, installer.highest_from_prior_feature_version( RedpandaInstaller.HEAD)) self.redpanda.start() - installer.install(self.redpanda.nodes[:num_to_upgrade], - RedpandaInstaller.HEAD) - self.redpanda.restart_nodes(self.redpanda.nodes[:num_to_upgrade]) + installer.install(self.redpanda.nodes, RedpandaInstaller.HEAD) + self.redpanda.restart_nodes(upgraded_nodes) else: self.redpanda.start(auto_assign_node_id=True) @@ -145,6 +146,16 @@ def test_node_operations(self, enable_failures, num_to_upgrade, admin_fuzz.start() active_nodes = set([self.redpanda.idx(n) for n in self.redpanda.nodes]) + upgraded_nodes = set([self.redpanda.idx(n) for n in upgraded_nodes]) + # This conditional modifies the active_nodes set to intentionally + # remove one non-upgraded node from 
being restarted. This is to prevent + # situations where the entire cluster could be restarted, preventing + # nodes which haven't been upgraded from rejoining. + if len(upgraded_nodes) > 0: + non_upgraded_nodes = active_nodes ^ upgraded_nodes + random_non_upgraded_node = random.choice(list(non_upgraded_nodes)) + active_nodes.remove(random_non_upgraded_node) + fi = None if enable_failures: fi = FailureInjectorBackgroundThread(self.redpanda, self.logger,