From 54f650e2d02641e8b24b2d09eda4893c875c6bde Mon Sep 17 00:00:00 2001 From: Andrew Wong Date: Tue, 12 Jul 2022 21:15:44 -0700 Subject: [PATCH] tests: re-use installs in upgrade tests Tests that use the RedpandaInstaller are bandwidth-intensive and often take several minutes to complete on account of the download of hundreds of MBs worth of tarballs. This commit mitigates this in local ducktape by having all test containers share a single bind mount. The installer now uses a lock file to prevent concurrent operations on the mount (e.g. when downloading binaries, checking to see what binaries exist, etc). With this commit, regardless of whether in local or clustered ducktape, we also no longer get rid of downloaded binaries between test runs. Instead, after a test completes, we just revert any changes to the original binaries, and leave the rest be. --- tests/docker/docker-compose.yml | 1 + tests/rptest/services/redpanda.py | 24 +- tests/rptest/services/redpanda_installer.py | 291 ++++++++++++++------ tests/rptest/tests/fix_5355_upgrade_test.py | 1 - tests/rptest/tests/redpanda_test.py | 3 - tests/rptest/tests/upgrade_test.py | 8 +- 6 files changed, 220 insertions(+), 108 deletions(-) diff --git a/tests/docker/docker-compose.yml b/tests/docker/docker-compose.yml index c4687c13e74d..14de928b1ba0 100644 --- a/tests/docker/docker-compose.yml +++ b/tests/docker/docker-compose.yml @@ -42,5 +42,6 @@ services: - minio volumes: - '${BUILD_ROOT}:${BUILD_ROOT}' + - '${BUILD_ROOT}/redpanda_installs:/opt/redpanda_installs' networks: - redpanda-test diff --git a/tests/rptest/services/redpanda.py b/tests/rptest/services/redpanda.py index e7109b4fe857..f132ea62b92b 100644 --- a/tests/rptest/services/redpanda.py +++ b/tests/rptest/services/redpanda.py @@ -447,7 +447,6 @@ def __init__(self, environment: Optional[dict[str, str]] = None, security: SecurityConfig = SecurityConfig(), node_ready_timeout_s=None, - enable_installer=False, superuser: Optional[SaslCredentials] = None): super(RedpandaService, self).__init__(context, num_nodes=num_brokers) self._context = context @@ -456,9 +455,7 @@ def __init__(self, self._enable_pp = enable_pp self._enable_sr = enable_sr self._security = security - self._installer: Optional[RedpandaInstaller] = None - if enable_installer: - self._installer = RedpandaInstaller(self) + self._installer: RedpandaInstaller = RedpandaInstaller(self) if superuser is None: superuser = self.SUPERUSER_CREDENTIALS @@ -601,7 +598,7 @@ def start(self, nodes=None, clean_nodes=True, start_si=True): # Expected usage is that we may install new binaries before # starting the cluster, and installation-cleaning happened # when we started the installer. - self.clean_node(node, clean_installs=False) + self.clean_node(node, preserve_current_install=True) else: self.logger.debug("%s: skip cleaning node" % self.who_am_i(node)) @@ -1134,7 +1131,7 @@ def decode_backtraces(self): self.logger.exception("Failed to run seastar-addr2line") def rp_install_path(self): - if self._installer and self._installer._started: + if self._installer._started: # The installer sets up binaries to always use /opt/redpanda. return "/opt/redpanda" return self._context.globals.get("rp_install_path_root", None) @@ -1207,7 +1204,10 @@ def clean(self, **kwargs): if self._s3client: self.delete_bucket_from_si() - def clean_node(self, node, preserve_logs=False, clean_installs=True): + def clean_node(self, + node, + preserve_logs=False, + preserve_current_install=False): # These are allow_fail=True to allow for a race where kill_process finds # the PID, but then the process has died before it sends the SIGKILL. This # should be safe against actual failures to of the process to stop, because @@ -1235,9 +1235,11 @@ def clean_node(self, node, preserve_logs=False, clean_installs=True): self.EXECUTABLE_SAVE_PATH): node.account.remove(self.EXECUTABLE_SAVE_PATH) - if clean_installs and self._installer is not None: - # Get rid of any installed packages. - self._installer.clean(node) + if not preserve_current_install or not self._installer._started: + # Reset the binaries to use the original binaries. + # NOTE: if the installer hasn't been started, there is no + # installation to preserve! + self._installer.reset_current_install([node]) def remove_local_data(self, node): node.account.remove(f"{RedpandaService.PERSISTENT_ROOT}/data/*") @@ -1728,7 +1730,7 @@ def save_executable(self): # Any node will do. Even in a mixed-version upgrade test, we should # still have the original binaries available. node = self.nodes[0] - if self._installer and self._installer._started: + if self._installer._started: head_root_path = self._installer.path_for_version( RedpandaInstaller.HEAD) binary = f"{head_root_path}/libexec/redpanda" diff --git a/tests/rptest/services/redpanda_installer.py b/tests/rptest/services/redpanda_installer.py index e9ab2c2f2cbb..445d6e9fe8f6 100644 --- a/tests/rptest/services/redpanda_installer.py +++ b/tests/rptest/services/redpanda_installer.py @@ -7,8 +7,11 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0 +import errno +import os import re import requests + from ducktape.utils.util import wait_until # Match any version that may result from a redpanda binary, which may not be a @@ -49,9 +52,19 @@ class RedpandaInstaller: # Represents the binaries installed at the time of the call to start(). It # is expected that this is identical across all nodes initially. HEAD = "head" + + # Directory to which binaries are downloaded. + # + # In local deployments it is expected that this is shared by all nodes in a + # cluster, and that directories therein are only ever created (never + # deleted) during the lifetime of the RedpandaInstaller. INSTALLER_ROOT = "/opt/redpanda_installs" TGZ_URL_TEMPLATE = "https://packages.vectorized.io/qSZR7V26sJx7tCXe/redpanda/raw/names/redpanda-{arch}/versions/{version}/redpanda-{version}-{arch}.tar.gz" + # File path to be used for locking to prevent multiple local test processes + # from operating on the same volume mounts. + INSTALLER_LOCK_PATH = f"{INSTALLER_ROOT}/install_lock" + @staticmethod def root_for_version(version): """ @@ -78,16 +91,96 @@ def __init__(self, redpanda): """ self._started = False self._redpanda = redpanda - self._installed_per_node = dict() - # Keep track if the original install path is /opt/redpanda is used, as - # is the case for package-deployed clusters. Since the installer uses - # this directory, we'll need to be mindful not to mess with the - # original binaries. + # Keep track if the original install path is /opt/redpanda, as is the + # case for package-deployed clusters. Since the installer uses this + # directory, we'll need to be mindful not to mess with the original + # binaries. rp_install_path_root = self._redpanda._context.globals.get( "rp_install_path_root", None) self._head_backed_up = rp_install_path_root == "/opt/redpanda" + # Whether the nodes are expected to share a single mounted volume for + # their installs. If so, care should be taken to coordinate operations + # on the installer root. + self._nodes_share_installs = rp_install_path_root != "/opt/redpanda" + + # File descriptor used to coordinate access to the installer root when + # multiple test processes are running on the same machine. + # Must be acquire when operating on the contents of the installer root + # (i.e. root_for_version(), etc). + self._install_lock_fd = None + + def _acquire_install_lock(self, timeout_sec=600): + """ + Attempt to take the install lock, preventing other test processes from + operating an installer. + + Serves to prevent concurrent operations to the same local mountpoint. + """ + if not self._nodes_share_installs: + self._redpanda.logger.debug( + "Nodes don't share installs; no locking needed") + return + + def _lock(): + try: + self._redpanda.logger.debug( + f"Acquiring install lock {self.INSTALLER_LOCK_PATH}") + fd = os.open(self.INSTALLER_LOCK_PATH, + os.O_CREAT | os.O_EXCL | os.O_RDWR) + self._install_lock_fd = fd + except OSError as e: + if e.errno != errno.EEXIST: + raise + # Another process holds the lock. + return False + return True + + wait_until(lambda: _lock(), timeout_sec=timeout_sec) + self._redpanda.logger.debug( + f"Acquired install lock {self.INSTALLER_LOCK_PATH}") + + def _release_install_lock(self): + """ + Releases the install lock, allowing other test processes running + locally to perform downloads. + """ + if not self._nodes_share_installs: + self._redpanda.logger.debug( + "Nodes don't share installs; no locking needed") + return + + if not self._install_lock_fd: + self._redpanda.logger.debug("Installer lock not held") + return True + os.close(self._install_lock_fd) + os.unlink(self.INSTALLER_LOCK_PATH) + self._redpanda.logger.debug("Released install lock") + + def _setup_head_roots_unlocked(self): + """ + Sets up the head roots on each node such that they contain or point to + the original binaries installed at 'rp_install_path_root'. + + Expects that the install lock has been acquired before calling. + """ + nodes = self._redpanda.nodes + head_root_path = RedpandaInstaller.root_for_version( + RedpandaInstaller.HEAD) + rp_install_path_root = self._redpanda._context.globals.get( + "rp_install_path_root", None) + for node in nodes: + # Always end up with binaries at 'head_root_path', so we can + # continue to use root_for_version() to reference the head root. + cmd = None + if self._head_backed_up: + cmd = f"mv /opt/redpanda {head_root_path}" + elif not node.account.exists(head_root_path): + cmd = f"ln -s {rp_install_path_root} {head_root_path}" + if cmd: + node.account.ssh_output(cmd) + def start(self): """ Validates that all nodes in the service have installed the same @@ -97,6 +190,9 @@ def start(self): if self._started: return + # In case a previous test was aborted, do some cleanup. + self.reset_current_install(self._redpanda.nodes) + initial_version = None nodes = self._redpanda.nodes @@ -107,37 +203,23 @@ def start(self): initial_version = vers assert initial_version == vers, \ f"Mismatch version {node.account.hostname} has {vers}, {nodes[0].account.hostname} has {initial_version}" + node.account.ssh_output(f"mkdir -p {self.INSTALLER_ROOT}") - # Clean up the installer root directory so we start out clean. - for node in nodes: - if node.account.exists(RedpandaInstaller.INSTALLER_ROOT): - node.account.remove(f"{RedpandaInstaller.INSTALLER_ROOT}/*", - allow_fail=True) - else: - node.account.mkdir(RedpandaInstaller.INSTALLER_ROOT) + try: + self._acquire_install_lock() + self._setup_head_roots_unlocked() + finally: + self._release_install_lock() - # Now that we're at a sane starting point, set up our install path for - # ease of jumping between versions. + # Start out pointing /opt/redpanda at the current installation. ssh_setup_head_per_node = dict() - head_root_path = RedpandaInstaller.root_for_version( - RedpandaInstaller.HEAD) - rp_install_path_root = self._redpanda._context.globals.get( - "rp_install_path_root", None) + head_root_path = self.root_for_version(RedpandaInstaller.HEAD) for node in nodes: - # For simplicity's sake, always end up with binaries at - # 'head_root_path', so we can continue to use root_for_version() to - # reference the head root. - head_cmd = "" - if self._head_backed_up: - head_cmd = f"mv /opt/redpanda {head_root_path}" - else: - head_cmd = f"ln -s {rp_install_path_root} {head_root_path}" - - cmd = f"{head_cmd} && ln -s {head_root_path} /opt/redpanda" - ssh_setup_head_per_node[node] = node.account.ssh_capture(cmd) - self._installed_per_node[node] = set() + if not node.account.exists("/opt/redpanda"): + cmd = f"ln -s {head_root_path} /opt/redpanda" + ssh_setup_head_per_node[node] = node.account.ssh_capture(cmd) self.wait_for_async_ssh(self._redpanda.logger, ssh_setup_head_per_node, - "Setting up head binaries") + "Setting up /opt/redpanda") def int_tuple(str_tuple): return (int(str_tuple[0]), int(str_tuple[1]), int(str_tuple[2])) @@ -178,8 +260,8 @@ def highest_from_prior_feature_version(self, version): def install(self, nodes, version): """ - Installs the release on the given node such that the next time the node - is restarted, it will use the newly installed bits. + Installs the release on the given nodes such that the next time the + nodes are restarted, they will use the newly installed bits. TODO: abstract 'version' into a more generic installation that doesn't necessarily correspond to a released version. E.g. a custom build @@ -187,64 +269,99 @@ def install(self, nodes, version): """ if not self._started: self.start() + + try: + self._acquire_install_lock() + self._install_unlocked(nodes, version) + finally: + self._release_install_lock() + + def _install_unlocked(self, nodes, version): + """ + Like above but expects the install lock to have been taken before + calling. + """ assert version == RedpandaInstaller.HEAD or version in self._released_versions, \ f"Can't find installation for {version}" - ssh_install_per_node = dict() + version_root = self.root_for_version(version) + + nodes_to_download = nodes + if self._nodes_share_installs: + nodes_to_download = [nodes[0]] + + ssh_download_per_node = dict() + for node in nodes_to_download: + if not version == RedpandaInstaller.HEAD and not node.account.exists( + version_root): + ssh_download_per_node[ + node] = self._async_download_on_node_unlocked( + node, version) + self.wait_for_async_ssh(self._redpanda.logger, ssh_download_per_node, + "Finished downloading binaries") + + # Regardless of whether we downloaded anything, adjust the + # /opt/redpanda link to point to the appropriate version on all nodes. + relink_cmd = f"unlink /opt/redpanda && ln -s {version_root} /opt/redpanda" for node in nodes: - # If we already have this version installed, just adjust the - # symlinks. - version_root = self.root_for_version(version) - relink_cmd = f"unlink /opt/redpanda && ln -s {version_root} /opt/redpanda" - if version == RedpandaInstaller.HEAD or version in self._installed_per_node[ - node]: - ssh_install_per_node[node] = node.account.ssh_capture( - relink_cmd) - continue - - arch = "amd64" - uname = str(node.account.ssh_output("uname -m")) - if "aarch" in uname or "arm" in uname: - arch = "arm64" - self._redpanda.logger.debug( - f"{node.account.hostname} uname output: {uname}") - - self._installed_per_node[node].add(version) - url = RedpandaInstaller.TGZ_URL_TEMPLATE.format( \ - arch=arch, version=f"{version[0]}.{version[1]}.{version[2]}") - tgz = "redpanda.tar.gz" - cmd = f"curl -fsSL {url} --create-dir --output-dir {version_root} -o {tgz} && gunzip -c {version_root}/{tgz} | tar -xf - -C {version_root} && rm {version_root}/{tgz} && {relink_cmd}" - ssh_install_per_node[node] = node.account.ssh_capture(cmd) + node.account.ssh_output(relink_cmd) - self.wait_for_async_ssh(self._redpanda.logger, ssh_install_per_node, - "Finished installing binaries") - - def clean(self, node): + def _async_download_on_node_unlocked(self, node, version): """ - Cleans the node such that only the original installation remains. + Asynchonously downloads Redpanda of the given version on the given + node. Returns an iterator to the results. - This should only be called once there is no longer a need to run the - RedpandaService. + Expects the install lock to have been taken before calling. """ - if not self._started: - self._redpanda.logger.debug( - "Ignoring cleanup, installer not started") - return - - # Allow failures so the entire cleanup can proceed even on failure. - head_root_path = RedpandaInstaller.root_for_version( - RedpandaInstaller.HEAD) - if self._head_backed_up: - cmd = f"unlink /opt/redpanda && mv {head_root_path} /opt/redpanda" - node.account.ssh(cmd, allow_fail=True) - else: - cmd = f"unlink /opt/redpanda && unlink {head_root_path}" - node.account.ssh(cmd, allow_fail=True) - - # Also clean up all the downloaded published binaries. - roots_to_rm = [ - RedpandaInstaller.root_for_version(v) - for v in self._installed_per_node[node] - ] - if len(roots_to_rm) == 0: - return - node.account.remove(' '.join(roots_to_rm), allow_fail=True) + version_root = self.root_for_version(version) + arch = "amd64" + uname = str(node.account.ssh_output("uname -m")) + if "aarch" in uname or "arm" in uname: + arch = "arm64" + self._redpanda.logger.debug( + f"{node.account.hostname} uname output: {uname}") + + url = RedpandaInstaller.TGZ_URL_TEMPLATE.format( \ + arch=arch, version=f"{version[0]}.{version[1]}.{version[2]}") + tgz = "redpanda.tar.gz" + cmd = f"curl -fsSL {url} --create-dir --output-dir {version_root} -o {tgz} && gunzip -c {version_root}/{tgz} | tar -xf - -C {version_root} && rm {version_root}/{tgz}" + return node.account.ssh_capture(cmd) + + def reset_current_install(self, nodes): + """ + WARNING: should not be used to upgrade to the originally installed + binaries; use 'install(RedpandaInstaller.HEAD)' for that. This should + only be used to clean up a node to its expected starting state (the + state of the world before the first call to 'start()'). + + Resets any /opt/redpanda symlink to instead be real binaries if they + exist. This is a best attempt effort to revert the installs to their + original state (i.e. the state before installing other versions). + + Upon returning, either: + - this is a packaged deployment (CDT) and we are left with a real + /opt/redpanda directory (not a symlink) if possible, or + - this is a local deployment and we are left with no links to head + binaries + """ + head_root_path = self.root_for_version(RedpandaInstaller.HEAD) + for node in nodes: + host = node.account.hostname + if self._head_backed_up: + assert not self._nodes_share_installs + # NOTE: no locking required since installs aren't shared. + head_root_path_exists = node.account.exists(head_root_path) + opt_redpanda_exists = node.account.exists("/opt/redpanda") + if opt_redpanda_exists: + if not node.account.islink("/opt/redpanda"): + assert not head_root_path_exists, \ + f"{host}: {head_root_path} exists and /opt/redpanda exists but is not a link; unclear which to use" + continue + node.account.ssh_output("unlink /opt/redpanda", + allow_fail=True) + + assert head_root_path_exists, f"{host}: neither {head_root_path} nor /opt/redpanda exists" + node.account.ssh_output(f"mv {head_root_path} /opt/redpanda", + allow_fail=True) + else: + node.account.ssh_output("unlink /opt/redpanda", + allow_fail=True) diff --git a/tests/rptest/tests/fix_5355_upgrade_test.py b/tests/rptest/tests/fix_5355_upgrade_test.py index 455154950163..d7776299ba7d 100644 --- a/tests/rptest/tests/fix_5355_upgrade_test.py +++ b/tests/rptest/tests/fix_5355_upgrade_test.py @@ -39,7 +39,6 @@ def __init__(self, test_context): } super(Fix5355UpgradeTest, self).__init__(test_context=test_context, num_brokers=3, - enable_installer=True, extra_rp_conf=extra_rp_conf) self.installer = self.redpanda._installer diff --git a/tests/rptest/tests/redpanda_test.py b/tests/rptest/tests/redpanda_test.py index 3b69e5b44bb3..f81f7c75b1a2 100644 --- a/tests/rptest/tests/redpanda_test.py +++ b/tests/rptest/tests/redpanda_test.py @@ -34,7 +34,6 @@ def __init__(self, enable_pp=False, enable_sr=False, si_settings=None, - enable_installer=False, **kwargs): """ Any trailing keyword arguments are passed through to the @@ -43,7 +42,6 @@ def __init__(self, super(RedpandaTest, self).__init__(test_context) self.scale = Scale(test_context) self.si_settings = si_settings - self.enable_installer = enable_installer if num_brokers is None: # Default to a 3 node cluster if sufficient nodes are available, else @@ -65,7 +63,6 @@ def __init__(self, enable_pp=enable_pp, enable_sr=enable_sr, si_settings=self.si_settings, - enable_installer=enable_installer, **kwargs) self._client = DefaultClient(self.redpanda) diff --git a/tests/rptest/tests/upgrade_test.py b/tests/rptest/tests/upgrade_test.py index 7745e1ebccaf..5da4b8597363 100644 --- a/tests/rptest/tests/upgrade_test.py +++ b/tests/rptest/tests/upgrade_test.py @@ -23,9 +23,7 @@ class UpgradeFromSpecificVersion(RedpandaTest): """ def __init__(self, test_context): super(UpgradeFromSpecificVersion, - self).__init__(test_context=test_context, - num_brokers=3, - enable_installer=True) + self).__init__(test_context=test_context, num_brokers=3) self.installer = self.redpanda._installer def setUp(self): @@ -69,9 +67,7 @@ class UpgradeFromPriorFeatureVersionTest(RedpandaTest): """ def __init__(self, test_context): super(UpgradeFromPriorFeatureVersionTest, - self).__init__(test_context=test_context, - num_brokers=1, - enable_installer=True) + self).__init__(test_context=test_context, num_brokers=1) self.installer = self.redpanda._installer def setUp(self):