From d32c9a0489c7b021cc18b58a08ab1f5398b25954 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 1 Aug 2022 10:05:07 +0100 Subject: [PATCH] cluster: fix shutdown hang in health_monitor_backend If refresh_cluster_health_cache was waiting on _refresh_mutex while ::stop ran, and another fiber had a refresh in progress, then ::stop cancelled the other fiber's refresh and the first fiber proceeded to try to refresh again, holding the gate open while ::stop was waiting for it to close. Fix this by marking _refresh_mutex as broken in ::stop, so that fibers waiting on it get ss::broken_semaphore and return errc::shutting_down instead of starting a new refresh. Fixes https://github.com/redpanda-data/redpanda/issues/5178 --- src/v/cluster/health_monitor_backend.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/v/cluster/health_monitor_backend.cc b/src/v/cluster/health_monitor_backend.cc index 0ccdd7e654a8..2111c55bdbb8 100644 --- a/src/v/cluster/health_monitor_backend.cc +++ b/src/v/cluster/health_monitor_backend.cc @@ -121,6 +121,7 @@ ss::future<> health_monitor_backend::stop() { _leadership_notification_handle); auto f = _gate.close(); + _refresh_mutex.broken(); abort_current_refresh(); _tick_timer.cancel(); @@ -479,6 +480,9 @@ health_monitor_backend::maybe_refresh_cluster_health( err.message()); co_return err; } + } catch (const ss::broken_semaphore&) { + // Refresh was waiting on _refresh_mutex during shutdown + co_return errc::shutting_down; } catch (const ss::timed_out_error&) { vlog( clusterlog.info,