From 6e75078b232efdd71ada1f239a94de57437d2c82 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 1 Aug 2022 10:05:07 +0100 Subject: [PATCH] cluster: fix shutdown hang in health_monitor_backend If refresh_cluster_health_cache is waiting on _refresh_mutex while ::stop runs, and another fiber has a refresh in progress, then ::stop cancels the other fiber's refresh and the first fiber proceeds to try to refresh again, holding the gate open while ::stop is waiting for it to close. Fix this by breaking _refresh_mutex in ::stop, so that fibers waiting on it fail with ss::broken_semaphore, which is reported as errc::shutting_down. Fixes https://github.com/redpanda-data/redpanda/issues/5178 (cherry picked from commit d32c9a0489c7b021cc18b58a08ab1f5398b25954) --- src/v/cluster/health_monitor_backend.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/v/cluster/health_monitor_backend.cc b/src/v/cluster/health_monitor_backend.cc index f199df567260..3df355a02df4 100644 --- a/src/v/cluster/health_monitor_backend.cc +++ b/src/v/cluster/health_monitor_backend.cc @@ -112,6 +112,7 @@ ss::future<> health_monitor_backend::stop() { _leadership_notification_handle); auto f = _gate.close(); + _refresh_mutex.broken(); abort_current_refresh(); _tick_timer.cancel(); @@ -426,6 +427,9 @@ health_monitor_backend::maybe_refresh_cluster_health( err.message()); co_return err; } + } catch (const ss::broken_semaphore&) { + // Refresh was waiting on _refresh_mutex during shutdown + co_return errc::shutting_down; } catch (const ss::timed_out_error&) { vlog( clusterlog.info,