From 6e75078b232efdd71ada1f239a94de57437d2c82 Mon Sep 17 00:00:00 2001
From: John Spray <jcs@redpanda.com>
Date: Mon, 1 Aug 2022 10:05:07 +0100
Subject: [PATCH] cluster: fix shutdown hang in health_monitor_backend

If refresh_cluster_health_cache was waiting on _refresh_mutex
while ::stop ran, and another fiber had a refresh in progress,
then ::stop would cancel the other fiber's refresh and the first
fiber would proceed to try to refresh again, holding the gate open
while ::stop was waiting for it to close.

Fixes https://github.com/redpanda-data/redpanda/issues/5178

(cherry picked from commit d32c9a0489c7b021cc18b58a08ab1f5398b25954)
---
 src/v/cluster/health_monitor_backend.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/v/cluster/health_monitor_backend.cc b/src/v/cluster/health_monitor_backend.cc
index f199df567260..3df355a02df4 100644
--- a/src/v/cluster/health_monitor_backend.cc
+++ b/src/v/cluster/health_monitor_backend.cc
@@ -112,6 +112,7 @@ ss::future<> health_monitor_backend::stop() {
       _leadership_notification_handle);
 
     auto f = _gate.close();
+    _refresh_mutex.broken();
     abort_current_refresh();
     _tick_timer.cancel();
 
@@ -426,6 +427,9 @@ health_monitor_backend::maybe_refresh_cluster_health(
                   err.message());
                 co_return err;
             }
+        } catch (const ss::broken_semaphore&) {
+            // Refresh was waiting on _refresh_mutex during shutdown
+            co_return errc::shutting_down;
         } catch (const ss::timed_out_error&) {
             vlog(
               clusterlog.info,