From 0a2b991824b1bed784e323e10e1bcaf8d0e235b8 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Wed, 25 May 2022 11:58:37 +0100
Subject: [PATCH 1/2] cluster: drop redundant set_status config updates

If a follower isn't seeing controller log updates promptly, it may
issue many set_status RPCs while it's waiting. The controller leader
should not turn all of these into log writes: if the status of the
node already matches what it is reporting, then do not write anything.

Fixes https://github.com/redpanda-data/redpanda/issues/4923
---
 src/v/cluster/config_manager.h | 8 ++++++++
 src/v/cluster/fwd.h            | 1 +
 src/v/cluster/service.cc       | 6 ++++++
 src/v/cluster/service.h        | 2 ++
 src/v/redpanda/application.cc  | 1 +
 5 files changed, 18 insertions(+)

diff --git a/src/v/cluster/config_manager.h b/src/v/cluster/config_manager.h
index bd029d56d7b0..db583deabd12 100644
--- a/src/v/cluster/config_manager.h
+++ b/src/v/cluster/config_manager.h
@@ -71,6 +71,14 @@ class config_manager final {
 
     config_version get_version() const noexcept { return _seen_version; }
 
+    bool needs_update(const config_status& new_status) {
+        if (auto s = status.find(new_status.node); s != status.end()) {
+            return s->second != new_status;
+        } else {
+            return true;
+        }
+    }
+
 private:
     void merge_apply_result(
       config_status&,
diff --git a/src/v/cluster/fwd.h b/src/v/cluster/fwd.h
index 105e9ea267fc..6c3fa4c896ae 100644
--- a/src/v/cluster/fwd.h
+++ b/src/v/cluster/fwd.h
@@ -34,6 +34,7 @@ class security_frontend;
 class controller_api;
 class members_frontend;
 class config_frontend;
+class config_manager;
 class members_backend;
 class data_policy_frontend;
 class tx_gateway;
diff --git a/src/v/cluster/service.cc b/src/v/cluster/service.cc
index c18db7431c5a..3166d2e5485e 100644
--- a/src/v/cluster/service.cc
+++ b/src/v/cluster/service.cc
@@ -43,6 +43,7 @@ service::service(
   ss::sharded<controller_api>& api,
   ss::sharded<members_frontend>& members_frontend,
   ss::sharded<config_frontend>& config_frontend,
+  ss::sharded<config_manager>& config_manager,
   ss::sharded<feature_manager>& feature_manager,
   ss::sharded<feature_table>& feature_table,
   ss::sharded<health_monitor_frontend>& hm_frontend,
@@ -55,6 +56,7 @@ service::service(
   , _api(api)
   , _members_frontend(members_frontend)
   , _config_frontend(config_frontend)
+  , _config_manager(config_manager)
   , _feature_manager(feature_manager)
   , _feature_table(feature_table)
   , _hm_frontend(hm_frontend)
@@ -355,6 +357,10 @@ service::hello(hello_request&& req, rpc::streaming_context&) {
 
 ss::future<config_status_reply>
 service::config_status(config_status_request&& req, rpc::streaming_context&) {
+    if (!_config_manager.local().needs_update(req.status)) {
+        co_return config_status_reply{.error = errc::success};
+    }
+
     auto ec = co_await _config_frontend.local().set_status(
       req.status,
       config::shard_local_cfg().replicate_append_timeout_ms()
diff --git a/src/v/cluster/service.h b/src/v/cluster/service.h
index fd4dc17984e3..e324bcb99a8f 100644
--- a/src/v/cluster/service.h
+++ b/src/v/cluster/service.h
@@ -33,6 +33,7 @@ class service : public controller_service {
       ss::sharded<controller_api>&,
       ss::sharded<members_frontend>&,
       ss::sharded<config_frontend>&,
+      ss::sharded<config_manager>&,
       ss::sharded<feature_manager>&,
       ss::sharded<feature_table>&,
       ss::sharded<health_monitor_frontend>&,
@@ -130,6 +131,7 @@ class service : public controller_service {
     ss::sharded<controller_api>& _api;
     ss::sharded<members_frontend>& _members_frontend;
     ss::sharded<config_frontend>& _config_frontend;
+    ss::sharded<config_manager>& _config_manager;
     ss::sharded<feature_manager>& _feature_manager;
     ss::sharded<feature_table>& _feature_table;
     ss::sharded<health_monitor_frontend>& _hm_frontend;
diff --git a/src/v/redpanda/application.cc b/src/v/redpanda/application.cc
index 6f59434159ea..6490f8e91459 100644
--- a/src/v/redpanda/application.cc
+++ b/src/v/redpanda/application.cc
@@ -1204,6 +1204,7 @@ void application::start_redpanda(::stop_signal& app_signal) {
       std::ref(controller->get_api()),
       std::ref(controller->get_members_frontend()),
       std::ref(controller->get_config_frontend()),
+      std::ref(controller->get_config_manager()),
       std::ref(controller->get_feature_manager()),
       std::ref(controller->get_feature_table()),
       std::ref(controller->get_health_monitor()),

From 3b1e56ea303de00f7e2f7e5fb4b1195738d27b54 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Wed, 25 May 2022 12:02:07 +0100
Subject: [PATCH 2/2] cluster: wait between config set_status RPCs

On a healthy system, we do want to send set_status RPCs as soon as
we're ready. However, if the controller log updates are not being seen
promptly, this would lead to the follower spamming the controller
leader with very many set_status RPCs in a tight loop.

Nodes will still send their status immediately when a config change
occurs: this change only affects the behaviour if _another_ config
change occurs while a node is still reporting status from the first
change. In that case the follower will wait 5 seconds before sending
its next status RPC.

Related https://github.com/redpanda-data/redpanda/issues/4923
---
 src/v/cluster/config_manager.cc | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/v/cluster/config_manager.cc b/src/v/cluster/config_manager.cc
index 14e3670b851b..756e929ea2bc 100644
--- a/src/v/cluster/config_manager.cc
+++ b/src/v/cluster/config_manager.cc
@@ -516,12 +516,14 @@ ss::future<> config_manager::reconcile_status() {
     }
 
     try {
-        if (failed) {
-            // If we were dirty & failed to send our update, sleep until retry
+        if (failed || should_send_status()) {
+            // * we were dirty & failed to send our update, sleep until retry
+            // OR
+            // * our status updated while we were sending, wait a short time
+            //   before sending our next update to avoid spamming the leader
+            //   with too many set_status RPCs if we are behind on seeing
+            //   updates to the controller log.
             co_await ss::sleep_abortable(status_retry, _as.local());
-        } else if (should_send_status()) {
-            // Our status updated while we were sending, proceed
-            // immediately to next iteration of loop.
         } else {
             // We are clean: sleep until signalled.
             co_await _reconcile_wait.wait();
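
Taken together, the two patches form a simple contract: the leader ignores
duplicate status reports (no controller log write), and the follower backs
off between reports when it is behind. Below is a minimal standalone sketch
of that contract in plain C++, separate from the patches themselves; all
names here -- node_status, status_map, needs_update, k_status_retry -- are
illustrative stand-ins, not redpanda's actual types.

#include <chrono>
#include <iostream>
#include <map>
#include <thread>

// Stand-in for a node's reported config status.
struct node_status {
    int node;
    long version; // highest config version the node has applied

    bool operator==(const node_status& o) const {
        return node == o.node && version == o.version;
    }
};

using status_map = std::map<int, node_status>;

// Leader side (patch 1): a report only becomes new work if it differs
// from what is already recorded for that node.
bool needs_update(const status_map& status, const node_status& incoming) {
    auto it = status.find(incoming.node);
    return it == status.end() || !(it->second == incoming);
}

// Follower side (patch 2): interval to wait between consecutive status
// RPCs when we are behind (mirrors the 5 seconds in the commit message).
constexpr auto k_status_retry = std::chrono::seconds(5);

int main() {
    status_map leader_state;
    node_status report{1, 7};

    // First report is new: accepted, would be replicated to the log.
    if (needs_update(leader_state, report)) {
        leader_state[report.node] = report;
        std::cout << "accepted, log write\n";
    }

    // Re-sending the same report is now a no-op on the leader.
    if (!needs_update(leader_state, report)) {
        std::cout << "duplicate, no log write\n";
    }

    // If another change arrived while we were reporting, wait before the
    // next RPC instead of looping immediately.
    bool another_change_pending = true;
    if (another_change_pending) {
        std::this_thread::sleep_for(k_status_retry);
    }
    return 0;
}

The real follower half is a Seastar coroutine (ss::sleep_abortable and a
condition-variable wait, as in the config_manager.cc hunk above), but the
control flow is the same two-sided idempotency-plus-backoff idea.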