From d4aeba6ca8af737f22c70bc7de32b7fd07d6db7c Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 29 Jun 2022 05:09:23 +0000
Subject: [PATCH 001/201] build(deps): bump jackson-databind in
 /tests/java/e2e-verifiers

Bumps [jackson-databind](https://github.com/FasterXML/jackson) from 2.13.1 to 2.13.2.1.
- [Release notes](https://github.com/FasterXML/jackson/releases)
- [Commits](https://github.com/FasterXML/jackson/commits)

---
updated-dependencies:
- dependency-name: com.fasterxml.jackson.core:jackson-databind
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 tests/java/e2e-verifiers/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/java/e2e-verifiers/pom.xml b/tests/java/e2e-verifiers/pom.xml
index bd4841c852566..abb1ab621d904 100644
--- a/tests/java/e2e-verifiers/pom.xml
+++ b/tests/java/e2e-verifiers/pom.xml
@@ -17,7 +17,7 @@
         <argparse4j.version>0.8.1</argparse4j.version>
         <slf4j.version>1.7.30</slf4j.version>
         <buildDir>${project.basedir}/target</buildDir>
-        <jackson.version>2.13.1</jackson.version>
+        <jackson.version>2.13.2.1</jackson.version>
         <log4j.version>1.2.17</log4j.version>
     </properties>
     <dependencies>

From 6d650b586c45a907253b365c2837fae590e10ab6 Mon Sep 17 00:00:00 2001
From: John Spray <jcs@redpanda.com>
Date: Fri, 1 Jul 2022 13:32:23 +0100
Subject: [PATCH 002/201] test: make clean_node safer against shutdown/kill
 race

This could fail spuriously if a process was just about
to terminate anyway: kill_process finds a PID, but then
fails to actually kill it because it died of its own
accord in the intervening time.
---
 tests/rptest/services/redpanda.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tests/rptest/services/redpanda.py b/tests/rptest/services/redpanda.py
index c3b6aebf1abf3..130a9abd92211 100644
--- a/tests/rptest/services/redpanda.py
+++ b/tests/rptest/services/redpanda.py
@@ -1172,8 +1172,16 @@ def clean(self, **kwargs):
             self.delete_bucket_from_si()
 
     def clean_node(self, node, preserve_logs=False, clean_installs=True):
-        node.account.kill_process("redpanda", clean_shutdown=False)
-        node.account.kill_process("bin/node", clean_shutdown=False)
+        # These are allow_fail=True to allow for a race where kill_process finds
+        # the PID, but then the process has died before it sends the SIGKILL.  This
+        # should be safe against actual failures of the process to stop, because
+        # we're using SIGKILL which does not require the process's cooperation.
+        node.account.kill_process("redpanda",
+                                  clean_shutdown=False,
+                                  allow_fail=True)
+        node.account.kill_process("bin/node",
+                                  clean_shutdown=False,
+                                  allow_fail=True)
         if node.account.exists(RedpandaService.PERSISTENT_ROOT):
             if node.account.sftp_client.listdir(
                     RedpandaService.PERSISTENT_ROOT):

From af816a4d99bc0d20e25148a5cc8903b935997495 Mon Sep 17 00:00:00 2001
From: Kefu Chai <tchaikov@gmail.com>
Date: Fri, 1 Jul 2022 21:18:52 +0800
Subject: [PATCH 003/201] serde: use concepts instead of type traits

* rewrite the type traits using concept for better readability.
* reuse the is_envelope<> concept where appropriate;
  less repetition this way
* drop unnecessary constraints
  detail::compat_version_has_serde_version_type implies detail::has_compat_attribute,
  so we can drop the latter.
  detail::version_has_serde_version_type implies detail::has_version_attribute,
  so we can drop the latter.
* drop helper concepts and inline them
  they are not used elsewhere, so better off inlining them.
* replace std::decay_t with same_as<const serde::version_t&>, shorter
  this way. this helps us to drop the `#clang-format off` guard.
* check the value of tags, as they should be true.

Signed-off-by: Kefu Chai <tchaikov@gmail.com>
---
 src/v/serde/envelope.h                | 75 +++------------------------
 src/v/serde/envelope_for_each_field.h |  8 +--
 src/v/serde/serde.h                   | 16 +++---
 src/v/serde/test/fuzz.cc              |  9 +---
 src/v/serde/test/serde_test.cc        | 12 ++---
 5 files changed, 28 insertions(+), 92 deletions(-)

diff --git a/src/v/serde/envelope.h b/src/v/serde/envelope.h
index 83c8d24f606df..c4c02a960c04b 100644
--- a/src/v/serde/envelope.h
+++ b/src/v/serde/envelope.h
@@ -10,6 +10,7 @@
 #pragma once
 
 #include <cinttypes>
+#include <concepts>
 #include <type_traits>
 
 namespace serde {
@@ -67,77 +68,17 @@ struct checksum_envelope {
     static constexpr auto redpanda_serde_build_checksum = true;
 };
 
-namespace detail {
-
-template<typename T, typename = void>
-struct has_compat_attribute : std::false_type {};
-
-template<typename T>
-struct has_compat_attribute<
-  T,
-  std::void_t<decltype(std::declval<T>().redpanda_serde_compat_version)>>
-  : std::true_type {};
-
-template<typename T, typename = void>
-struct has_version_attribute : std::false_type {};
-
-template<typename T>
-struct has_version_attribute<
-  T,
-  std::void_t<decltype(std::declval<T>().redpanda_serde_version)>>
-  : std::true_type {};
-
-template<typename T, typename = void>
-struct inherits_from_envelope : std::false_type {};
-
-template<typename T>
-struct inherits_from_envelope<
-  T,
-  std::void_t<decltype(std::declval<T>().redpanda_inherits_from_envelope)>>
-  : std::true_type {};
-
-template<typename T>
-struct compat_version_has_serde_version_type {
-    static constexpr auto const value = std::is_same_v<
-      std::decay_t<decltype(std::declval<T>().redpanda_serde_compat_version)>,
-      version_t>;
+template<typename T, typename Version = const serde::version_t&>
+concept is_envelope = requires {
+    { T::redpanda_serde_version } -> std::same_as<Version>;
+    { T::redpanda_serde_compat_version } -> std::same_as<Version>;
 };
 
 template<typename T>
-struct version_has_serde_version_type {
-    static constexpr auto const value = std::is_same_v<
-      std::decay_t<decltype(std::declval<T>().redpanda_serde_version)>,
-      version_t>;
-};
-
-template<typename T, typename = void>
-struct has_checksum_attribute : std::false_type {};
-
-template<typename T>
-struct has_checksum_attribute<
-  T,
-  std::void_t<decltype(std::declval<T>().redpanda_serde_build_checksum)>>
-  : std::true_type {};
-
-} // namespace detail
-
-template<typename T>
-inline constexpr auto const is_envelope_v = std::conjunction_v<
-  detail::has_compat_attribute<T>,
-  detail::has_version_attribute<T>,
-  detail::compat_version_has_serde_version_type<T>,
-  detail::version_has_serde_version_type<T>>;
-
-template<typename T>
-inline constexpr auto const is_checksum_envelope_v = std::conjunction_v<
-  detail::has_compat_attribute<T>,
-  detail::has_version_attribute<T>,
-  detail::compat_version_has_serde_version_type<T>,
-  detail::version_has_serde_version_type<T>,
-  detail::has_checksum_attribute<T>>;
+concept is_checksum_envelope
+  = is_envelope<T> && T::redpanda_serde_build_checksum;
 
 template<typename T>
-inline constexpr auto const inherits_from_envelope_v
-  = detail::inherits_from_envelope<T>::value;
+concept inherits_from_envelope = T::redpanda_inherits_from_envelope;
 
 } // namespace serde
diff --git a/src/v/serde/envelope_for_each_field.h b/src/v/serde/envelope_for_each_field.h
index 772c4f12397f4..20e452fd33e39 100644
--- a/src/v/serde/envelope_for_each_field.h
+++ b/src/v/serde/envelope_for_each_field.h
@@ -215,8 +215,8 @@ constexpr inline auto envelope_to_tuple(T& t) {
 template<typename T, typename Fn>
 inline auto envelope_for_each_field(T& t, Fn&& fn) -> std::enable_if_t<
   !std::is_convertible_v<decltype(fn(std::declval<int&>())), bool>> {
-    static_assert(is_envelope_v<std::decay_t<T>>);
-    if constexpr (inherits_from_envelope_v<std::decay_t<T>>) {
+    static_assert(is_envelope<std::decay_t<T>>);
+    if constexpr (inherits_from_envelope<std::decay_t<T>>) {
         std::apply(
           [&](auto&&... args) { (fn(args), ...); }, envelope_to_tuple(t));
     } else {
@@ -228,8 +228,8 @@ inline auto envelope_for_each_field(T& t, Fn&& fn) -> std::enable_if_t<
 template<typename T, typename Fn>
 inline auto envelope_for_each_field(T& t, Fn&& fn) -> std::enable_if_t<
   std::is_convertible_v<decltype(fn(std::declval<int&>())), bool>> {
-    static_assert(is_envelope_v<std::decay_t<T>>);
-    if constexpr (inherits_from_envelope_v<std::decay_t<T>>) {
+    static_assert(is_envelope<std::decay_t<T>>);
+    if constexpr (inherits_from_envelope<std::decay_t<T>>) {
         std::apply(
           [&](auto&&... args) { (void)(fn(args) && ...); },
           envelope_to_tuple(t));
diff --git a/src/v/serde/serde.h b/src/v/serde/serde.h
index 4174ef5a53471..015ca7e0b2f04 100644
--- a/src/v/serde/serde.h
+++ b/src/v/serde/serde.h
@@ -159,7 +159,7 @@ inline constexpr bool is_absl_node_hash_map_v = is_absl_node_hash_map<T>::value;
 
 template<typename T>
 inline constexpr auto const is_serde_compatible_v
-  = is_envelope_v<T>
+  = is_envelope<T>
     || (std::is_scalar_v<T>  //
          && (!std::is_same_v<float, T> || std::numeric_limits<float>::is_iec559)
          && (!std::is_same_v<double, T> || std::numeric_limits<double>::is_iec559)
@@ -190,14 +190,14 @@ void write(iobuf& out, T t) {
     static_assert(are_bytes_and_string_different<Type>);
     static_assert(has_serde_write<Type> || is_serde_compatible_v<Type>);
 
-    if constexpr (is_envelope_v<Type>) {
+    if constexpr (is_envelope<Type>) {
         write(out, Type::redpanda_serde_version);
         write(out, Type::redpanda_serde_compat_version);
 
         auto size_placeholder = out.reserve(sizeof(serde_size_t));
 
         auto checksum_placeholder = iobuf::placeholder{};
-        if constexpr (is_checksum_envelope_v<Type>) {
+        if constexpr (is_checksum_envelope<Type>) {
             checksum_placeholder = out.reserve(sizeof(checksum_t));
         }
 
@@ -218,7 +218,7 @@ void write(iobuf& out, T t) {
         size_placeholder.write(
           reinterpret_cast<char const*>(&size), sizeof(serde_size_t));
 
-        if constexpr (is_checksum_envelope_v<Type>) {
+        if constexpr (is_checksum_envelope<Type>) {
             auto crc = crc::crc32c{};
             auto in = iobuf_const_parser{out};
             in.skip(size_before);
@@ -388,7 +388,7 @@ header read_header(iobuf_parser& in, std::size_t const bytes_left_limit) {
     auto const size = read_nested<serde_size_t>(in, bytes_left_limit);
 
     auto checksum = checksum_t{};
-    if constexpr (is_checksum_envelope_v<T>) {
+    if constexpr (is_checksum_envelope<T>) {
         checksum = read_nested<checksum_t>(in, bytes_left_limit);
     }
 
@@ -443,10 +443,10 @@ void read_nested(iobuf_parser& in, T& t, std::size_t const bytes_left_limit) {
     static_assert(are_bytes_and_string_different<Type>);
     static_assert(has_serde_read<T> || is_serde_compatible_v<Type>);
 
-    if constexpr (is_envelope_v<Type>) {
+    if constexpr (is_envelope<Type>) {
         auto const h = read_header<Type>(in, bytes_left_limit);
 
-        if constexpr (is_checksum_envelope_v<Type>) {
+        if constexpr (is_checksum_envelope<Type>) {
             auto const shared = in.share(in.bytes_left() - h._bytes_left_limit);
             auto read_only_in = iobuf_const_parser{shared};
             auto crc = crc::crc32c{};
@@ -695,7 +695,7 @@ ss::future<std::decay_t<T>> read_async(iobuf_parser& in) {
 template<typename T>
 ss::future<> write_async(iobuf& out, T const& t) {
     using Type = std::decay_t<T>;
-    if constexpr (is_envelope_v<Type> && has_serde_async_write<Type>) {
+    if constexpr (is_envelope<Type> && has_serde_async_write<Type>) {
         write(out, Type::redpanda_serde_version);
         write(out, Type::redpanda_serde_compat_version);
 
diff --git a/src/v/serde/test/fuzz.cc b/src/v/serde/test/fuzz.cc
index 91840047863ae..115dc184a9cf2 100644
--- a/src/v/serde/test/fuzz.cc
+++ b/src/v/serde/test/fuzz.cc
@@ -18,12 +18,7 @@ bool eq(
     return ((std::get<I>(a) == std::get<I>(b)) && ...);
 }
 
-template<
-  typename T1,
-  typename T2,
-  typename std::enable_if_t<
-    serde::is_envelope_v<T1> && serde::is_envelope_v<T2>,
-    void*> = nullptr>
+template<serde::is_envelope T1, serde::is_envelope T2>
 bool operator==(T1 const& a, T2 const& b) {
     return eq(
       envelope_to_tuple(a),
@@ -69,7 +64,7 @@ void init(
   data_gen& gen,
   std::index_sequence<Generation...> generations,
   int depth = 0) {
-    if constexpr (serde::is_envelope_v<T>) {
+    if constexpr (serde::is_envelope<T>) {
         ((std::apply(
            [&](auto&&... args) {
                (init(args, gen, generations, depth + 1), ...);
diff --git a/src/v/serde/test/serde_test.cc b/src/v/serde/test/serde_test.cc
index a631f6c672ce3..f8ce5abf50708 100644
--- a/src/v/serde/test/serde_test.cc
+++ b/src/v/serde/test/serde_test.cc
@@ -90,10 +90,10 @@ struct test_msg1_new_manual {
 };
 
 struct not_an_envelope {};
-static_assert(!serde::is_envelope_v<not_an_envelope>);
-static_assert(serde::is_envelope_v<test_msg1>);
-static_assert(serde::inherits_from_envelope_v<test_msg1_new>);
-static_assert(!serde::inherits_from_envelope_v<test_msg1_new_manual>);
+static_assert(!serde::is_envelope<not_an_envelope>);
+static_assert(serde::is_envelope<test_msg1>);
+static_assert(serde::inherits_from_envelope<test_msg1_new>);
+static_assert(!serde::inherits_from_envelope<test_msg1_new_manual>);
 static_assert(test_msg1::redpanda_serde_version == 4);
 static_assert(test_msg1::redpanda_serde_compat_version == 0);
 
@@ -234,7 +234,7 @@ struct complex_msg : serde::envelope<complex_msg, serde::version<3>> {
     int32_t _x;
 };
 
-static_assert(serde::is_envelope_v<complex_msg>);
+static_assert(serde::is_envelope<complex_msg>);
 
 SEASTAR_THREAD_TEST_CASE(complex_msg_test) {
     auto b = iobuf();
@@ -386,7 +386,7 @@ struct test_snapshot_header
     int32_t metadata_size;
 };
 
-static_assert(serde::is_envelope_v<test_snapshot_header>);
+static_assert(serde::is_envelope<test_snapshot_header>);
 static_assert(serde::has_serde_async_read<test_snapshot_header>);
 static_assert(serde::has_serde_async_write<test_snapshot_header>);
 

From aa0f1d654b075cf286f84a440a7929d7c5f59c70 Mon Sep 17 00:00:00 2001
From: Kefu Chai <tchaikov@gmail.com>
Date: Fri, 1 Jul 2022 21:42:33 +0800
Subject: [PATCH 004/201] serde: convert has_serde_fields_v to a concept

more readable this way.

Signed-off-by: Kefu Chai <tchaikov@gmail.com>
---
 src/v/serde/envelope_for_each_field.h | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/src/v/serde/envelope_for_each_field.h b/src/v/serde/envelope_for_each_field.h
index 20e452fd33e39..ba179b89287a4 100644
--- a/src/v/serde/envelope_for_each_field.h
+++ b/src/v/serde/envelope_for_each_field.h
@@ -19,31 +19,21 @@ namespace serde {
 
 namespace detail {
 
-template<typename T, typename = void>
-struct has_serde_fields : std::false_type {};
-
-template<typename T>
-struct has_serde_fields<
-  T,
-  std::void_t<decltype(std::declval<std::decay_t<T>>().serde_fields())>>
-  : std::true_type {};
-
 template<typename T>
-inline constexpr auto const has_serde_fields_v = has_serde_fields<T>::value;
+concept has_serde_fields = requires(T t) {
+    t.serde_fields();
+};
 
 } // namespace detail
 
-template<
-  typename T,
-  std::enable_if_t<detail::has_serde_fields_v<T>, void*> = nullptr>
+template<detail::has_serde_fields T>
 constexpr inline auto envelope_to_tuple(T&& t) {
     return t.serde_fields();
 }
 
-template<
-  typename T,
-  std::enable_if_t<!detail::has_serde_fields_v<T>, void*> = nullptr>
-constexpr inline auto envelope_to_tuple(T& t) {
+template<typename T>
+requires(!detail::has_serde_fields<T>) constexpr inline auto envelope_to_tuple(
+  T& t) {
     static_assert(std::is_aggregate_v<T>);
     static_assert(std::is_standard_layout_v<T>);
     static_assert(!std::is_polymorphic_v<T>);

From 6eba9cab06a6b242a7aaee8e419c4870e58f4dce Mon Sep 17 00:00:00 2001
From: Kefu Chai <tchaikov@gmail.com>
Date: Wed, 6 Jul 2022 20:51:25 +0800
Subject: [PATCH 005/201] serde: add check_for_more_fn concept

instead of using `std::enable_if_t<>`, define a named concept for
the type constraint of the function parameter.

better readability this way.

Signed-off-by: Kefu Chai <tchaikov@gmail.com>
---
 src/v/serde/envelope_for_each_field.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/v/serde/envelope_for_each_field.h b/src/v/serde/envelope_for_each_field.h
index ba179b89287a4..2fd696c96e2d2 100644
--- a/src/v/serde/envelope_for_each_field.h
+++ b/src/v/serde/envelope_for_each_field.h
@@ -202,10 +202,13 @@ requires(!detail::has_serde_fields<T>) constexpr inline auto envelope_to_tuple(
     }
 }
 
-template<typename T, typename Fn>
-inline auto envelope_for_each_field(T& t, Fn&& fn) -> std::enable_if_t<
-  !std::is_convertible_v<decltype(fn(std::declval<int&>())), bool>> {
-    static_assert(is_envelope<std::decay_t<T>>);
+template<typename Fn>
+concept check_for_more_fn = requires(Fn&& fn, int& f) {
+    { fn(f) } -> std::convertible_to<bool>;
+};
+
+template<is_envelope T, typename Fn>
+inline auto envelope_for_each_field(T& t, Fn&& fn) {
     if constexpr (inherits_from_envelope<std::decay_t<T>>) {
         std::apply(
           [&](auto&&... args) { (fn(args), ...); }, envelope_to_tuple(t));
@@ -215,10 +218,8 @@ inline auto envelope_for_each_field(T& t, Fn&& fn) -> std::enable_if_t<
     }
 }
 
-template<typename T, typename Fn>
-inline auto envelope_for_each_field(T& t, Fn&& fn) -> std::enable_if_t<
-  std::is_convertible_v<decltype(fn(std::declval<int&>())), bool>> {
-    static_assert(is_envelope<std::decay_t<T>>);
+template<is_envelope T, check_for_more_fn Fn>
+inline auto envelope_for_each_field(T& t, Fn&& fn) {
     if constexpr (inherits_from_envelope<std::decay_t<T>>) {
         std::apply(
           [&](auto&&... args) { (void)(fn(args) && ...); },

From 131122c1a691fdcda303984b53e910d711c87b29 Mon Sep 17 00:00:00 2001
From: Kefu Chai <tchaikov@gmail.com>
Date: Wed, 6 Jul 2022 20:52:26 +0800
Subject: [PATCH 006/201] serde: rewrite has_serde_async_{read,write} using
 concepts

for better readability

Signed-off-by: Kefu Chai <tchaikov@gmail.com>
---
 src/v/serde/serde.h | 57 +++++++++++----------------------------------
 1 file changed, 13 insertions(+), 44 deletions(-)

diff --git a/src/v/serde/serde.h b/src/v/serde/serde.h
index 015ca7e0b2f04..8cc351cbe5993 100644
--- a/src/v/serde/serde.h
+++ b/src/v/serde/serde.h
@@ -69,57 +69,26 @@ struct header {
     checksum_t _checksum;
 };
 
-template<typename T, typename = void>
-struct help_has_serde_read : std::false_type {};
-
-template<typename T>
-struct help_has_serde_read<
-  T,
-  std::void_t<decltype(std::declval<T>().serde_read(
-    std::declval<std::add_lvalue_reference_t<iobuf_parser>>(),
-    std::declval<header>()))>> : std::true_type {};
-
-template<typename T>
-inline constexpr auto const has_serde_read = help_has_serde_read<T>::value;
-
-template<typename T, typename = void>
-struct help_has_serde_write : std::false_type {};
-
-template<typename T>
-struct help_has_serde_write<
-  T,
-  std::void_t<decltype(std::declval<T>().serde_write(
-    std::declval<std::add_lvalue_reference_t<iobuf>>()))>> : std::true_type {};
-
 template<typename T>
-inline constexpr auto const has_serde_write = help_has_serde_write<T>::value;
-
-template<typename T, typename = void>
-struct help_has_serde_async_read : std::false_type {};
-
-template<typename T>
-struct help_has_serde_async_read<
-  T,
-  std::void_t<decltype(std::declval<T>().serde_async_read(
-    std::declval<std::add_lvalue_reference_t<iobuf_parser>>(),
-    std::declval<header>()))>> : std::true_type {};
+concept has_serde_read = requires(T t, iobuf_parser& in, const header& h) {
+    t.serde_read(in, h);
+};
 
 template<typename T>
-inline constexpr auto const has_serde_async_read
-  = help_has_serde_async_read<T>::value;
-
-template<typename T, typename = void>
-struct help_has_serde_async_write : std::false_type {};
+concept has_serde_write = requires(T t, iobuf& out) {
+    t.serde_write(out);
+};
 
 template<typename T>
-struct help_has_serde_async_write<
-  T,
-  std::void_t<decltype(std::declval<T>().serde_async_write(
-    std::declval<std::add_lvalue_reference_t<iobuf>>()))>> : std::true_type {};
+concept has_serde_async_read
+  = requires(T t, iobuf_parser& in, const header& h) {
+    t.serde_async_read(in, h);
+};
 
 template<typename T>
-inline constexpr auto const has_serde_async_write
-  = help_has_serde_async_write<T>::value;
+concept has_serde_async_write = requires(T t, iobuf& out) {
+    t.serde_async_write(out);
+};
 
 using serde_enum_serialized_t = int32_t;
 

From 25c31de8421340dcbf95f1d489b1e1d6fb2f8892 Mon Sep 17 00:00:00 2001
From: Kefu Chai <tchaikov@gmail.com>
Date: Mon, 4 Jul 2022 21:52:23 +0800
Subject: [PATCH 007/201] serde: make has_serde_async_{read,write} more strict

ensure that they return future<>.

Signed-off-by: Kefu Chai <tchaikov@gmail.com>
---
 src/v/serde/serde.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/v/serde/serde.h b/src/v/serde/serde.h
index 8cc351cbe5993..4bbfca2e017ac 100644
--- a/src/v/serde/serde.h
+++ b/src/v/serde/serde.h
@@ -25,6 +25,7 @@
 #include "utils/named_type.h"
 #include "vlog.h"
 
+#include <seastar/core/future.hh>
 #include <seastar/net/inet_address.hh>
 
 #include <absl/container/node_hash_map.h>
@@ -82,12 +83,12 @@ concept has_serde_write = requires(T t, iobuf& out) {
 template<typename T>
 concept has_serde_async_read
   = requires(T t, iobuf_parser& in, const header& h) {
-    t.serde_async_read(in, h);
+    { t.serde_async_read(in, h) } -> seastar::Future;
 };
 
 template<typename T>
 concept has_serde_async_write = requires(T t, iobuf& out) {
-    t.serde_async_write(out);
+    { t.serde_async_write(out) } -> seastar::Future;
 };
 
 using serde_enum_serialized_t = int32_t;

From 9b7174ae200c619cd0753f06146c73f4f5e1a389 Mon Sep 17 00:00:00 2001
From: Alexey Biryukov <alexey@redpanda.com>
Date: Wed, 6 Jul 2022 18:06:31 -0400
Subject: [PATCH 008/201] kafka: fixed: find tran coordinator was not ACL
 verified

The code to handle FindCoordinator request for transaction coordinator type
appeared before the caller is checked for authorization for this operation
against the ACL. Now the check has been moved before any other handling.
---
 .../kafka/server/handlers/find_coordinator.cc | 30 +++++++++----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/v/kafka/server/handlers/find_coordinator.cc b/src/v/kafka/server/handlers/find_coordinator.cc
index e10100e028f1f..3147c07f1eae1 100644
--- a/src/v/kafka/server/handlers/find_coordinator.cc
+++ b/src/v/kafka/server/handlers/find_coordinator.cc
@@ -72,6 +72,21 @@ ss::future<response_ptr> find_coordinator_handler::handle(
     find_coordinator_request request;
     request.decode(ctx.reader(), ctx.header().version);
 
+    if (request.data.key_type == coordinator_type::group) {
+        if (!ctx.authorized(
+              security::acl_operation::describe, group_id(request.data.key))) {
+            return ctx.respond(find_coordinator_response(
+              error_code::group_authorization_failed));
+        }
+    } else if (request.data.key_type == coordinator_type::transaction) {
+        if (!ctx.authorized(
+              security::acl_operation::describe,
+              transactional_id(request.data.key))) {
+            return ctx.respond(find_coordinator_response(
+              error_code::transactional_id_authorization_failed));
+        }
+    }
+
     if (request.data.key_type == coordinator_type::transaction) {
         if (!ctx.are_transactions_enabled()) {
             return ctx.respond(
@@ -98,21 +113,6 @@ ss::future<response_ptr> find_coordinator_handler::handle(
           find_coordinator_response(error_code::unsupported_version));
     }
 
-    if (request.data.key_type == coordinator_type::group) {
-        if (!ctx.authorized(
-              security::acl_operation::describe, group_id(request.data.key))) {
-            return ctx.respond(find_coordinator_response(
-              error_code::group_authorization_failed));
-        }
-    } else if (request.data.key_type == coordinator_type::transaction) {
-        if (!ctx.authorized(
-              security::acl_operation::describe,
-              transactional_id(request.data.key))) {
-            return ctx.respond(find_coordinator_response(
-              error_code::transactional_id_authorization_failed));
-        }
-    }
-
     return ss::do_with(
       std::move(ctx),
       [request = std::move(request)](request_context& ctx) mutable {

From 53eaaa536661361ab996772871a5980c65a476c2 Mon Sep 17 00:00:00 2001
From: Ryan Russell <git@ryanrussell.org>
Date: Thu, 7 Jul 2022 16:44:10 -0500
Subject: [PATCH 009/201] docs(/src/.md): Readability improvements

Signed-off-by: Ryan Russell <git@ryanrussell.org>
---
 src/go/k8s/README.md            | 20 ++++++++++----------
 src/go/rpk/pkg/tuners/ReadMe.md |  2 +-
 src/v/README.md                 |  4 ++--
 src/v/coding-style.md           |  2 +-
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/go/k8s/README.md b/src/go/k8s/README.md
index 22a5de3759c42..5cf39eec5e625 100644
--- a/src/go/k8s/README.md
+++ b/src/go/k8s/README.md
@@ -20,7 +20,7 @@ Official Kubernetes quick start documentation can be found at
 * kustomize v3.8.7 or newer
 * cert-manager v1.0.0 or newer
 
-Optionaly to run operator locally:
+Optionally to run operator locally:
 
 * kind v0.9.0 or newer
 
@@ -30,7 +30,7 @@ Optionaly to run operator locally:
 
 Create local Kubernetes cluster using KIND
 
-```
+```bash
 export KUBECONFIG=your/path/to/kubeconfig.yaml
 kind create cluster --config kind.yaml
 ```
@@ -47,19 +47,19 @@ resources. To verify that cert manager is ready please follow
 
 You can simply deploy the Redpanda operator with webhook (recommended) by running the following command
 
-```
+```bash
 kubectl apply -k https://github.com/redpanda-data/redpanda/src/go/k8s/config/default
 ```
 
 You can deploy the Redpanda operator without webhook by running the following command:
 
-```
+```bash
 kubectl apply -k https://github.com/redpanda-data/redpanda/src/go/k8s/config/without-webhook
 ```
 
 Install sample RedpandaCluster custom resource
 
-```
+```bash
 kubectl apply -f https://raw.githubusercontent.com/redpanda-data/redpanda/dev/src/go/k8s/config/samples/one_node_cluster.yaml
 ```
 
@@ -68,26 +68,26 @@ kubectl apply -f https://raw.githubusercontent.com/redpanda-data/redpanda/dev/sr
 
 Create kind cluster
 
-```
+```bash
 make kind-create
 ```
 
 Install cert manager
 
-```
+```bash
 make certmanager-install
 ```
 
 Build docker images for manager and configurator
 
-```
+```bash
 make docker-build
 make docker-build-configurator
 ```
 
 Deploy operator to kind
 
-```
+```bash
 make deploy-to-kind
 ```
 
@@ -96,6 +96,6 @@ make deploy-to-kind
 To remove all resources even the running Redpanda cluster
 please run the following command:
 
-```
+```bash
 kubectl delete -k https://github.com/redpanda-data/redpanda/src/go/k8s/config/default
 ```
diff --git a/src/go/rpk/pkg/tuners/ReadMe.md b/src/go/rpk/pkg/tuners/ReadMe.md
index 4f82a46515be9..180ba97067ca6 100644
--- a/src/go/rpk/pkg/tuners/ReadMe.md
+++ b/src/go/rpk/pkg/tuners/ReadMe.md
@@ -10,7 +10,7 @@ The following tuners are supported
 
 The disk IRQs tuner binds all disk IRQs to requested set of CPUs. This tuner uses `hwloc` library to compute CPU masks. Prevent IRQ Balance from moving tuned devices IRQs. CPU set that is used by the tuner can be limited by CPU mask parameter. If mask parameter is provided then only those CPUs that are masked will be considered as available. Mask covering all host CPUs is used as a default.
 
-IRQs are disstributed according to the following rules:
+IRQs are distributed according to the following rules:
 
 - Distribute NVMe disks IRQs equally among all available CPUs.
 - Distribute non-NVMe disks IRQs equally among designated CPUs or among all available CPUs in the `mq` mode.
diff --git a/src/v/README.md b/src/v/README.md
index 08778a84a0a6f..09c8023d1eb47 100644
--- a/src/v/README.md
+++ b/src/v/README.md
@@ -11,11 +11,11 @@ platform        | Machine dependent settings like ssse3 instructions |
 coproc          | WASM / Coprocessor engine for lambda transforms |
 resource_mgmt   | CPU and IO priority | 
 utils           | code utils |
-hashing         | hashing utility adaptors often used in cryptography or checksuming |
+hashing         | hashing utility adaptors often used in cryptography or checksumming |
 storage         | low level bits of the storage api |
 redpanda        | high level program - main entry point |
 finjector       | failure injector framework for testing and correctness |
-json            | json manipulation utlities |
+json            | json manipulation utilities |
 http            | HTTP conversion and utilities |
 kafka           | Kafka compatibility protocol layer |
 compression     | utilities for supporting multiple compressor types |
diff --git a/src/v/coding-style.md b/src/v/coding-style.md
index 3572cc04d43dc..87d766a02e38d 100644
--- a/src/v/coding-style.md
+++ b/src/v/coding-style.md
@@ -110,7 +110,7 @@ void a_function() {
 
 An exception is namespaces -- the body is _not_ indented, to prevent files that are almost 100% whitespace left margin.
 
-When making a change, if you need to insert an indentation level, you can temporarily break the rules by insering a half-indent, so that the patch is easily reviewable:
+When making a change, if you need to insert an indentation level, you can temporarily break the rules by inserting a half-indent, so that the patch is easily reviewable:
 
 ```c++
 void a_function() {

From 1ca7098311f25642ee76b68d9446223f93403efc Mon Sep 17 00:00:00 2001
From: Rogger Vasquez <rvasque3@gmail.com>
Date: Wed, 29 Jun 2022 16:09:29 -0500
Subject: [PATCH 010/201] tests: add rpk redpanda mode test

We want to make sure that the start/installation
path that we guide the user to follow is covered
in our CI, running rpk redpanda mode prod is a
crucial step before running the tuners.
---
 tests/rptest/clients/rpk_remote.py    |  3 ++
 tests/rptest/tests/rpk_config_test.py | 49 +++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

diff --git a/tests/rptest/clients/rpk_remote.py b/tests/rptest/clients/rpk_remote.py
index 6b93906e8a2e2..a20ac8893e3bf 100644
--- a/tests/rptest/clients/rpk_remote.py
+++ b/tests/rptest/clients/rpk_remote.py
@@ -47,6 +47,9 @@ def cluster_config_force_reset(self, property_name):
     def cluster_config_lint(self):
         return self._execute([self._rpk_binary(), 'cluster', 'config', 'lint'])
 
+    def mode_set(self, mode):
+        return self._execute([self._rpk_binary(), 'redpanda', 'mode', mode])
+
     def _run_config(self, cmd, path=None, timeout=30):
         cmd = [self._rpk_binary(), 'redpanda', 'config'] + cmd
 
diff --git a/tests/rptest/tests/rpk_config_test.py b/tests/rptest/tests/rpk_config_test.py
index 18047df74fa51..78711b6f8c4a0 100644
--- a/tests/rptest/tests/rpk_config_test.py
+++ b/tests/rptest/tests/rpk_config_test.py
@@ -234,3 +234,52 @@ def test_config_change_then_restart_node(self):
             rpk.config_set(key, value)
 
             self.redpanda.restart_nodes(node)
+
+    @cluster(num_nodes=1)
+    def test_config_change_mode_prod(self):
+        """
+        Verify that after running rpk redpanda mode prod, the 
+        configuration values of the tuners change accordingly.
+        """
+        node = self.redpanda.nodes[0]
+        rpk = RpkRemoteTool(self.redpanda, node)
+        rpk.mode_set("prod")
+        expected_config = yaml.full_load('''
+    enable_usage_stats: false
+    tune_network: true
+    tune_disk_scheduler: true
+    tune_disk_nomerges: true
+    tune_disk_write_cache: true
+    tune_disk_irq: true
+    tune_fstrim: false
+    tune_cpu: true
+    tune_aio_events: true
+    tune_clocksource: true
+    tune_swappiness: true
+    tune_transparent_hugepages: false
+    enable_memory_locking: false
+    tune_coredump: false
+    coredump_dir: /var/lib/redpanda/coredump
+    tune_ballast_file: true
+    overprovisioned: false
+''')
+        with tempfile.TemporaryDirectory() as d:
+            node.account.copy_from(RedpandaService.NODE_CONFIG_FILE, d)
+
+            with open(os.path.join(d, 'redpanda.yaml')) as f:
+                actual_config = yaml.full_load(f.read())
+
+                # Delete 'admin_api' and 'kafka_api' since they are not
+                # needed for this test and the brokers change depending
+                # on the container it's running.
+                del actual_config['rpk']['kafka_api']
+                del actual_config['rpk']['admin_api']
+
+                if actual_config['rpk'] != expected_config:
+                    self.logger.error("Configs differ")
+                    self.logger.error(
+                        f"Expected: {yaml.dump(expected_config)}")
+                    self.logger.error(
+                        f"Actual: {yaml.dump(actual_config['rpk'])}")
+                assert actual_config['rpk'] == expected_config
+                assert actual_config['redpanda']['developer_mode'] == False

From 0ca57b5d9799d7bbbdba5108352a908abe49e186 Mon Sep 17 00:00:00 2001
From: Rogger Vasquez <rvasque3@gmail.com>
Date: Wed, 6 Jul 2022 10:42:15 -0500
Subject: [PATCH 011/201] tests: add rpk tuner tests

Tests will be only available in CDT since they
rely on the environment and can't be run in
a container.
---
 tests/rptest/clients/rpk_remote.py   |  3 ++
 tests/rptest/test_suite_quick.yml    |  1 +
 tests/rptest/test_suite_rpk.yml      |  1 +
 tests/rptest/tests/rpk_tuner_test.py | 54 ++++++++++++++++++++++++++++
 4 files changed, 59 insertions(+)
 create mode 100644 tests/rptest/tests/rpk_tuner_test.py

diff --git a/tests/rptest/clients/rpk_remote.py b/tests/rptest/clients/rpk_remote.py
index a20ac8893e3bf..c124374cc93ce 100644
--- a/tests/rptest/clients/rpk_remote.py
+++ b/tests/rptest/clients/rpk_remote.py
@@ -47,6 +47,9 @@ def cluster_config_force_reset(self, property_name):
     def cluster_config_lint(self):
         return self._execute([self._rpk_binary(), 'cluster', 'config', 'lint'])
 
+    def tune(self, tuner):
+        return self._execute([self._rpk_binary(), 'redpanda', 'tune', tuner])
+
     def mode_set(self, mode):
         return self._execute([self._rpk_binary(), 'redpanda', 'mode', mode])
 
diff --git a/tests/rptest/test_suite_quick.yml b/tests/rptest/test_suite_quick.yml
index 51ff6beb922c3..f8fcdd94938fe 100644
--- a/tests/rptest/test_suite_quick.yml
+++ b/tests/rptest/test_suite_quick.yml
@@ -17,3 +17,4 @@ quick:
   - tests/wasm_identity_test.py
   - tests/wasm_partition_movement_test.py
   - tests/wasm_redpanda_failure_recovery_test.py
+  - tests/rpk_tuner_test.py
diff --git a/tests/rptest/test_suite_rpk.yml b/tests/rptest/test_suite_rpk.yml
index 07041744ba826..4a2e5159ed6a7 100644
--- a/tests/rptest/test_suite_rpk.yml
+++ b/tests/rptest/test_suite_rpk.yml
@@ -12,3 +12,4 @@ quick:
   - tests/rpk_topic_test.py
   - tests/rpk_cluster_test.py
   - tests/rpk_config_test.py
+  - tests/rpk_tuner_test.py
diff --git a/tests/rptest/tests/rpk_tuner_test.py b/tests/rptest/tests/rpk_tuner_test.py
new file mode 100644
index 0000000000000..ae4d69f304099
--- /dev/null
+++ b/tests/rptest/tests/rpk_tuner_test.py
@@ -0,0 +1,54 @@
+# Copyright 2022 Redpanda Data, Inc.
+#
+# Use of this software is governed by the Business Source License
+# included in the file licenses/BSL.md
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0
+
+from rptest.services.cluster import cluster
+from rptest.tests.redpanda_test import RedpandaTest
+from rptest.clients.rpk_remote import RpkRemoteTool
+
+
+class RpkTunerTest(RedpandaTest):
+    def __init__(self, ctx):
+        super(RpkTunerTest, self).__init__(test_context=ctx)
+        self._ctx = ctx
+
+    @cluster(num_nodes=1)
+    def test_tune_prod_all(self):
+        """
+        Test will set production mode and execute rpk redpanda tune all,
+        we expect the command to exit with 1 if an error happens.
+        """
+        node = self.redpanda.nodes[0]
+        rpk = RpkRemoteTool(self.redpanda, node)
+        rpk.mode_set("prod")
+
+        rpk.tune("all")
+
+    @cluster(num_nodes=1)
+    def test_tune_fstrim(self):
+        """
+        Validate fstrim tuner execution,
+        fstrim was disabled in production mode https://github.com/redpanda-data/redpanda/issues/3068 
+        """
+        node = self.redpanda.nodes[0]
+        rpk = RpkRemoteTool(self.redpanda, node)
+        rpk.config_set('rpk.tune_fstrim', 'true')
+
+        rpk.tune("fstrim")
+
+    @cluster(num_nodes=1)
+    def test_tune_transparent_hugepages(self):
+        """
+        Validate transparent hugepage tuner execution.
+        THP tuner is disabled in production mode
+        """
+        node = self.redpanda.nodes[0]
+        rpk = RpkRemoteTool(self.redpanda, node)
+        rpk.config_set('rpk.tune_transparent_hugepages', 'true')
+
+        rpk.tune("transparent_hugepages")

From 4224dc8d89832a1eeb69ad92efead8b604c1c460 Mon Sep 17 00:00:00 2001
From: Rogger Vasquez <rvasque3@gmail.com>
Date: Fri, 8 Jul 2022 09:16:16 -0500
Subject: [PATCH 012/201] tests: add rpk tune list ducktape test

This golden test will allow us to catch when a
new tuner is either added or removed from
production mode
---
 tests/rptest/tests/rpk_tuner_test.py | 34 ++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/tests/rptest/tests/rpk_tuner_test.py b/tests/rptest/tests/rpk_tuner_test.py
index ae4d69f304099..b0d40c197eaa1 100644
--- a/tests/rptest/tests/rpk_tuner_test.py
+++ b/tests/rptest/tests/rpk_tuner_test.py
@@ -52,3 +52,37 @@ def test_tune_transparent_hugepages(self):
         rpk.config_set('rpk.tune_transparent_hugepages', 'true')
 
         rpk.tune("transparent_hugepages")
+
+    @cluster(num_nodes=1)
+    def test_tune_list(self):
+        """
+        Forward compatible test, the purpose is to check if available
+        tuners match our current setup, if a new tuner gets added we
+        will catch it here.
+        """
+        node = self.redpanda.nodes[0]
+        rpk = RpkRemoteTool(self.redpanda, node)
+        # Set all tuners:
+        rpk.mode_set("prod")
+        rpk.config_set('rpk.tune_fstrim', 'true')
+        rpk.config_set('rpk.tune_transparent_hugepages', 'true')
+        rpk.config_set('rpk.tune_coredump', 'true')
+
+        expected = '''TUNER                  ENABLED  SUPPORTED  UNSUPPORTED-REASON
+aio_events             true     true       
+ballast_file           true     true       
+clocksource            true     true       
+coredump               true     true       
+cpu                    true     true       
+disk_irq               true     true       
+disk_nomerges          true     true       
+disk_scheduler         true     true       
+disk_write_cache       true     false      Disk write cache tuner is only supported in GCP
+fstrim                 true     true       
+net                    true     true       
+swappiness             true     true       
+transparent_hugepages  true     true       
+'''
+        output = rpk.tune("list")
+
+        assert output == expected

From 8d29b9d32b3dc9f3946c9f61c8507cc59ffc2179 Mon Sep 17 00:00:00 2001
From: Alexey Biryukov <alexey@redpanda.com>
Date: Fri, 8 Jul 2022 17:23:25 -0400
Subject: [PATCH 013/201] config: increased default partitions # for
 __consumer_offsets

When a consumer tries to locate a consumer group coordinator of a cluster
for the first time, the __consumer_offsets topic is created with the
number of partitions as per the group_topic_partitions property.
The default value for that property was 1, which means that unless
a different value was explicitly specified by the customer at a very
early stage of the cluster's life, all OffsetCommit requests from all
consumers will be going to a single broker. This change increases
the default value to 16 as a reasonable trade-off between OffsetCommit
parallelism for the clusters that will use consumer groups
later in their life, and the overhead for the clusters that
won't use consumer groups.

#5222
---
 src/v/config/configuration.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/v/config/configuration.cc b/src/v/config/configuration.cc
index 0d0f72c39cd61..b4716730d019c 100644
--- a/src/v/config/configuration.cc
+++ b/src/v/config/configuration.cc
@@ -474,7 +474,7 @@ configuration::configuration()
       "group_topic_partitions",
       "Number of partitions in the internal group membership topic",
       {.needs_restart = needs_restart::no, .visibility = visibility::tunable},
-      1)
+      16)
   , default_topic_replication(
       *this,
       "default_topic_replications",

From 1a72446ab268ef66db7ac5b3baca754f5e7b7cb2 Mon Sep 17 00:00:00 2001
From: Denis Rystsov <denis@vectorized.io>
Date: Fri, 1 Jul 2022 13:55:55 -0700
Subject: [PATCH 014/201] k/produce: do not use unknown_server_error

Kafka client doesn't process unknown_server_error correctly and it may
lead to duplicates violating the idempotency. See the following issue
for more info: https://issues.apache.org/jira/browse/KAFKA-14034

request_timed_out, just like unknown_server_error, means that the true
outcome of the operation is unknown, but unlike unknown_server_error it
doesn't cause the problem, so switch to using it to avoid the problem.
---
 src/v/kafka/server/handlers/produce.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/v/kafka/server/handlers/produce.cc b/src/v/kafka/server/handlers/produce.cc
index 687b127c6c0b4..8e9a2b051ecd9 100644
--- a/src/v/kafka/server/handlers/produce.cc
+++ b/src/v/kafka/server/handlers/produce.cc
@@ -139,7 +139,7 @@ static error_code map_produce_error_code(std::error_code ec) {
         case raft::errc::shutting_down:
             return error_code::request_timed_out;
         default:
-            return error_code::unknown_server_error;
+            return error_code::request_timed_out;
         }
     }
 
@@ -157,11 +157,11 @@ static error_code map_produce_error_code(std::error_code ec) {
         case cluster::errc::invalid_request:
             return error_code::invalid_request;
         default:
-            return error_code::unknown_server_error;
+            return error_code::request_timed_out;
         }
     }
 
-    return error_code::unknown_server_error;
+    return error_code::request_timed_out;
 }
 
 /*
@@ -198,7 +198,7 @@ static partition_produce_stages partition_append(
                     p.error_code = map_produce_error_code(r.error());
                 }
             } catch (...) {
-                p.error_code = error_code::unknown_server_error;
+                p.error_code = error_code::request_timed_out;
             }
             return p;
         }),

From 1dfd8d963ea88c08c339727c2001dc3ba6f44624 Mon Sep 17 00:00:00 2001
From: Denis Rystsov <denis@vectorized.io>
Date: Fri, 8 Jul 2022 21:05:35 -0700
Subject: [PATCH 015/201] cluster: remove dead code

---
 src/v/cluster/partition.cc | 53 --------------------------------------
 src/v/cluster/partition.h  |  5 ----
 2 files changed, 58 deletions(-)

diff --git a/src/v/cluster/partition.cc b/src/v/cluster/partition.cc
index 8350845f7d958..806c69a19a7e5 100644
--- a/src/v/cluster/partition.cc
+++ b/src/v/cluster/partition.cc
@@ -187,59 +187,6 @@ raft::replicate_stages partition::replicate_in_stages(
     }
 }
 
-ss::future<result<raft::replicate_result>> partition::replicate(
-  model::batch_identity bid,
-  model::record_batch_reader&& r,
-  raft::replicate_options opts) {
-    if (bid.is_transactional) {
-        if (!_is_tx_enabled) {
-            vlog(
-              clusterlog.error,
-              "Can't process a transactional request to {}. Transactional "
-              "processing isn't enabled.",
-              _raft->ntp());
-            return ss::make_ready_future<result<raft::replicate_result>>(
-              raft::errc::timeout);
-        }
-
-        if (!_rm_stm) {
-            vlog(
-              clusterlog.error,
-              "Topic {} doesn't support transactional processing.",
-              _raft->ntp());
-            return ss::make_ready_future<result<raft::replicate_result>>(
-              raft::errc::timeout);
-        }
-    }
-
-    if (bid.has_idempotent()) {
-        if (!_is_idempotence_enabled) {
-            vlog(
-              clusterlog.error,
-              "Can't process an idempotent request to {}. Idempotency isn't "
-              "enabled.",
-              _raft->ntp());
-            return ss::make_ready_future<result<raft::replicate_result>>(
-              raft::errc::timeout);
-        }
-
-        if (!_rm_stm) {
-            vlog(
-              clusterlog.error,
-              "Topic {} doesn't support idempotency.",
-              _raft->ntp());
-            return ss::make_ready_future<result<raft::replicate_result>>(
-              raft::errc::timeout);
-        }
-    }
-
-    if (_rm_stm) {
-        return _rm_stm->replicate(bid, std::move(r), opts);
-    } else {
-        return _raft->replicate(std::move(r), opts);
-    }
-}
-
 ss::future<> partition::start() {
     auto ntp = _raft->ntp();
 
diff --git a/src/v/cluster/partition.h b/src/v/cluster/partition.h
index 81107c6d0beb6..e62c856e1b51d 100644
--- a/src/v/cluster/partition.h
+++ b/src/v/cluster/partition.h
@@ -59,11 +59,6 @@ class partition {
     ss::future<result<raft::replicate_result>> replicate(
       model::term_id, model::record_batch_reader&&, raft::replicate_options);
 
-    ss::future<result<raft::replicate_result>> replicate(
-      model::batch_identity,
-      model::record_batch_reader&&,
-      raft::replicate_options);
-
     raft::replicate_stages replicate_in_stages(
       model::batch_identity,
       model::record_batch_reader&&,

From 67a3112d7359fc2d2d62befed416f11dfdd48853 Mon Sep 17 00:00:00 2001
From: Denis Rystsov <denis@vectorized.io>
Date: Fri, 1 Jul 2022 15:16:30 -0700
Subject: [PATCH 016/201] cluster: prepare partition for translating offset

Update all partition::replicate callers which don't perform offset
translation to bypass it via a direct raft reference.
---
 src/v/cluster/partition.cc                       | 12 ------------
 src/v/cluster/partition.h                        | 12 +-----------
 src/v/cluster/partition_probe.cc                 |  8 ++++----
 src/v/cluster/tests/partition_moving_test.cc     |  2 +-
 src/v/cluster/tests/rebalancing_tests_fixture.h  |  2 +-
 .../coproc/tests/fixtures/fiber_mock_fixture.cc  |  2 +-
 src/v/kafka/server/group.cc                      | 16 ++++++++--------
 src/v/kafka/server/group_metadata_migration.cc   |  1 +
 src/v/kafka/server/tests/fetch_test.cc           |  8 ++++----
 src/v/kafka/server/tests/topic_recreate_test.cc  |  2 +-
 10 files changed, 22 insertions(+), 43 deletions(-)

diff --git a/src/v/cluster/partition.cc b/src/v/cluster/partition.cc
index 806c69a19a7e5..4a5d70a878f77 100644
--- a/src/v/cluster/partition.cc
+++ b/src/v/cluster/partition.cc
@@ -107,18 +107,6 @@ ss::future<result<raft::replicate_result>> partition::replicate(
     return _raft->replicate(std::move(r), opts);
 }
 
-raft::replicate_stages partition::replicate_in_stages(
-  model::record_batch_reader&& r, raft::replicate_options opts) {
-    return _raft->replicate_in_stages(std::move(r), opts);
-}
-
-ss::future<result<raft::replicate_result>> partition::replicate(
-  model::term_id term,
-  model::record_batch_reader&& r,
-  raft::replicate_options opts) {
-    return _raft->replicate(term, std::move(r), opts);
-}
-
 ss::shared_ptr<cluster::rm_stm> partition::rm_stm() {
     if (!_rm_stm) {
         if (!_is_tx_enabled && !_is_idempotence_enabled) {
diff --git a/src/v/cluster/partition.h b/src/v/cluster/partition.h
index e62c856e1b51d..3a4e6e636acff 100644
--- a/src/v/cluster/partition.h
+++ b/src/v/cluster/partition.h
@@ -53,12 +53,6 @@ class partition {
     ss::future<result<raft::replicate_result>>
     replicate(model::record_batch_reader&&, raft::replicate_options);
 
-    raft::replicate_stages
-    replicate_in_stages(model::record_batch_reader&&, raft::replicate_options);
-
-    ss::future<result<raft::replicate_result>> replicate(
-      model::term_id, model::record_batch_reader&&, raft::replicate_options);
-
     raft::replicate_stages replicate_in_stages(
       model::batch_identity,
       model::record_batch_reader&&,
@@ -284,11 +278,7 @@ class partition {
         return _raft->abort_configuration_change(rev);
     }
 
-private:
-    friend partition_manager;
-    friend replicated_partition_probe;
-
-    consensus_ptr raft() { return _raft; }
+    consensus_ptr raft() const { return _raft; }
 
 private:
     consensus_ptr _raft;
diff --git a/src/v/cluster/partition_probe.cc b/src/v/cluster/partition_probe.cc
index 949409d0fb20d..01b21336a273f 100644
--- a/src/v/cluster/partition_probe.cc
+++ b/src/v/cluster/partition_probe.cc
@@ -89,7 +89,7 @@ void replicated_partition_probe::setup_internal_metrics(const model::ntp& ntp) {
         sm::make_gauge(
           "leader_id",
           [this] {
-              return _partition._raft->get_leader_id().value_or(
+              return _partition.raft()->get_leader_id().value_or(
                 model::node_id(-1));
           },
           sm::description("Id of current partition leader"),
@@ -98,7 +98,7 @@ void replicated_partition_probe::setup_internal_metrics(const model::ntp& ntp) {
         sm::make_gauge(
           "under_replicated_replicas",
           [this] {
-              auto metrics = _partition._raft->get_follower_metrics();
+              auto metrics = _partition.raft()->get_follower_metrics();
               return std::count_if(
                 metrics.cbegin(),
                 metrics.cend(),
@@ -181,7 +181,7 @@ void replicated_partition_probe::setup_public_metrics(const model::ntp& ntp) {
         sm::make_gauge(
           "under_replicated_replicas",
           [this] {
-              auto metrics = _partition._raft->get_follower_metrics();
+              auto metrics = _partition.raft()->get_follower_metrics();
               return std::count_if(
                 metrics.cbegin(),
                 metrics.cend(),
@@ -214,7 +214,7 @@ void replicated_partition_probe::setup_public_metrics(const model::ntp& ntp) {
           .aggregate({sm::shard_label, partition_label}),
         sm::make_gauge(
           "replicas",
-          [this] { return _partition._raft->get_follower_count(); },
+          [this] { return _partition.raft()->get_follower_count(); },
           sm::description("Number of replicas per topic"),
           labels)
           .aggregate({sm::shard_label, partition_label}),
diff --git a/src/v/cluster/tests/partition_moving_test.cc b/src/v/cluster/tests/partition_moving_test.cc
index a8b68af8fe2bc..b5d8fd9766c4a 100644
--- a/src/v/cluster/tests/partition_moving_test.cc
+++ b/src/v/cluster/tests/partition_moving_test.cc
@@ -318,7 +318,7 @@ class partition_assignment_test_fixture : public cluster_test_fixture {
             auto rdr = model::make_memory_record_batch_reader(
               std::move(batches));
             // replicate
-            auto f = pm.get(ntp)->replicate(
+            auto f = pm.get(ntp)->raft()->replicate(
               std::move(rdr),
               raft::replicate_options(raft::consistency_level::quorum_ack));
 
diff --git a/src/v/cluster/tests/rebalancing_tests_fixture.h b/src/v/cluster/tests/rebalancing_tests_fixture.h
index 830d269e5b6ef..635564406899b 100644
--- a/src/v/cluster/tests/rebalancing_tests_fixture.h
+++ b/src/v/cluster/tests/rebalancing_tests_fixture.h
@@ -159,7 +159,7 @@ class rebalancing_tests_fixture : public cluster_test_fixture {
             auto rdr = model::make_memory_record_batch_reader(
               std::move(batches));
             // replicate
-            auto f = pm.get(ntp)->replicate(
+            auto f = pm.get(ntp)->raft()->replicate(
               std::move(rdr),
               raft::replicate_options(raft::consistency_level::quorum_ack));
 
diff --git a/src/v/coproc/tests/fixtures/fiber_mock_fixture.cc b/src/v/coproc/tests/fixtures/fiber_mock_fixture.cc
index 528c78f414175..c2bc47e6651b7 100644
--- a/src/v/coproc/tests/fixtures/fiber_mock_fixture.cc
+++ b/src/v/coproc/tests/fixtures/fiber_mock_fixture.cc
@@ -178,7 +178,7 @@ ss::future<ss::lw_shared_ptr<coproc::source>> fiber_mock_fixture::make_source(
     auto batch = make_random_batch(params.records_per_input);
     co_await tests::cooperative_spin_wait_with_timeout(
       2s, [partition]() { return partition->is_elected_leader(); });
-    auto r = co_await partition->replicate(
+    auto r = co_await partition->raft()->replicate(
       std::move(batch),
       raft::replicate_options(raft::consistency_level::leader_ack));
     vassert(!r.has_error(), "Write error: {}", r.error());
diff --git a/src/v/kafka/server/group.cc b/src/v/kafka/server/group.cc
index 1441b02bfb60f..32815af490d9a 100644
--- a/src/v/kafka/server/group.cc
+++ b/src/v/kafka/server/group.cc
@@ -1689,7 +1689,7 @@ group::commit_tx(cluster::commit_group_tx_request r) {
 
     auto reader = model::make_memory_record_batch_reader(std::move(batch));
 
-    auto e = co_await _partition->replicate(
+    auto e = co_await _partition->raft()->replicate(
       _term,
       std::move(reader),
       raft::replicate_options(raft::consistency_level::quorum_ack));
@@ -1772,7 +1772,7 @@ group::begin_tx(cluster::begin_group_tx_request r) {
           r.pid,
           std::move(fence));
         auto reader = model::make_memory_record_batch_reader(std::move(batch));
-        auto e = co_await _partition->replicate(
+        auto e = co_await _partition->raft()->replicate(
           _term,
           std::move(reader),
           raft::replicate_options(raft::consistency_level::quorum_ack));
@@ -1887,7 +1887,7 @@ group::prepare_tx(cluster::prepare_group_tx_request r) {
       std::move(tx_entry));
     auto reader = model::make_memory_record_batch_reader(std::move(batch));
 
-    auto e = co_await _partition->replicate(
+    auto e = co_await _partition->raft()->replicate(
       _term,
       std::move(reader),
       raft::replicate_options(raft::consistency_level::quorum_ack));
@@ -1983,7 +1983,7 @@ group::abort_tx(cluster::abort_group_tx_request r) {
       std::move(tx));
     auto reader = model::make_memory_record_batch_reader(std::move(batch));
 
-    auto e = co_await _partition->replicate(
+    auto e = co_await _partition->raft()->replicate(
       _term,
       std::move(reader),
       raft::replicate_options(raft::consistency_level::quorum_ack));
@@ -2103,7 +2103,7 @@ group::offset_commit_stages group::store_offsets(offset_commit_request&& r) {
     auto batch = std::move(builder).build();
     auto reader = model::make_memory_record_batch_reader(std::move(batch));
 
-    auto replicate_stages = _partition->replicate_in_stages(
+    auto replicate_stages = _partition->raft()->replicate_in_stages(
       std::move(reader),
       raft::replicate_options(raft::consistency_level::quorum_ack));
 
@@ -2492,7 +2492,7 @@ ss::future<error_code> group::remove() {
     auto reader = model::make_memory_record_batch_reader(std::move(batch));
 
     try {
-        auto result = co_await _partition->replicate(
+        auto result = co_await _partition->raft()->replicate(
           std::move(reader),
           raft::replicate_options(raft::consistency_level::quorum_ack));
         if (result) {
@@ -2572,7 +2572,7 @@ group::remove_topic_partitions(const std::vector<model::topic_partition>& tps) {
     auto reader = model::make_memory_record_batch_reader(std::move(batch));
 
     try {
-        auto result = co_await _partition->replicate(
+        auto result = co_await _partition->raft()->replicate(
           std::move(reader),
           raft::replicate_options(raft::consistency_level::quorum_ack));
         if (result) {
@@ -2599,7 +2599,7 @@ group::remove_topic_partitions(const std::vector<model::topic_partition>& tps) {
 
 ss::future<result<raft::replicate_result>>
 group::store_group(model::record_batch batch) {
-    return _partition->replicate(
+    return _partition->raft()->replicate(
       model::make_memory_record_batch_reader(std::move(batch)),
       raft::replicate_options(raft::consistency_level::quorum_ack));
 }
diff --git a/src/v/kafka/server/group_metadata_migration.cc b/src/v/kafka/server/group_metadata_migration.cc
index c7a27abe203b1..a98aba8bbb5f8 100644
--- a/src/v/kafka/server/group_metadata_migration.cc
+++ b/src/v/kafka/server/group_metadata_migration.cc
@@ -332,6 +332,7 @@ ss::future<std::error_code> replicate(
       [ntp = std::move(ntp),
        f_reader = std::move(f_reader)](cluster::partition_manager& pm) mutable {
           return pm.get(ntp)
+            ->raft()
             ->replicate(
               std::move(f_reader),
               raft::replicate_options(raft::consistency_level::quorum_ack))
diff --git a/src/v/kafka/server/tests/fetch_test.cc b/src/v/kafka/server/tests/fetch_test.cc
index d69e407969434..af3bbbf539c91 100644
--- a/src/v/kafka/server/tests/fetch_test.cc
+++ b/src/v/kafka/server/tests/fetch_test.cc
@@ -418,7 +418,7 @@ FIXTURE_TEST(fetch_multi_partitions_debounce, redpanda_thread_fixture) {
                            model::offset(0), 5);
                          auto rdr = model::make_memory_record_batch_reader(
                            std::move(batches));
-                         return partition->replicate(
+                         return partition->raft()->replicate(
                            std::move(rdr),
                            raft::replicate_options(
                              raft::consistency_level::quorum_ack));
@@ -483,7 +483,7 @@ FIXTURE_TEST(fetch_one_debounce, redpanda_thread_fixture) {
                        model::offset(0), 5);
                      auto rdr = model::make_memory_record_batch_reader(
                        std::move(batches));
-                     return partition->replicate(
+                     return partition->raft()->replicate(
                        std::move(rdr),
                        raft::replicate_options(
                          raft::consistency_level::quorum_ack));
@@ -563,7 +563,7 @@ FIXTURE_TEST(fetch_multi_topics, redpanda_thread_fixture) {
                            model::offset(0), 5);
                          auto rdr = model::make_memory_record_batch_reader(
                            std::move(batches));
-                         return partition->replicate(
+                         return partition->raft()->replicate(
                            std::move(rdr),
                            raft::replicate_options(
                              raft::consistency_level::quorum_ack));
@@ -615,7 +615,7 @@ FIXTURE_TEST(fetch_request_max_bytes, redpanda_thread_fixture) {
               model::offset(0), 20);
             auto rdr = model::make_memory_record_batch_reader(
               std::move(batches));
-            return partition->replicate(
+            return partition->raft()->replicate(
               std::move(rdr),
               raft::replicate_options(raft::consistency_level::quorum_ack));
         })
diff --git a/src/v/kafka/server/tests/topic_recreate_test.cc b/src/v/kafka/server/tests/topic_recreate_test.cc
index fd081431ec407..8628cf183dc46 100644
--- a/src/v/kafka/server/tests/topic_recreate_test.cc
+++ b/src/v/kafka/server/tests/topic_recreate_test.cc
@@ -266,7 +266,7 @@ FIXTURE_TEST(test_recreated_topic_does_not_lose_data, recreate_test_fixture) {
                 auto rdr = model::make_memory_record_batch_reader(
                   std::move(batches));
                 auto p = pm.get(ntp);
-                return p
+                return p->raft()
                   ->replicate(
                     std::move(rdr),
                     raft::replicate_options(

From e693bead59ec286596f23f29b50a9f49cb6ceb9a Mon Sep 17 00:00:00 2001
From: Denis Rystsov <denis@vectorized.io>
Date: Mon, 4 Jul 2022 14:36:29 -0700
Subject: [PATCH 017/201] k/group: avoid ABA problem

Update consumer groups to use conditional replication to prevent a
situation where, after a check, the leadership jumps away, the check is
invalidated, and the leadership jumps back just in time for the
post-check replication.

check condition
  leadership goes to a new node
  the node replicates something which invalidates the conditions
  the leadership jumps back
the node successfully replicates assuming that the condition is true

Switched to a conditional replicate to fix the problem. When a group
manager detects a leadership change it replays the group's records to
reconstruct the group's state. We cache the current term in the state
and use it as a condition on replicate. This way we know that if
the leadership bounces, the replication won't pass.
---
 src/v/kafka/server/group.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/v/kafka/server/group.cc b/src/v/kafka/server/group.cc
index 32815af490d9a..b7d7c2262a81b 100644
--- a/src/v/kafka/server/group.cc
+++ b/src/v/kafka/server/group.cc
@@ -2104,6 +2104,7 @@ group::offset_commit_stages group::store_offsets(offset_commit_request&& r) {
     auto reader = model::make_memory_record_batch_reader(std::move(batch));
 
     auto replicate_stages = _partition->raft()->replicate_in_stages(
+      _term,
       std::move(reader),
       raft::replicate_options(raft::consistency_level::quorum_ack));
 
@@ -2493,6 +2494,7 @@ ss::future<error_code> group::remove() {
 
     try {
         auto result = co_await _partition->raft()->replicate(
+          _term,
           std::move(reader),
           raft::replicate_options(raft::consistency_level::quorum_ack));
         if (result) {
@@ -2573,6 +2575,7 @@ group::remove_topic_partitions(const std::vector<model::topic_partition>& tps) {
 
     try {
         auto result = co_await _partition->raft()->replicate(
+          _term,
           std::move(reader),
           raft::replicate_options(raft::consistency_level::quorum_ack));
         if (result) {
@@ -2600,6 +2603,7 @@ group::remove_topic_partitions(const std::vector<model::topic_partition>& tps) {
 ss::future<result<raft::replicate_result>>
 group::store_group(model::record_batch batch) {
     return _partition->raft()->replicate(
+      _term,
       model::make_memory_record_batch_reader(std::move(batch)),
       raft::replicate_options(raft::consistency_level::quorum_ack));
 }

From 93350756123f6883192a0fb6bb2b123f394e9a4b Mon Sep 17 00:00:00 2001
From: Denis Rystsov <denis@vectorized.io>
Date: Mon, 4 Jul 2022 14:50:26 -0700
Subject: [PATCH 018/201] c/types: introduce kafka offset types

We're going to mix raft and kafka offsets in the same class. Since
both offsets use the same type, it's easy to make an error and
treat one as if it were the other. Introduce a kafka offset type to
rely on the type system to prevent such errors.
---
 src/v/cluster/types.cc    | 10 ++++++++++
 src/v/cluster/types.h     | 14 ++++++++++++++
 src/v/model/fundamental.h |  6 ++++++
 3 files changed, 30 insertions(+)

diff --git a/src/v/cluster/types.cc b/src/v/cluster/types.cc
index 9769e8e07a190..474bfb6a2d9e8 100644
--- a/src/v/cluster/types.cc
+++ b/src/v/cluster/types.cc
@@ -32,6 +32,16 @@
 
 namespace cluster {
 
+kafka_stages::kafka_stages(
+  ss::future<> enq, ss::future<result<kafka_result>> offset_future)
+  : request_enqueued(std::move(enq))
+  , replicate_finished(std::move(offset_future)) {}
+
+kafka_stages::kafka_stages(raft::errc ec)
+  : request_enqueued(ss::now())
+  , replicate_finished(
+      ss::make_ready_future<result<kafka_result>>(make_error_code(ec))){};
+
 bool topic_properties::is_compacted() const {
     if (!cleanup_policy_bitflags) {
         return false;
diff --git a/src/v/cluster/types.h b/src/v/cluster/types.h
index a5abd73d555fd..b101b19a88bff 100644
--- a/src/v/cluster/types.h
+++ b/src/v/cluster/types.h
@@ -175,6 +175,20 @@ inline std::error_code make_error_code(tx_errc e) noexcept {
     return std::error_code(static_cast<int>(e), tx_error_category());
 }
 
+struct kafka_result {
+    kafka::offset last_offset;
+};
+struct kafka_stages {
+    kafka_stages(ss::future<>, ss::future<result<kafka_result>>);
+    explicit kafka_stages(raft::errc);
+    // after this future is ready, request in enqueued in raft and it will not
+    // be reorderd
+    ss::future<> request_enqueued;
+    // after this future is ready, request was successfully replicated with
+    // requested consistency level
+    ss::future<result<kafka_result>> replicate_finished;
+};
+
 struct try_abort_request
   : serde::envelope<try_abort_request, serde::version<0>> {
     model::partition_id tm;
diff --git a/src/v/model/fundamental.h b/src/v/model/fundamental.h
index 3ecfdf1cb6887..86e92f865e14a 100644
--- a/src/v/model/fundamental.h
+++ b/src/v/model/fundamental.h
@@ -29,6 +29,12 @@
 #include <string_view>
 #include <type_traits>
 
+namespace kafka {
+
+using offset = named_type<int64_t, struct kafka_offset_type>;
+
+} // namespace kafka
+
 namespace model {
 
 // Named after Kafka cleanup.policy topic property

From e3d24d951206fe51a68e0cf0c1965e0525ad07f7 Mon Sep 17 00:00:00 2001
From: Denis Rystsov <denis@vectorized.io>
Date: Mon, 4 Jul 2022 14:50:39 -0700
Subject: [PATCH 019/201] cluster: shift offset translation to partition

Shift offset translation down the abstraction stack, from the kafka
replicated_partition into the cluster partition, so that it can
eventually reach rm_stm.
---
 src/v/cluster/partition.cc                 | 42 ++++++++++++++++------
 src/v/cluster/partition.h                  |  5 +--
 src/v/kafka/server/replicated_partition.cc | 17 +++++----
 3 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/src/v/cluster/partition.cc b/src/v/cluster/partition.cc
index 4a5d70a878f77..4004498c2d47d 100644
--- a/src/v/cluster/partition.cc
+++ b/src/v/cluster/partition.cc
@@ -102,9 +102,15 @@ partition::partition(
     }
 }
 
-ss::future<result<raft::replicate_result>> partition::replicate(
+ss::future<result<kafka_result>> partition::replicate(
   model::record_batch_reader&& r, raft::replicate_options opts) {
-    return _raft->replicate(std::move(r), opts);
+    using ret_t = result<kafka_result>;
+    auto res = co_await _raft->replicate(std::move(r), opts);
+    if (!res) {
+        co_return ret_t(res.error());
+    }
+    co_return ret_t(kafka_result{
+      kafka::offset(_translator->from_log_offset(res.value().last_offset)())});
 }
 
 ss::shared_ptr<cluster::rm_stm> partition::rm_stm() {
@@ -126,10 +132,11 @@ ss::shared_ptr<cluster::rm_stm> partition::rm_stm() {
     return _rm_stm;
 }
 
-raft::replicate_stages partition::replicate_in_stages(
+kafka_stages partition::replicate_in_stages(
   model::batch_identity bid,
   model::record_batch_reader&& r,
   raft::replicate_options opts) {
+    using ret_t = result<kafka_result>;
     if (bid.is_transactional) {
         if (!_is_tx_enabled) {
             vlog(
@@ -137,7 +144,7 @@ raft::replicate_stages partition::replicate_in_stages(
               "Can't process a transactional request to {}. Transactional "
               "processing isn't enabled.",
               _raft->ntp());
-            return raft::replicate_stages(raft::errc::timeout);
+            return kafka_stages(raft::errc::timeout);
         }
 
         if (!_rm_stm) {
@@ -145,7 +152,7 @@ raft::replicate_stages partition::replicate_in_stages(
               clusterlog.error,
               "Topic {} doesn't support transactional processing.",
               _raft->ntp());
-            return raft::replicate_stages(raft::errc::timeout);
+            return kafka_stages(raft::errc::timeout);
         }
     }
 
@@ -156,7 +163,7 @@ raft::replicate_stages partition::replicate_in_stages(
               "Can't process an idempotent request to {}. Idempotency isn't "
               "enabled.",
               _raft->ntp());
-            return raft::replicate_stages(raft::errc::timeout);
+            return kafka_stages(raft::errc::timeout);
         }
 
         if (!_rm_stm) {
@@ -164,15 +171,29 @@ raft::replicate_stages partition::replicate_in_stages(
               clusterlog.error,
               "Topic {} doesn't support idempotency.",
               _raft->ntp());
-            return raft::replicate_stages(raft::errc::timeout);
+            return kafka_stages(raft::errc::timeout);
         }
     }
 
+    ss::lw_shared_ptr<raft::replicate_stages> res;
     if (_rm_stm) {
-        return _rm_stm->replicate_in_stages(bid, std::move(r), opts);
+        res = _rm_stm->replicate_in_stages(bid, std::move(r), opts);
     } else {
-        return _raft->replicate_in_stages(std::move(r), opts);
+        res = _raft->replicate_in_stages(std::move(r), opts);
     }
+
+    auto replicate_finished = res->replicate_finished.then(
+      [this](result<raft::replicate_result> r) {
+          if (!r) {
+              return ret_t(r.error());
+          }
+          auto old_offset = r.value().last_offset;
+          auto new_offset = kafka::offset(
+            _translator->from_log_offset(old_offset)());
+          return ret_t(kafka_result{new_offset});
+      });
+    return kafka_stages(
+      std::move(res->request_enqueued), std::move(replicate_finished));
 }
 
 ss::future<> partition::start() {
@@ -180,7 +201,8 @@ ss::future<> partition::start() {
 
     _probe.setup_metrics(ntp);
 
-    auto f = _raft->start();
+    auto f = _raft->start().then(
+      [this] { _translator = _raft->get_offset_translator_state(); });
 
     if (is_id_allocator_topic(ntp)) {
         return f.then([this] { return _id_allocator_stm->start(); });
diff --git a/src/v/cluster/partition.h b/src/v/cluster/partition.h
index 3a4e6e636acff..93f6b19b4387e 100644
--- a/src/v/cluster/partition.h
+++ b/src/v/cluster/partition.h
@@ -50,10 +50,10 @@ class partition {
     ss::future<> start();
     ss::future<> stop();
 
-    ss::future<result<raft::replicate_result>>
+    ss::future<result<kafka_result>>
     replicate(model::record_batch_reader&&, raft::replicate_options);
 
-    raft::replicate_stages replicate_in_stages(
+    kafka_stages replicate_in_stages(
       model::batch_identity,
       model::record_batch_reader&&,
       raft::replicate_options);
@@ -293,6 +293,7 @@ class partition {
     bool _is_tx_enabled{false};
     bool _is_idempotence_enabled{false};
     ss::lw_shared_ptr<cloud_storage::remote_partition> _cloud_storage_partition;
+    ss::lw_shared_ptr<const storage::offset_translator_state> _translator;
 
     friend std::ostream& operator<<(std::ostream& o, const partition& x);
 };
diff --git a/src/v/kafka/server/replicated_partition.cc b/src/v/kafka/server/replicated_partition.cc
index faf03c03d452a..d24fe5fed21b9 100644
--- a/src/v/kafka/server/replicated_partition.cc
+++ b/src/v/kafka/server/replicated_partition.cc
@@ -165,11 +165,11 @@ ss::future<result<model::offset>> replicated_partition::replicate(
   model::record_batch_reader rdr, raft::replicate_options opts) {
     using ret_t = result<model::offset>;
     return _partition->replicate(std::move(rdr), opts)
-      .then([this](result<raft::replicate_result> r) {
+      .then([](result<cluster::kafka_result> r) {
           if (!r) {
               return ret_t(r.error());
           }
-          return ret_t(_translator->from_log_offset(r.value().last_offset));
+          return ret_t(model::offset(r.value().last_offset()));
       });
 }
 
@@ -179,15 +179,18 @@ raft::replicate_stages replicated_partition::replicate(
   raft::replicate_options opts) {
     using ret_t = result<raft::replicate_result>;
     auto res = _partition->replicate_in_stages(batch_id, std::move(rdr), opts);
-    res.replicate_finished = res.replicate_finished.then(
-      [this](result<raft::replicate_result> r) {
+
+    raft::replicate_stages out(raft::errc::success);
+    out.request_enqueued = std::move(res.request_enqueued);
+    out.replicate_finished = res.replicate_finished.then(
+      [](result<cluster::kafka_result> r) {
           if (!r) {
               return ret_t(r.error());
           }
-          return ret_t(raft::replicate_result{
-            _translator->from_log_offset(r.value().last_offset)});
+          return ret_t(
+            raft::replicate_result{model::offset(r.value().last_offset())});
       });
-    return res;
+    return out;
 }
 
 std::optional<model::offset> replicated_partition::get_leader_epoch_last_offset(

From 63c5883435da22b30f2fbc772932766a9b2e7465 Mon Sep 17 00:00:00 2001
From: Denis Rystsov <denis@vectorized.io>
Date: Mon, 4 Jul 2022 14:48:55 -0700
Subject: [PATCH 020/201] rm_stm: prepare to use kafka::offset based cache

Prepare rm_stm to use a kafka::offset based seq-offset cache. Right
now it uses raft offsets, but there is a problem with that: once the
cache items become older than the head of the log (eviction), redpanda
becomes unable to use offset translation, so we need to store already
translated offsets.

Since the cache is persisted as part of the snapshot, we need to
change the disk format and provide backward compatibility. The change
is split into two commits. The current commit introduces types to
represent the old format (seq_cache_entry_v1 and tx_snapshot_v1) and
adds compatibility machinery to convert an old snapshot
(tx_snapshot_v1) to a new snapshot (tx_snapshot).

The follow-up commit updates the default types to use the new format
and updates the mapping between the old and default types.
---
 src/v/cluster/rm_stm.cc | 135 +++++++++++++++++++++++++++++++---------
 src/v/cluster/rm_stm.h  |   7 ++-
 2 files changed, 111 insertions(+), 31 deletions(-)

diff --git a/src/v/cluster/rm_stm.cc b/src/v/cluster/rm_stm.cc
index 4766ad986f21d..f1435e79a4975 100644
--- a/src/v/cluster/rm_stm.cc
+++ b/src/v/cluster/rm_stm.cc
@@ -187,6 +187,31 @@ struct tx_snapshot_v0 {
     std::vector<seq_entry_v0> seqs;
 };
 
+struct seq_cache_entry_v1 {
+    int32_t seq{-1};
+    model::offset offset;
+};
+
+struct seq_entry_v1 {
+    model::producer_identity pid;
+    int32_t seq{-1};
+    model::offset last_offset{-1};
+    ss::circular_buffer<seq_cache_entry_v1> seq_cache;
+    model::timestamp::type last_write_timestamp;
+};
+
+struct tx_snapshot_v1 {
+    static constexpr uint8_t version = 1;
+
+    std::vector<model::producer_identity> fenced;
+    std::vector<rm_stm::tx_range> ongoing;
+    std::vector<rm_stm::prepare_marker> prepared;
+    std::vector<rm_stm::tx_range> aborted;
+    std::vector<rm_stm::abort_index> abort_indexes;
+    model::offset offset;
+    std::vector<seq_entry_v1> seqs;
+};
+
 rm_stm::rm_stm(
   ss::logger& logger,
   raft::consensus* c,
@@ -1812,14 +1837,35 @@ rm_stm::apply_snapshot(stm_snapshot_header hdr, iobuf&& tx_ss_buf) {
     iobuf_parser data_parser(std::move(tx_ss_buf));
     if (hdr.version == tx_snapshot::version) {
         data = reflection::adl<tx_snapshot>{}.from(data_parser);
+    } else if (hdr.version == tx_snapshot_v1::version) {
+        auto data_v1 = reflection::adl<tx_snapshot_v1>{}.from(data_parser);
+        data.fenced = std::move(data_v1.fenced);
+        data.ongoing = std::move(data_v1.ongoing);
+        data.prepared = std::move(data_v1.prepared);
+        data.aborted = std::move(data_v1.aborted);
+        data.abort_indexes = std::move(data_v1.abort_indexes);
+        data.offset = std::move(data_v1.offset);
+        for (auto& seq_v1 : data_v1.seqs) {
+            seq_entry seq;
+            seq.pid = seq_v1.pid;
+            seq.seq = seq_v1.seq;
+            seq.last_offset = seq_v1.last_offset;
+            seq.seq_cache.reserve(seq_v1.seq_cache.size());
+            for (auto& item : seq_v1.seq_cache) {
+                seq.seq_cache.push_back(
+                  seq_cache_entry{.seq = item.seq, .offset = item.offset});
+            }
+            seq.last_write_timestamp = seq_v1.last_write_timestamp;
+            data.seqs.push_back(std::move(seq));
+        }
     } else if (hdr.version == tx_snapshot_v0::version) {
         auto data_v0 = reflection::adl<tx_snapshot_v0>{}.from(data_parser);
-        data.fenced = data_v0.fenced;
-        data.ongoing = data_v0.ongoing;
-        data.prepared = data_v0.prepared;
-        data.aborted = data_v0.aborted;
-        data.abort_indexes = data_v0.abort_indexes;
-        data.offset = data_v0.offset;
+        data.fenced = std::move(data_v0.fenced);
+        data.ongoing = std::move(data_v0.ongoing);
+        data.prepared = std::move(data_v0.prepared);
+        data.aborted = std::move(data_v0.aborted);
+        data.abort_indexes = std::move(data_v0.abort_indexes);
+        data.offset = std::move(data_v0.offset);
         for (auto seq_v0 : data_v0.seqs) {
             auto seq = seq_entry{
               .pid = seq_v0.pid,
@@ -1879,6 +1925,27 @@ rm_stm::apply_snapshot(stm_snapshot_header hdr, iobuf&& tx_ss_buf) {
     _insync_offset = data.offset;
 }
 
+uint8_t rm_stm::active_snapshot_version() { return tx_snapshot_v1::version; }
+
+template<class T>
+void rm_stm::fill_snapshot_wo_seqs(T& snapshot) {
+    for (auto const& [k, v] : _log_state.fence_pid_epoch) {
+        snapshot.fenced.push_back(model::producer_identity{k(), v()});
+    }
+    for (auto& entry : _log_state.ongoing_map) {
+        snapshot.ongoing.push_back(entry.second);
+    }
+    for (auto& entry : _log_state.prepared) {
+        snapshot.prepared.push_back(entry.second);
+    }
+    for (auto& entry : _log_state.aborted) {
+        snapshot.aborted.push_back(entry);
+    }
+    for (auto& entry : _log_state.abort_indexes) {
+        snapshot.abort_indexes.push_back(entry);
+    }
+}
+
 ss::future<stm_snapshot> rm_stm::take_snapshot() {
     if (_log_state.aborted.size() > _abort_index_segment_size) {
         std::sort(
@@ -1904,33 +1971,41 @@ ss::future<stm_snapshot> rm_stm::take_snapshot() {
         _log_state.aborted = snapshot.aborted;
     }
 
-    tx_snapshot tx_ss;
-
-    for (auto const& [k, v] : _log_state.fence_pid_epoch) {
-        tx_ss.fenced.push_back(model::producer_identity{k(), v()});
-    }
-    for (auto& entry : _log_state.ongoing_map) {
-        tx_ss.ongoing.push_back(entry.second);
-    }
-    for (auto& entry : _log_state.prepared) {
-        tx_ss.prepared.push_back(entry.second);
-    }
-    for (auto& entry : _log_state.aborted) {
-        tx_ss.aborted.push_back(entry);
-    }
-    for (auto& entry : _log_state.abort_indexes) {
-        tx_ss.abort_indexes.push_back(entry);
-    }
-    for (const auto& entry : _log_state.seq_table) {
-        tx_ss.seqs.push_back(entry.second.copy());
-    }
-    tx_ss.offset = _insync_offset;
-
     iobuf tx_ss_buf;
-    reflection::adl<tx_snapshot>{}.to(tx_ss_buf, std::move(tx_ss));
+    auto version = active_snapshot_version();
+    if (version == tx_snapshot::version) {
+        tx_snapshot tx_ss;
+        fill_snapshot_wo_seqs(tx_ss);
+        for (const auto& entry : _log_state.seq_table) {
+            tx_ss.seqs.push_back(entry.second.copy());
+        }
+        tx_ss.offset = _insync_offset;
+        reflection::adl<tx_snapshot>{}.to(tx_ss_buf, std::move(tx_ss));
+    } else if (version == tx_snapshot_v1::version) {
+        tx_snapshot_v1 tx_ss;
+        fill_snapshot_wo_seqs(tx_ss);
+        for (const auto& it : _log_state.seq_table) {
+            auto& entry = it.second;
+            seq_entry_v1 seqs;
+            seqs.pid = entry.pid;
+            seqs.seq = entry.seq;
+            seqs.last_offset = entry.last_offset;
+            seqs.last_write_timestamp = entry.last_write_timestamp;
+            seqs.seq_cache.reserve(seqs.seq_cache.size());
+            for (auto& item : entry.seq_cache) {
+                seqs.seq_cache.push_back(
+                  seq_cache_entry_v1{.seq = item.seq, .offset = item.offset});
+            }
+            tx_ss.seqs.push_back(std::move(seqs));
+        }
+        tx_ss.offset = _insync_offset;
+        reflection::adl<tx_snapshot_v1>{}.to(tx_ss_buf, std::move(tx_ss));
+    } else {
+        vassert(false, "unsupported tx_snapshot version {}", version);
+    }
 
     co_return stm_snapshot::create(
-      tx_snapshot::version, _insync_offset, std::move(tx_ss_buf));
+      version, _insync_offset, std::move(tx_ss_buf));
 }
 
 ss::future<> rm_stm::save_abort_snapshot(abort_snapshot snapshot) {
diff --git a/src/v/cluster/rm_stm.h b/src/v/cluster/rm_stm.h
index 48f3fa0da9645..fcb0ea4b46563 100644
--- a/src/v/cluster/rm_stm.h
+++ b/src/v/cluster/rm_stm.h
@@ -123,7 +123,7 @@ class rm_stm final : public persisted_stm {
     };
 
     struct tx_snapshot {
-        static constexpr uint8_t version = 1;
+        static constexpr uint8_t version = 2;
 
         std::vector<model::producer_identity> fenced;
         std::vector<tx_range> ongoing;
@@ -492,6 +492,11 @@ class rm_stm final : public persisted_stm {
     std::optional<expiration_info>
     get_expiration_info(model::producer_identity pid) const;
 
+    uint8_t active_snapshot_version();
+
+    template<class T>
+    void fill_snapshot_wo_seqs(T&);
+
     ss::basic_rwlock<> _state_lock;
     absl::flat_hash_map<model::producer_id, ss::lw_shared_ptr<mutex>> _tx_locks;
     absl::flat_hash_map<

From d8998bb55132077cb52da574fa74bcd9ec5f9ef0 Mon Sep 17 00:00:00 2001
From: Rob Blafford <rob@vectorized.io>
Date: Wed, 15 Jun 2022 13:52:15 -0400
Subject: [PATCH 021/201] cleanup: Reduce redundent calls to call with one

- This macro produces code that is wrapped with a tag version guard in
all cases.

- Each conditional branch invoked `tag_version_guard` itself; all this
commit does is move that call out to the macro's caller so that it is
invoked once, in one place, removing redundant code.
---
 src/v/kafka/protocol/schemata/generator.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/v/kafka/protocol/schemata/generator.py b/src/v/kafka/protocol/schemata/generator.py
index 9b6b9cc89166f..85fb3984d0754 100644
--- a/src/v/kafka/protocol/schemata/generator.py
+++ b/src/v/kafka/protocol/schemata/generator.py
@@ -1146,24 +1146,18 @@ class response;
 {% macro conditional_tag_encode(tdef, vec) %}
 {%- if tdef.is_array %}
 {%- if tdef.nullable() %}
-{%- call tag_version_guard(tdef) %}
 if ({{ tdef.name }}) {
     {{ vec }}.push_back({{ tdef.tag() }});
 }
-{%- endcall %}
 {%- else %}
-{%- call tag_version_guard(tdef) %}
 if (!{{ tdef.name }}.empty()) {
     {{ vec }}.push_back({{ tdef.tag() }});
 }
-{%- endcall %}
 {%- endif %}
 {%- elif tdef.default_value() != "" %}
-{%- call tag_version_guard(tdef) %}
 if ({{ tdef.name }} != {{ tdef.default_value() }}) {
     {{ vec }}.push_back({{ tdef.tag() }});
 }
-{%- endcall %}
 {%- endif %}
 {%- endmacro %}
 
@@ -1171,7 +1165,9 @@ class response;
 /// Tags encoding section
 std::vector<uint32_t> to_encode;
 {%- for tdef in tag_definitions -%}
+{%- call tag_version_guard(tdef) %}
 {{- conditional_tag_encode(tdef, "to_encode") }}
+{%- endcall %}
 {%- endfor %}
 writer.write_unsigned_varint(to_encode.size());
 for(size_t tag : to_encode) {

From f37c05992ce54a388451e407906ca15e2f76bb4f Mon Sep 17 00:00:00 2001
From: Rob Blafford <rob@vectorized.io>
Date: Wed, 15 Jun 2022 13:53:12 -0400
Subject: [PATCH 022/201] kafka/schemata: Fix tag scalar encoding bug

- Kafka tag encoding was previously skipped for scalar types that did
not have a default value.
---
 src/v/kafka/protocol/schemata/generator.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/v/kafka/protocol/schemata/generator.py b/src/v/kafka/protocol/schemata/generator.py
index 85fb3984d0754..9b6de3cc45acf 100644
--- a/src/v/kafka/protocol/schemata/generator.py
+++ b/src/v/kafka/protocol/schemata/generator.py
@@ -1158,6 +1158,8 @@ class response;
 if ({{ tdef.name }} != {{ tdef.default_value() }}) {
     {{ vec }}.push_back({{ tdef.tag() }});
 }
+{%- else %}
+{{ vec }}.push_back({{ tdef.tag() }});
 {%- endif %}
 {%- endmacro %}
 

From 617898f550ecd0be3b514021d408ec58adae056d Mon Sep 17 00:00:00 2001
From: Rob Blafford <rob@vectorized.io>
Date: Wed, 15 Jun 2022 13:54:26 -0400
Subject: [PATCH 023/201] kafka/protocol: Fix bug w/ encoding nullable tags

- Currently there are no nullable tags, so the generator isn't
generating any buggy code.

- The nullable check was only evaluated if the preceding is_array check
passed, so nullable handling applied only to arrays and nothing else.
---
 src/v/kafka/protocol/schemata/generator.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/v/kafka/protocol/schemata/generator.py b/src/v/kafka/protocol/schemata/generator.py
index 9b6de3cc45acf..305c4a6da2f9c 100644
--- a/src/v/kafka/protocol/schemata/generator.py
+++ b/src/v/kafka/protocol/schemata/generator.py
@@ -1144,16 +1144,14 @@ class response;
 {%- endmacro %}
 
 {% macro conditional_tag_encode(tdef, vec) %}
-{%- if tdef.is_array %}
 {%- if tdef.nullable() %}
 if ({{ tdef.name }}) {
     {{ vec }}.push_back({{ tdef.tag() }});
 }
-{%- else %}
+{%- elif tdef.is_array %}
 if (!{{ tdef.name }}.empty()) {
     {{ vec }}.push_back({{ tdef.tag() }});
 }
-{%- endif %}
 {%- elif tdef.default_value() != "" %}
 if ({{ tdef.name }} != {{ tdef.default_value() }}) {
     {{ vec }}.push_back({{ tdef.tag() }});

From ee216bb3a1bfa6fb0e4eaf9518197a2047972f41 Mon Sep 17 00:00:00 2001
From: Rob Blafford <rob@vectorized.io>
Date: Wed, 18 May 2022 15:35:28 -0400
Subject: [PATCH 024/201] kafka/server: Replace hardcoded cfgs w/ constants

---
 src/v/kafka/server/handlers/create_topics.cc | 24 ++++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/v/kafka/server/handlers/create_topics.cc b/src/v/kafka/server/handlers/create_topics.cc
index ffc33cb3106a2..db6d45eb9e52b 100644
--- a/src/v/kafka/server/handlers/create_topics.cc
+++ b/src/v/kafka/server/handlers/create_topics.cc
@@ -34,18 +34,18 @@
 namespace kafka {
 
 static constexpr std::array<std::string_view, 12> supported_configs{
-  {"compression.type",
-   "cleanup.policy",
-   "message.timestamp.type",
-   "segment.bytes",
-   "compaction.strategy",
-   "retention.bytes",
-   "retention.ms",
-   "redpanda.remote.recovery",
-   "redpanda.remote.write",
-   "redpanda.remote.read",
-   "redpanda.remote.readreplica",
-   "redpanda.remote.readreplica.bucket"}};
+  topic_property_compression,
+  topic_property_cleanup_policy,
+  topic_property_timestamp_type,
+  topic_property_segment_size,
+  topic_property_compaction_strategy,
+  topic_property_retention_bytes,
+  topic_property_retention_duration,
+  topic_property_recovery,
+  topic_property_remote_write,
+  topic_property_remote_read,
+  topic_property_read_replica,
+  topic_property_read_replica_bucket};
 
 bool is_supported(std::string_view name) {
     return std::any_of(

From 7a78f6d5d7a07bf3deb4236d799ce41f84c3a802 Mon Sep 17 00:00:00 2001
From: Rob Blafford <rob@vectorized.io>
Date: Thu, 19 May 2022 15:06:21 -0400
Subject: [PATCH 025/201] kafka/protcol/schemata: Apply type for config_src

---
 src/v/kafka/protocol/schemata/generator.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/v/kafka/protocol/schemata/generator.py b/src/v/kafka/protocol/schemata/generator.py
index 305c4a6da2f9c..12481a57b75a3 100644
--- a/src/v/kafka/protocol/schemata/generator.py
+++ b/src/v/kafka/protocol/schemata/generator.py
@@ -160,6 +160,14 @@
             },
         },
     },
+    "CreateTopicsResponseData": {
+        "Topics": {
+            "Configs": {
+                "ConfigSource": ("kafka::describe_configs_source", "int8"),
+            },
+            "TopicConfigErrorCode": ("kafka::error_code", "int16"),
+        },
+    },
     "FindCoordinatorRequestData": {
         "KeyType": ("kafka::coordinator_type", "int8"),
     },

From 9bb28272b8b2961d3f957d857a0cb4b51b2309e1 Mon Sep 17 00:00:00 2001
From: Rob Blafford <rob@vectorized.io>
Date: Wed, 8 Jun 2022 13:16:15 -0400
Subject: [PATCH 026/201] cluster: Method to obtain default_topic_properties

---
 src/v/cluster/metadata_cache.cc | 16 ++++++++++++++++
 src/v/cluster/metadata_cache.h  |  1 +
 2 files changed, 17 insertions(+)

diff --git a/src/v/cluster/metadata_cache.cc b/src/v/cluster/metadata_cache.cc
index 2cadfe9cb1ee3..85f16c74d25f1 100644
--- a/src/v/cluster/metadata_cache.cc
+++ b/src/v/cluster/metadata_cache.cc
@@ -254,4 +254,20 @@ metadata_cache::get_default_shadow_indexing_mode() const {
     }
     return m;
 }
+
+topic_properties metadata_cache::get_default_properties() const {
+    topic_properties tp;
+    tp.compression = {get_default_compression()};
+    tp.cleanup_policy_bitflags = {get_default_cleanup_policy_bitflags()};
+    tp.compaction_strategy = {get_default_compaction_strategy()};
+    tp.timestamp_type = {get_default_timestamp_type()};
+    tp.segment_size = {get_default_segment_size()};
+    tp.retention_bytes = tristate<size_t>({get_default_retention_bytes()});
+    tp.retention_duration = tristate<std::chrono::milliseconds>(
+      {get_default_retention_duration()});
+    tp.recovery = {false};
+    tp.shadow_indexing = {get_default_shadow_indexing_mode()};
+    return tp;
+}
+
 } // namespace cluster
diff --git a/src/v/cluster/metadata_cache.h b/src/v/cluster/metadata_cache.h
index 28f04bfeec467..ea343d3f8a401 100644
--- a/src/v/cluster/metadata_cache.h
+++ b/src/v/cluster/metadata_cache.h
@@ -166,6 +166,7 @@ class metadata_cache {
     std::optional<std::chrono::milliseconds>
     get_default_retention_duration() const;
     model::shadow_indexing_mode get_default_shadow_indexing_mode() const;
+    topic_properties get_default_properties() const;
 
 private:
     ss::sharded<topic_table>& _topics_state;

From ce7928904bfea021d90e46d1af9b4e82572c87b7 Mon Sep 17 00:00:00 2001
From: Rob Blafford <rob@vectorized.io>
Date: Wed, 18 May 2022 15:49:22 -0400
Subject: [PATCH 027/201] kafka/server/handlers: Convert properties to map

- Useful to iterate over values to return to client
---
 src/v/kafka/server/handlers/topics/types.cc | 82 +++++++++++++++++++++
 src/v/kafka/server/handlers/topics/types.h  |  1 +
 2 files changed, 83 insertions(+)

diff --git a/src/v/kafka/server/handlers/topics/types.cc b/src/v/kafka/server/handlers/topics/types.cc
index 97362b777d705..2e5f8bfef3c47 100644
--- a/src/v/kafka/server/handlers/topics/types.cc
+++ b/src/v/kafka/server/handlers/topics/types.cc
@@ -143,6 +143,8 @@ to_cluster_type(const creatable_topic& t) {
       config_entries, topic_property_read_replica);
     cfg.properties.read_replica_bucket = get_string_value(
       config_entries, topic_property_read_replica_bucket);
+    /// Final topic_property not decoded here is \ref remote_topic_properties,
+    /// is more of an implementation detail no need to ever show user
 
     auto ret = cluster::custom_assignable_topic_configuration(cfg);
     /**
@@ -164,4 +166,84 @@ to_cluster_type(const creatable_topic& t) {
     return ret;
 }
 
+template<typename T>
+static ss::sstring from_config_type(const T& v) {
+    if constexpr (std::is_enum_v<T>) {
+        return ss::to_sstring(static_cast<std::underlying_type_t<T>>(v));
+    } else if constexpr (std::is_same_v<bool, T>) {
+        return v ? "true" : "false";
+    } else if constexpr (std::is_same_v<T, std::chrono::milliseconds>) {
+        return ss::to_sstring(
+          std::chrono::duration_cast<std::chrono::milliseconds>(v).count());
+    } else {
+        return ss::to_sstring(v);
+    }
+}
+
+config_map_t from_cluster_type(const cluster::topic_properties& properties) {
+    config_map_t config_entries;
+    if (properties.compression) {
+        config_entries[topic_property_compression] = from_config_type(
+          *properties.compression);
+    }
+    if (properties.cleanup_policy_bitflags) {
+        config_entries[topic_property_cleanup_policy] = from_config_type(
+          *properties.cleanup_policy_bitflags);
+    }
+    if (properties.compaction_strategy) {
+        config_entries[topic_property_compaction_strategy] = from_config_type(
+          *properties.compaction_strategy);
+    }
+    if (properties.timestamp_type) {
+        config_entries[topic_property_timestamp_type] = from_config_type(
+          *properties.timestamp_type);
+    }
+    if (properties.segment_size) {
+        config_entries[topic_property_segment_size] = from_config_type(
+          *properties.segment_size);
+    }
+    if (properties.retention_bytes.has_value()) {
+        config_entries[topic_property_retention_bytes] = from_config_type(
+          properties.retention_bytes.value());
+    }
+    if (properties.retention_duration.has_value()) {
+        config_entries[topic_property_retention_duration] = from_config_type(
+          *properties.retention_duration);
+    }
+    if (properties.recovery) {
+        config_entries[topic_property_recovery] = from_config_type(
+          *properties.recovery);
+    }
+    if (properties.shadow_indexing) {
+        config_entries[topic_property_remote_write] = "false";
+        config_entries[topic_property_remote_read] = "false";
+
+        switch (*properties.shadow_indexing) {
+        case model::shadow_indexing_mode::archival:
+            config_entries[topic_property_remote_write] = "true";
+            break;
+        case model::shadow_indexing_mode::fetch:
+            config_entries[topic_property_remote_read] = "true";
+            break;
+        case model::shadow_indexing_mode::full:
+            config_entries[topic_property_remote_write] = "true";
+            config_entries[topic_property_remote_read] = "true";
+            break;
+        default:
+            break;
+        }
+    }
+    if (properties.read_replica) {
+        config_entries[topic_property_read_replica] = from_config_type(
+          *properties.read_replica);
+    }
+    if (properties.read_replica_bucket) {
+        config_entries[topic_property_read_replica_bucket] = from_config_type(
+          *properties.read_replica_bucket);
+    }
+    /// Final topic_property not encoded here is \ref remote_topic_properties,
+    /// is more of an implementation detail no need to ever show user
+    return config_entries;
+}
+
 } // namespace kafka
diff --git a/src/v/kafka/server/handlers/topics/types.h b/src/v/kafka/server/handlers/topics/types.h
index fa96d73862237..13434c4fe0d4a 100644
--- a/src/v/kafka/server/handlers/topics/types.h
+++ b/src/v/kafka/server/handlers/topics/types.h
@@ -113,4 +113,5 @@ config_map_t config_map(const std::vector<createable_topic_config>& config);
 cluster::custom_assignable_topic_configuration
 to_cluster_type(const creatable_topic& t);
 
+config_map_t from_cluster_type(const cluster::topic_properties&);
 } // namespace kafka

From d94bf32c990ce8ab9768a37407817633accb3725 Mon Sep 17 00:00:00 2001
From: Rob Blafford <rob@vectorized.io>
Date: Wed, 18 May 2022 15:49:57 -0400
Subject: [PATCH 028/201] kafka: Return configs to client in create_topics

- KIP-525 expands the request/response protocol to include topic
configuration properties.

- This is mainly a nice-to-have that saves clients from making another
round-trip call for this data.

- Furthermore, when calling create_topics with the `validate_only`
option, the response returned is the default topic config options that
would be used if the topic were actually created.
---
 src/v/kafka/server/handlers/create_topics.cc  | 56 ++++++++++++++++++-
 src/v/kafka/server/handlers/create_topics.h   |  2 +-
 .../server/handlers/topics/topic_utils.h      |  5 +-
 3 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/src/v/kafka/server/handlers/create_topics.cc b/src/v/kafka/server/handlers/create_topics.cc
index db6d45eb9e52b..67b6ee2e667f0 100644
--- a/src/v/kafka/server/handlers/create_topics.cc
+++ b/src/v/kafka/server/handlers/create_topics.cc
@@ -69,6 +69,48 @@ using validators = make_validator_types<
   s3_bucket_is_required_for_read_replica,
   s3_bucket_is_supported_only_for_read_replica>;
 
+static std::vector<creatable_topic_configs>
+properties_to_result_configs(config_map_t config_map) {
+    std::vector<creatable_topic_configs> configs;
+    configs.reserve(config_map.size());
+    std::transform(
+      config_map.begin(),
+      config_map.end(),
+      std::back_inserter(configs),
+      [](auto& cfg) {
+          return creatable_topic_configs{
+            .name = cfg.first,
+            .value = {std::move(cfg.second)},
+            .config_source = kafka::describe_configs_source::default_config,
+          };
+      });
+    return configs;
+}
+
+static void
+append_topic_configs(request_context& ctx, create_topics_response& response) {
+    for (auto& ct_result : response.data.topics) {
+        if (ct_result.error_code != kafka::error_code::none) {
+            ct_result.topic_config_error_code = ct_result.error_code;
+            continue;
+        }
+        auto cfg = ctx.metadata_cache().get_topic_cfg(
+          model::topic_namespace_view{model::kafka_namespace, ct_result.name});
+        if (cfg) {
+            auto config_map = from_cluster_type(cfg->properties);
+            ct_result.configs = {
+              properties_to_result_configs(std::move(config_map))};
+            ct_result.topic_config_error_code = kafka::error_code::none;
+        } else {
+            // Topic was sucessfully created but metadata request did not
+            // succeed, if possible, could mean topic was deleted just after
+            // creation
+            ct_result.topic_config_error_code
+              = kafka::error_code::unknown_server_error;
+        }
+    }
+}
+
 template<>
 ss::future<response_ptr> create_topics_handler::handle(
   request_context ctx, [[maybe_unused]] ss::smp_service_group g) {
@@ -156,8 +198,15 @@ ss::future<response_ptr> create_topics_handler::handle(
                 begin,
                 valid_range_end,
                 std::back_inserter(response.data.topics),
-                [](const creatable_topic& t) {
-                    return generate_successfull_result(t);
+                [&ctx](const creatable_topic& t) {
+                    auto result = generate_successfull_result(t);
+                    if (ctx.header().version >= api_version(5)) {
+                        auto default_properties
+                          = ctx.metadata_cache().get_default_properties();
+                        result.configs = {properties_to_result_configs(
+                          from_cluster_type(default_properties))};
+                    }
+                    return result;
                 });
               return ctx.respond(std::move(response));
           }
@@ -190,6 +239,9 @@ ss::future<response_ptr> create_topics_handler::handle(
                     std::vector<cluster::topic_result> c_res) mutable {
                 // Append controller results to validation errors
                 append_cluster_results(c_res, response.data.topics);
+                if (ctx.header().version >= api_version(5)) {
+                    append_topic_configs(ctx, response);
+                }
                 return ctx.respond(response);
             });
       });
diff --git a/src/v/kafka/server/handlers/create_topics.h b/src/v/kafka/server/handlers/create_topics.h
index 2f2774856173b..e3c3958584566 100644
--- a/src/v/kafka/server/handlers/create_topics.h
+++ b/src/v/kafka/server/handlers/create_topics.h
@@ -14,6 +14,6 @@
 
 namespace kafka {
 
-using create_topics_handler = handler<create_topics_api, 0, 4>;
+using create_topics_handler = handler<create_topics_api, 0, 5>;
 
 }
diff --git a/src/v/kafka/server/handlers/topics/topic_utils.h b/src/v/kafka/server/handlers/topics/topic_utils.h
index 957ecb0cc3816..2023a28e3d310 100644
--- a/src/v/kafka/server/handlers/topics/topic_utils.h
+++ b/src/v/kafka/server/handlers/topics/topic_utils.h
@@ -45,7 +45,10 @@ template<typename T>
 requires TopicRequestItem<T> creatable_topic_result
 generate_error(T item, error_code code, const ss::sstring& msg) {
     return creatable_topic_result{
-      .name = item.name, .error_code = code, .error_message = msg};
+      .name = item.name,
+      .error_code = code,
+      .error_message = msg,
+      .topic_config_error_code = code};
 }
 
 /// Generates successfull creatable_topic_result for single topic request item

From f3190df75159833587e028d8a8b8b2c58394a02b Mon Sep 17 00:00:00 2001
From: Rob Blafford <rob@vectorized.io>
Date: Wed, 8 Jun 2022 13:16:42 -0400
Subject: [PATCH 029/201] kafka/s/tests: create_topics_v5 config resp tests

- Ensures that the KIP-525 configs returned in create_topics v5 responses
are correct both in the normal scenario and when the validate_only flag
is passed.
---
 src/v/kafka/server/handlers/topics/types.cc   | 15 +++-
 src/v/kafka/server/handlers/topics/types.h    |  1 +
 .../kafka/server/tests/create_topics_test.cc  | 79 ++++++++++++++++++-
 3 files changed, 91 insertions(+), 4 deletions(-)

diff --git a/src/v/kafka/server/handlers/topics/types.cc b/src/v/kafka/server/handlers/topics/types.cc
index 2e5f8bfef3c47..8eda38c9996b6 100644
--- a/src/v/kafka/server/handlers/topics/types.cc
+++ b/src/v/kafka/server/handlers/topics/types.cc
@@ -33,7 +33,12 @@
 
 namespace kafka {
 
-config_map_t config_map(const std::vector<createable_topic_config>& config) {
+template<typename T>
+concept CreatableTopicCfg = std::is_same_v<T, creatable_topic_configs> || std::
+  is_same_v<T, createable_topic_config>;
+
+template<CreatableTopicCfg T>
+config_map_t make_config_map(const std::vector<T>& config) {
     config_map_t ret;
     ret.reserve(config.size());
     for (const auto& c : config) {
@@ -44,6 +49,14 @@ config_map_t config_map(const std::vector<createable_topic_config>& config) {
     return ret;
 }
 
+config_map_t config_map(const std::vector<createable_topic_config>& config) {
+    return make_config_map(config);
+}
+
+config_map_t config_map(const std::vector<creatable_topic_configs>& config) {
+    return make_config_map(config);
+}
+
 // Either parse configuration or return nullopt
 template<typename T>
 static std::optional<T>
diff --git a/src/v/kafka/server/handlers/topics/types.h b/src/v/kafka/server/handlers/topics/types.h
index 13434c4fe0d4a..b2a66303ebe43 100644
--- a/src/v/kafka/server/handlers/topics/types.h
+++ b/src/v/kafka/server/handlers/topics/types.h
@@ -109,6 +109,7 @@ from_cluster_topic_result(const cluster::topic_result& err) {
 }
 
 config_map_t config_map(const std::vector<createable_topic_config>& config);
+config_map_t config_map(const std::vector<creatable_topic_configs>& config);
 
 cluster::custom_assignable_topic_configuration
 to_cluster_type(const creatable_topic& t);
diff --git a/src/v/kafka/server/tests/create_topics_test.cc b/src/v/kafka/server/tests/create_topics_test.cc
index ac7596021fadc..afb0532b41eec 100644
--- a/src/v/kafka/server/tests/create_topics_test.cc
+++ b/src/v/kafka/server/tests/create_topics_test.cc
@@ -9,6 +9,7 @@
 
 #include "kafka/protocol/create_topics.h"
 #include "kafka/protocol/metadata.h"
+#include "kafka/server/handlers/topics/types.h"
 #include "redpanda/tests/fixture.h"
 #include "resource_mgmt/io_priority.h"
 #include "s3_imposter_fixture.h"
@@ -97,17 +98,18 @@ class create_topic_fixture
     void test_create_topic(
       kafka::create_topics_request req,
       std::optional<int> partition_count = std::nullopt,
-      std::optional<int> revision_id = std::nullopt) {
+      std::optional<int> revision_id = std::nullopt,
+      kafka::api_version version = kafka::api_version(2)) {
         auto client = make_kafka_client().get0();
         client.connect().get();
-        auto resp = client.dispatch(req, kafka::api_version(2)).get0();
+        auto resp = client.dispatch(req, version).get0();
 
         // todo: here
         for (auto req : get_requests()) {
             vlog(test_log.info, "{} {}", req._method, req._url);
         }
 
-        BOOST_TEST(
+        BOOST_REQUIRE_MESSAGE(
           std::all_of(
             std::cbegin(resp.data.topics),
             std::cend(resp.data.topics),
@@ -118,6 +120,15 @@ class create_topic_fixture
 
         for (auto& topic : req.data.topics) {
             verify_metadata(client, req, topic, partition_count, revision_id);
+
+            auto it = std::find_if(
+              resp.data.topics.begin(),
+              resp.data.topics.end(),
+              [name = topic.name](const auto& t) { return t.name == name; });
+
+            BOOST_CHECK(it != resp.data.topics.end());
+            verify_response(topic, *it, version, req.data.validate_only);
+
             // TODO: one we combine the cluster fixture with the redpanda
             // fixture and enable multiple RP instances to run at the same time
             // in the test, then we should create two clients in this test where
@@ -134,6 +145,44 @@ class create_topic_fixture
         test_create_topic(req, partition_count, revision_id);
     }
 
+    void verify_response(
+      const kafka::creatable_topic& req,
+      const kafka::creatable_topic_result& topic_res,
+      kafka::api_version version,
+      bool validate_only) {
+        if (version < kafka::api_version(5)) {
+            /// currently this method only verifies configurations in v5
+            /// responses
+            return;
+        }
+        if (validate_only) {
+            /// Server should return default configs
+            BOOST_TEST(topic_res.configs, "empty config response");
+            auto cfg_map = config_map(*topic_res.configs);
+            const auto default_topic_properties = kafka::from_cluster_type(
+              app.metadata_cache.local().get_default_properties());
+            BOOST_TEST(
+              cfg_map == default_topic_properties,
+              "incorrect default properties");
+            BOOST_CHECK_EQUAL(
+              topic_res.topic_config_error_code, kafka::error_code::none);
+            return;
+        }
+        if (req.configs.empty()) {
+            /// no custom configs were passed
+            return;
+        }
+        BOOST_TEST(topic_res.configs, "Expecting configs");
+        auto resp_cfgs = kafka::config_map(*topic_res.configs);
+        auto cfg = app.metadata_cache.local().get_topic_cfg(
+          model::topic_namespace_view{model::kafka_namespace, topic_res.name});
+        BOOST_TEST(cfg, "missing topic config");
+        auto config_map = kafka::from_cluster_type(cfg->properties);
+        BOOST_TEST(config_map == resp_cfgs, "configs didn't match");
+        BOOST_CHECK_EQUAL(
+          topic_res.topic_config_error_code, kafka::error_code::none);
+    }
+
     void test_create_non_replicable_topic(
       model::topic src, kafka::create_topics_request req) {
         std::vector<cluster::non_replicable_topic> non_reps;
@@ -425,3 +474,27 @@ FIXTURE_TEST(read_replica_and_remote_write, create_topic_fixture) {
       == "remote read and write are not supported for read replicas");
     BOOST_CHECK(resp.data.topics[0].name == "topic1");
 }
+
+FIXTURE_TEST(test_v5_validate_configs_resp, create_topic_fixture) {
+    wait_for_controller_leadership().get();
+
+    /// Test conditions in create_topic_fixture::verify_metadata will run
+    test_create_topic(
+      make_req({make_topic("topicA"), make_topic("topicB")}, true),
+      kafka::api_version(5));
+
+    /// Test create topic with custom configs, verify that they have been set
+    /// and correctly returned in response
+    std::map<ss::sstring, ss::sstring> config_map{
+      {ss::sstring(kafka::topic_property_retention_bytes), "1234567"},
+      {ss::sstring(kafka::topic_property_segment_size), "7654321"}};
+
+    test_create_topic(
+      make_req(
+        {make_topic("topicC", 3, 1, config_map),
+         make_topic("topicD", 3, 1, config_map)},
+        false),
+      std::nullopt,
+      std::nullopt,
+      kafka::api_version(5));
+}

From cc5c2373399ab41634bd614358e5b8da8f662030 Mon Sep 17 00:00:00 2001
From: Rob Blafford <rob@vectorized.io>
Date: Mon, 27 Jun 2022 12:56:27 -0400
Subject: [PATCH 030/201] kafka: Add v::coproc to link deps of v::kafka

- When compiling this branch, the following linker error is observed:
ld.lld: error: undefined symbol: coproc::partition_manager::get(model::ntp const&) const
>>> referenced by partition_proxy.cc:29 (/home/robert/workspace/redpanda/src/v/kafka/server/partition_proxy.cc:29)
>>>               partition_proxy.cc.o:(kafka::make_partition_proxy(model::ntp const&, cluster::partition_manager&, coproc::partition_manager&)) in archive lib/libv_v_kafka.a
- Add v::coproc to the link dependencies of v::kafka, since
partition_proxy.cc references coproc::partition_manager.
---
 src/v/kafka/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/v/kafka/CMakeLists.txt b/src/v/kafka/CMakeLists.txt
index c29967aec406b..6e8da95e85afc 100644
--- a/src/v/kafka/CMakeLists.txt
+++ b/src/v/kafka/CMakeLists.txt
@@ -67,6 +67,7 @@ v_cc_library(
     Seastar::seastar
     v::bytes
     v::rpc
+    v::coproc
     v::cluster
     v::kafka_protocol
     v::security

From eebf5131e366dfa8b7dfb87dd7bdf675a0519050 Mon Sep 17 00:00:00 2001
From: nm <nagamocha@gmail.com>
Date: Tue, 12 Jul 2022 00:23:37 +0300
Subject: [PATCH 031/201] rpk: fix panic on cluster config export

When the file cannot be created, the File object returned is nil.
Therefore when printing out the error message, rather than call Name()
on the file object to retrieve the filename, just use the filename
string directly.
---
 src/go/rpk/pkg/cli/cmd/cluster/config/export.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/go/rpk/pkg/cli/cmd/cluster/config/export.go b/src/go/rpk/pkg/cli/cmd/cluster/config/export.go
index 4e9fb008cbe0c..6d11b999517f4 100644
--- a/src/go/rpk/pkg/cli/cmd/cluster/config/export.go
+++ b/src/go/rpk/pkg/cli/cmd/cluster/config/export.go
@@ -166,11 +166,12 @@ to include all properties including these low level tunables.
 			var file *os.File
 			if filename == "" {
 				file, err = ioutil.TempFile("/tmp", "config_*.yaml")
+				filename = "/tmp/config_*.yaml"
 			} else {
 				file, err = os.Create(filename)
 			}
 
-			out.MaybeDie(err, "unable to create file %q: %v", file.Name(), err)
+			out.MaybeDie(err, "unable to create file %q: %v", filename, err)
 			err = exportConfig(file, schema, currentConfig, *all)
 			out.MaybeDie(err, "failed to write out config %q: %v", file.Name(), err)
 			err = file.Close()

From 08d112a8552ef0aa9dd920424032a95dc01ffbbf Mon Sep 17 00:00:00 2001
From: John Spray <jcs@redpanda.com>
Date: Fri, 24 Jun 2022 22:40:26 +0100
Subject: [PATCH 032/201] cluster: add function for creating bootstrap user

If the RP_BOOTSTRAP_USER environment variable is set,
parse it as user:password and write the user account
to the credentials store.
---
 src/v/cluster/security_frontend.cc | 56 ++++++++++++++++++++++++++++++
 src/v/cluster/security_frontend.h  |  2 ++
 2 files changed, 58 insertions(+)

diff --git a/src/v/cluster/security_frontend.cc b/src/v/cluster/security_frontend.cc
index 18deff751ace7..32ef272f7d65c 100644
--- a/src/v/cluster/security_frontend.cc
+++ b/src/v/cluster/security_frontend.cc
@@ -30,6 +30,7 @@
 #include "rpc/errc.h"
 #include "rpc/types.h"
 #include "security/authorizer.h"
+#include "security/scram_algorithm.h"
 
 #include <seastar/core/coroutine.hh>
 
@@ -308,4 +309,59 @@ security_frontend::dispatch_delete_acls_to_leader(
       });
 }
 
+/**
+ * For use during cluster creation, if RP_BOOTSTRAP_USER is set
+ * then write a user creation message to the controller log.
+ *
+ * @returns an error code if controller log write failed.  If the
+ *          environment variable is missing or malformed this is
+ *          not considered an error.
+ *
+ */
+ss::future<std::error_code> security_frontend::maybe_create_bootstrap_user() {
+    static const ss::sstring bootstrap_user_env_key{"RP_BOOTSTRAP_USER"};
+
+    auto creds_str_ptr = std::getenv(bootstrap_user_env_key.c_str());
+    if (creds_str_ptr == nullptr) {
+        // Environment variable is not set
+        co_return errc::success;
+    }
+
+    ss::sstring creds_str = creds_str_ptr;
+    auto colon = creds_str.find(":");
+    if (colon == ss::sstring::npos || colon == creds_str.size() - 1) {
+        // Malformed value.  Do not log the value, it may be malformed
+        // but it is still a secret.
+        vlog(
+          clusterlog.warn,
+          "Invalid value of {} (expected \"username:password\")",
+          bootstrap_user_env_key);
+        co_return errc::success;
+    }
+
+    auto username = security::credential_user{creds_str.substr(0, colon)};
+    auto password = creds_str.substr(colon + 1);
+    auto credentials = security::scram_sha256::make_credentials(
+      password, security::scram_sha256::min_iterations);
+
+    auto err = co_await create_user(
+      username, credentials, model::timeout_clock::now() + 5s);
+
+    if (err) {
+        vlog(
+          clusterlog.warn,
+          "Failed to apply {}: {}",
+          bootstrap_user_env_key,
+          err.message());
+    } else {
+        vlog(
+          clusterlog.info,
+          "Created user '{}' via {}",
+          username,
+          bootstrap_user_env_key);
+    }
+
+    co_return err;
+}
+
 } // namespace cluster
diff --git a/src/v/cluster/security_frontend.h b/src/v/cluster/security_frontend.h
index ae4924b39af9e..527143dcb4cce 100644
--- a/src/v/cluster/security_frontend.h
+++ b/src/v/cluster/security_frontend.h
@@ -54,6 +54,8 @@ class security_frontend final {
       std::vector<security::acl_binding_filter>,
       model::timeout_clock::duration);
 
+    ss::future<std::error_code> maybe_create_bootstrap_user();
+
 private:
     ss::future<std::vector<errc>> do_create_acls(
       std::vector<security::acl_binding>, model::timeout_clock::duration);

From 954c629c6f2126615b91bdbd84deef712a6b2f25 Mon Sep 17 00:00:00 2001
From: John Spray <jcs@redpanda.com>
Date: Fri, 24 Jun 2022 23:28:10 +0100
Subject: [PATCH 033/201] cluster: bootstrap user creation during cluster
 creation

---
 src/v/cluster/controller.cc | 36 ++++++++++++++++++++++++++++++++++++
 src/v/cluster/controller.h  |  2 +-
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/src/v/cluster/controller.cc b/src/v/cluster/controller.cc
index de6aae85ad68d..05ffbc0847008 100644
--- a/src/v/cluster/controller.cc
+++ b/src/v/cluster/controller.cc
@@ -233,6 +233,7 @@ ss::future<> controller::start() {
               return stm.wait(stm.bootstrap_last_applied(), model::no_timeout);
           });
       })
+      .then([this] { return cluster_creation_hook(); })
       .then(
         [this] { return _backend.invoke_on_all(&controller_backend::start); })
       .then([this] {
@@ -384,4 +385,39 @@ ss::future<> controller::stop() {
     });
 }
 
+/**
+ * This function provides for writing the controller log immediately
+ * after it has been created, before anything else has been written
+ * to it, and before we have started communicating with peers.
+ */
+ss::future<> controller::cluster_creation_hook() {
+    if (!config::node().seed_servers().empty()) {
+        // We are not on the root node
+        co_return;
+    } else if (
+      _raft0->last_visible_index() > model::offset{}
+      || _raft0->config().brokers().size() > 1) {
+        // The controller log has already been written to
+        co_return;
+    }
+
+    // Internal RPC does not start until after controller startup
+    // is complete (we are called during controller startup), so
+    // it is guaranteed that if we were single node/empty controller
+    // log at start of this function, we will still be in that state
+    // here.  The wait for leadership is really just a wait for the
+    // consensus object to finish writing its last_voted_for from
+    // its self-vote.
+    while (!_raft0->is_leader()) {
+        co_await ss::sleep(100ms);
+    }
+
+    auto err
+      = co_await _security_frontend.local().maybe_create_bootstrap_user();
+    vassert(
+      err == errc::success,
+      "Controller write should always succeed in single replica state during "
+      "creation");
+}
+
 } // namespace cluster
diff --git a/src/v/cluster/controller.h b/src/v/cluster/controller.h
index 8a285cd5c6b50..dfc6a7b0993d2 100644
--- a/src/v/cluster/controller.h
+++ b/src/v/cluster/controller.h
@@ -118,7 +118,7 @@ class controller {
 private:
     friend controller_probe;
 
-private:
+    ss::future<> cluster_creation_hook();
     config_manager::preload_result _config_preload;
 
     ss::sharded<ss::abort_source> _as;                     // instance per core

From 442291a98cbb2d29c131bd9430910f252ede4288 Mon Sep 17 00:00:00 2001
From: John Spray <jcs@redpanda.com>
Date: Tue, 28 Jun 2022 14:26:07 +0100
Subject: [PATCH 034/201] tests: enable custom superuser in RedpandaService

This enables writing a test that bootstraps a superuser.

- Use the authenticated admin API client in registered()
- Pass authentication params into librdkafka constructor
  in registered()
- Do not create the superuser if one was passed in at
  construction time.
---
 tests/rptest/services/redpanda.py | 39 +++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/tests/rptest/services/redpanda.py b/tests/rptest/services/redpanda.py
index 981ab71e1d68a..bb97a8a8b7bcf 100644
--- a/tests/rptest/services/redpanda.py
+++ b/tests/rptest/services/redpanda.py
@@ -443,7 +443,8 @@ def __init__(self,
                  environment: Optional[dict[str, str]] = None,
                  security: SecurityConfig = SecurityConfig(),
                  node_ready_timeout_s=None,
-                 enable_installer=False):
+                 enable_installer=False,
+                 superuser: Optional[SaslCredentials] = None):
         super(RedpandaService, self).__init__(context, num_nodes=num_brokers)
         self._context = context
         self._enable_rp = enable_rp
@@ -455,6 +456,16 @@ def __init__(self,
         if enable_installer:
             self._installer = RedpandaInstaller(self)
 
+        if superuser is None:
+            superuser = self.SUPERUSER_CREDENTIALS
+            self._skip_create_superuser = False
+        else:
+            # When we are passed explicit superuser credentials, presume that the caller
+            # is taking care of user creation themselves (e.g. when testing credential bootstrap)
+            self._skip_create_superuser = True
+
+        self._superuser = superuser
+
         if node_ready_timeout_s is None:
             node_ready_timeout_s = RedpandaService.DEFAULT_NODE_READY_TIMEOUT_SEC
         self.node_ready_timeout_s = node_ready_timeout_s
@@ -469,10 +480,9 @@ def __init__(self,
         else:
             self._log_level = log_level
 
-        self._admin = Admin(self)
         self._admin = Admin(self,
-                            auth=(self.SUPERUSER_CREDENTIALS.username,
-                                  self.SUPERUSER_CREDENTIALS.password))
+                            auth=(self._superuser.username,
+                                  self._superuser.password))
         self._started = []
         self._security_config = dict()
 
@@ -607,7 +617,8 @@ def start(self, nodes=None, clean_nodes=True):
         if self._start_duration_seconds < 0:
             self._start_duration_seconds = time.time() - self._start_time
 
-        self._admin.create_user(*self.SUPERUSER_CREDENTIALS)
+        if not self._skip_create_superuser:
+            self._admin.create_user(*self._superuser)
 
         self.logger.info("Waiting for all brokers to join cluster")
         expected = set(self._started)
@@ -629,7 +640,7 @@ def start(self, nodes=None, clean_nodes=True):
                 raise RuntimeError("Unexpected files in data directory")
 
         if self.sasl_enabled():
-            username, password, algorithm = self.SUPERUSER_CREDENTIALS
+            username, password, algorithm = self._superuser
             self._security_config = dict(security_protocol='SASL_PLAINTEXT',
                                          sasl_mechanism=algorithm,
                                          sasl_plain_username=username,
@@ -1268,7 +1279,7 @@ def write_node_conf_file(self, node, override_cfg_params=None):
                            enable_rp=self._enable_rp,
                            enable_pp=self._enable_pp,
                            enable_sr=self._enable_sr,
-                           superuser=self.SUPERUSER_CREDENTIALS,
+                           superuser=self._superuser,
                            sasl_enabled=self.sasl_enabled())
 
         if override_cfg_params or self._extra_node_conf[node]:
@@ -1355,10 +1366,9 @@ def registered(self, node):
         # the node is stored in raft0 AND has been replayed on all nodes.  Otherwise
         # a kafka metadata request to the last node to join could return incomplete
         # metadata and cause strange issues within a test.
-        admin = Admin(self)
         for peer in self._started:
             try:
-                admin_brokers = admin.get_brokers(node=peer)
+                admin_brokers = self._admin.get_brokers(node=peer)
             except requests.exceptions.RequestException as e:
                 # We run during startup, when admin API may not even be listening yet: tolerate
                 # API errors but presume that if some APIs are not up yet, then node registration
@@ -1388,7 +1398,16 @@ def registered(self, node):
                     f"registered: node {node.name} now visible in peer {peer.name}'s broker list ({admin_brokers})"
                 )
 
-        client = PythonLibrdkafka(self, tls_cert=self._tls_cert)
+        auth_args = {}
+        if self.sasl_enabled():
+            auth_args = {
+                'username': self._superuser.username,
+                'password': self._superuser.password,
+                'algorithm': self._superuser.algorithm
+            }
+
+        client = PythonLibrdkafka(self, tls_cert=self._tls_cert, **auth_args)
+
         brokers = client.brokers()
         broker = brokers.get(idx, None)
         if broker is None:

From 76bede975b59e30022cfedbb0f1e9216659b4dc7 Mon Sep 17 00:00:00 2001
From: John Spray <jcs@redpanda.com>
Date: Tue, 28 Jun 2022 14:27:32 +0100
Subject: [PATCH 035/201] tests: add Admin.update_user helper

Wraps the admin API endpoint of the same name, for
changing a user's password.
---
 tests/rptest/services/admin.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/rptest/services/admin.py b/tests/rptest/services/admin.py
index 539dccece0807..db35abdd52e17 100644
--- a/tests/rptest/services/admin.py
+++ b/tests/rptest/services/admin.py
@@ -564,6 +564,18 @@ def delete_user(self, username):
 
         self._request("delete", path)
 
+    def update_user(self, username, password, algorithm):
+        self.redpanda.logger.info(
+            f"Updating user {username}:{password}:{algorithm}")
+
+        self._request("PUT",
+                      f"security/users/{username}",
+                      json=dict(
+                          username=username,
+                          password=password,
+                          algorithm=algorithm,
+                      ))
+
     def list_users(self, node=None):
         return self._request("get", "security/users", node=node).json()
 

From b4fe6136fc8e22757a978897e32edb0fd6f60ba7 Mon Sep 17 00:00:00 2001
From: John Spray <jcs@redpanda.com>
Date: Fri, 24 Jun 2022 23:29:22 +0100
Subject: [PATCH 036/201] tests: add ScramBootstrapUserTest

---
 tests/rptest/tests/scram_test.py | 66 +++++++++++++++++++++++++++++++-
 1 file changed, 65 insertions(+), 1 deletion(-)

diff --git a/tests/rptest/tests/scram_test.py b/tests/rptest/tests/scram_test.py
index 55bd61538acc4..1e7f1bfc9c5ab 100644
--- a/tests/rptest/tests/scram_test.py
+++ b/tests/rptest/tests/scram_test.py
@@ -19,7 +19,8 @@
 from rptest.clients.types import TopicSpec
 from rptest.clients.python_librdkafka import PythonLibrdkafka
 from rptest.services.admin import Admin
-from rptest.services.redpanda import SecurityConfig
+from rptest.services.redpanda import SecurityConfig, SaslCredentials, SecurityConfig
+from rptest.util import expect_http_error
 
 
 class ScramTest(RedpandaTest):
@@ -299,3 +300,66 @@ def test_enable_sasl_live(self):
 
         # An unauthenticated client should be accepted again
         assert len(unauthenticated_client.topics()) == 1
+
+
+class ScramBootstrapUserTest(RedpandaTest):
+    BOOTSTRAP_USERNAME = 'bob'
+    BOOTSTRAP_PASSWORD = 'sekrit'
+
+    def __init__(self, *args, **kwargs):
+        # Configure the cluster as a user might configure it for secure
+        # bootstrap: i.e. all auth turned on from moment of creation.
+
+        security_config = SecurityConfig()
+        security_config.enable_sasl = True
+
+        super().__init__(
+            *args,
+            environment={
+                'RP_BOOTSTRAP_USER':
+                f'{self.BOOTSTRAP_USERNAME}:{self.BOOTSTRAP_PASSWORD}'
+            },
+            extra_rp_conf={
+                'enable_sasl': True,
+                'admin_api_require_auth': True,
+                'superusers': ['bob']
+            },
+            security=security_config,
+            superuser=SaslCredentials(self.BOOTSTRAP_USERNAME,
+                                      self.BOOTSTRAP_PASSWORD,
+                                      "SCRAM-SHA-256"),
+            **kwargs)
+
+    @cluster(num_nodes=3)
+    def test_bootstrap_user(self):
+        # Anonymous access should be refused
+        admin = Admin(self.redpanda)
+        with expect_http_error(403):
+            admin.list_users()
+
+        # Access using the bootstrap credentials should succeed
+        admin = Admin(self.redpanda,
+                      auth=(self.BOOTSTRAP_USERNAME, self.BOOTSTRAP_PASSWORD))
+        assert self.BOOTSTRAP_USERNAME in admin.list_users()
+
+        # Modify the bootstrap user's credential
+        admin.update_user(self.BOOTSTRAP_USERNAME, "newpassword",
+                          "SCRAM-SHA-256")
+
+        # We do not have a hook for synchronously waiting for a credential update to propagate
+        time.sleep(5)
+
+        # Using old password should fail
+        with expect_http_error(401):
+            admin.list_users()
+
+        # Using new credential should succeed
+        admin = Admin(self.redpanda,
+                      auth=(self.BOOTSTRAP_USERNAME, 'newpassword'))
+        admin.list_users()
+
+        # Modified credential should survive a restart: this verifies that
+        # the RP_BOOTSTRAP_USER setting does not fight with changes made
+        # by other means.
+        self.redpanda.restart_nodes(self.redpanda.nodes)
+        admin.list_users()

From 9cbc9af22c9794e7289e7d19bc16d6633823284f Mon Sep 17 00:00:00 2001
From: John Spray <jcs@redpanda.com>
Date: Tue, 5 Jul 2022 14:02:39 +0100
Subject: [PATCH 037/201] tests/scram_test: use wait_until for waiting for
 credentials to propagate

This makes us a bit more tolerant of slowness and also a bit
faster when things are propagating in milliseconds the way
we expect.
---
 tests/rptest/tests/scram_test.py | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/tests/rptest/tests/scram_test.py b/tests/rptest/tests/scram_test.py
index 1e7f1bfc9c5ab..d0b7603b62fe9 100644
--- a/tests/rptest/tests/scram_test.py
+++ b/tests/rptest/tests/scram_test.py
@@ -10,9 +10,11 @@
 import socket
 import string
 import requests
+from requests.exceptions import HTTPError
 import time
 
 from ducktape.mark import parametrize
+from ducktape.utils.util import wait_until
 
 from rptest.services.cluster import cluster
 from rptest.tests.redpanda_test import RedpandaTest
@@ -330,6 +332,26 @@ def __init__(self, *args, **kwargs):
                                       "SCRAM-SHA-256"),
             **kwargs)
 
+    def _check_http_status_everywhere(self, expect_status, callable):
+        """
+        Check that the callback results in an HTTP error with the
+        given status code from all nodes in the cluster.  This enables
+        checking that auth state has propagated as expected.
+
+        :returns: true if all nodes throw an error with the expected status code
+        """
+
+        for n in self.redpanda.nodes:
+            try:
+                callable(n)
+            except HTTPError as e:
+                if e.response.status_code != expect_status:
+                    return False
+            else:
+                return False
+
+        return True
+
     @cluster(num_nodes=3)
     def test_bootstrap_user(self):
         # Anonymous access should be refused
@@ -346,8 +368,12 @@ def test_bootstrap_user(self):
         admin.update_user(self.BOOTSTRAP_USERNAME, "newpassword",
                           "SCRAM-SHA-256")
 
-        # We do not have a hook for synchronously waiting for a credential update to propagate
-        time.sleep(5)
+        # Getting 401 with old credentials everywhere will show that the
+        # credential update has propagated to all nodes
+        wait_until(lambda: self._check_http_status_everywhere(
+            401, lambda n: admin.list_users(node=n)),
+                   timeout_sec=10,
+                   backoff_sec=0.5)
 
         # Using old password should fail
         with expect_http_error(401):

From 351beaabfd599eed177f665e9dc1f2bf8c9621f3 Mon Sep 17 00:00:00 2001
From: Elena Anyusheva <lena@vectorized.io>
Date: Mon, 27 Jun 2022 19:45:25 +0200
Subject: [PATCH 038/201] cluster, storage: add read_replica to ntp config

cluster::partition will use info from ntp config to determine if
partition is part of read replica topic

Co-authored-by: Eugene Lazin <evgeny@vectorized.io>
---
 src/v/cluster/types.cc     | 7 +++++--
 src/v/storage/ntp_config.h | 7 +++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/v/cluster/types.cc b/src/v/cluster/types.cc
index 9769e8e07a190..c697cdeeb1d20 100644
--- a/src/v/cluster/types.cc
+++ b/src/v/cluster/types.cc
@@ -45,7 +45,8 @@ bool topic_properties::has_overrides() const {
     return cleanup_policy_bitflags || compaction_strategy || segment_size
            || retention_bytes.has_value() || retention_bytes.is_disabled()
            || retention_duration.has_value() || retention_duration.is_disabled()
-           || recovery.has_value() || shadow_indexing.has_value();
+           || recovery.has_value() || shadow_indexing.has_value()
+           || read_replica.has_value();
 }
 
 storage::ntp_config::default_overrides
@@ -59,6 +60,7 @@ topic_properties::get_ntp_cfg_overrides() const {
     ret.shadow_indexing_mode = shadow_indexing
                                  ? *shadow_indexing
                                  : model::shadow_indexing_mode::disabled;
+    ret.read_replica = read_replica;
     return ret;
 }
 
@@ -91,7 +93,8 @@ storage::ntp_config topic_configuration::make_ntp_config(
               properties.recovery ? *properties.recovery : false),
             .shadow_indexing_mode = properties.shadow_indexing
                                       ? *properties.shadow_indexing
-                                      : model::shadow_indexing_mode::disabled});
+                                      : model::shadow_indexing_mode::disabled,
+            .read_replica = properties.read_replica});
     }
     return {
       model::ntp(tp_ns.ns, tp_ns.tp, p_id),
diff --git a/src/v/storage/ntp_config.h b/src/v/storage/ntp_config.h
index 4a0e6113a0570..b9e205840e366 100644
--- a/src/v/storage/ntp_config.h
+++ b/src/v/storage/ntp_config.h
@@ -47,6 +47,8 @@ class ntp_config {
         model::shadow_indexing_mode shadow_indexing_mode
           = model::shadow_indexing_mode::disabled;
 
+        std::optional<bool> read_replica;
+
         friend std::ostream&
         operator<<(std::ostream&, const default_overrides&);
     };
@@ -150,6 +152,11 @@ class ntp_config {
                && model::is_fetch_enabled(_overrides->shadow_indexing_mode);
     }
 
+    bool is_read_replica_mode_enabled() const {
+        return _overrides != nullptr && _overrides->read_replica
+               && _overrides->read_replica.value();
+    }
+
 private:
     model::ntp _ntp;
     /// \brief currently this is the basedir. In the future

From d6b5ef671ab15bc9074429b4e410965dcb45cd30 Mon Sep 17 00:00:00 2001
From: Elena Anyusheva <lena@vectorized.io>
Date: Mon, 27 Jun 2022 19:58:03 +0200
Subject: [PATCH 039/201] cluster: add is_read_replica_mode_enabled to
 partition

partition should know if it's part of read replica topic because code
path for start/last offset and high_watermark depends on that.

Co-authored-by: Eugene Lazin <evgeny@vectorized.io>
---
 src/v/cluster/partition.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/v/cluster/partition.h b/src/v/cluster/partition.h
index 81107c6d0beb6..eb8fb0d806efe 100644
--- a/src/v/cluster/partition.h
+++ b/src/v/cluster/partition.h
@@ -220,6 +220,11 @@ class partition {
         return _archival_meta_stm;
     }
 
+    bool is_read_replica_mode_enabled() const {
+        const auto& cfg = _raft->log_config();
+        return cfg.is_read_replica_mode_enabled();
+    }
+
     /// Return true if shadow indexing is enabled for the partition
     bool is_remote_fetch_enabled() const {
         const auto& cfg = _raft->log_config();

From 7586e0229b73a090f8be9019d1cd5fd16e5e4e8d Mon Sep 17 00:00:00 2001
From: Elena Anyusheva <lena@vectorized.io>
Date: Mon, 27 Jun 2022 14:26:47 +0200
Subject: [PATCH 040/201] cluster: add last cloud offset to partition

The last cloud offset will be used to determine high_watermark and
last_stable_offset for read replica topics.

Co-authored-by: Eugene Lazin <evgeny@vectorized.io>
---
 src/v/cloud_storage/remote_partition.cc |  8 ++++++++
 src/v/cloud_storage/remote_partition.h  |  3 +++
 src/v/cluster/partition.h               | 15 +++++++++++++--
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/v/cloud_storage/remote_partition.cc b/src/v/cloud_storage/remote_partition.cc
index efd96d33986b0..ef90f003ad621 100644
--- a/src/v/cloud_storage/remote_partition.cc
+++ b/src/v/cloud_storage/remote_partition.cc
@@ -459,6 +459,14 @@ model::offset remote_partition::first_uploaded_offset() {
     }
 }
 
+model::offset remote_partition::last_uploaded_offset() {
+    vassert(
+      _manifest.size() > 0,
+      "The manifest for {} is not expected to be empty",
+      _manifest.get_ntp());
+    return _manifest.get_last_offset();
+}
+
 const model::ntp& remote_partition::get_ntp() const {
     return _manifest.get_ntp();
 }
diff --git a/src/v/cloud_storage/remote_partition.h b/src/v/cloud_storage/remote_partition.h
index f622daec685ec..929c9010c858f 100644
--- a/src/v/cloud_storage/remote_partition.h
+++ b/src/v/cloud_storage/remote_partition.h
@@ -180,6 +180,9 @@ class remote_partition
     /// Return first uploaded kafka offset
     model::offset first_uploaded_offset();
 
+    /// Return last uploaded kafka offset
+    model::offset last_uploaded_offset();
+
     /// Get partition NTP
     const model::ntp& get_ntp() const;
 
diff --git a/src/v/cluster/partition.h b/src/v/cluster/partition.h
index eb8fb0d806efe..87002b7795d4c 100644
--- a/src/v/cluster/partition.h
+++ b/src/v/cluster/partition.h
@@ -246,17 +246,28 @@ class partition {
     model::offset start_cloud_offset() const {
         vassert(
           cloud_data_available(),
-          "Method can only be called if cloud data is available");
+          "Method can only be called if cloud data is available, ntp: {}",
+          _raft->ntp());
         return _cloud_storage_partition->first_uploaded_offset();
     }
 
+    /// Last available cloud offset
+    model::offset last_cloud_offset() const {
+        vassert(
+          cloud_data_available(),
+          "Method can only be called if cloud data is available, ntp: {}",
+          _raft->ntp());
+        return _cloud_storage_partition->last_uploaded_offset();
+    }
+
     /// Create a reader that will fetch data from remote storage
     ss::future<storage::translating_reader> make_cloud_reader(
       storage::log_reader_config config,
       std::optional<model::timeout_clock::time_point> deadline = std::nullopt) {
         vassert(
           cloud_data_available(),
-          "Method can only be called if cloud data is available");
+          "Method can only be called if cloud data is available, ntp: {}",
+          _raft->ntp());
         return _cloud_storage_partition->make_reader(config, deadline);
     }
 

From 8b31b4027514de4f1c2086bb4dcc9fbff0263cb1 Mon Sep 17 00:00:00 2001
From: Elena Anyusheva <lena@vectorized.io>
Date: Mon, 27 Jun 2022 14:41:30 +0200
Subject: [PATCH 041/201] kafka: use cloud offsets for read replicas

For read replica topics start_offset, high_watermark and
last_stable_offset will return data from cloud skipping offset
translation.

Co-authored-by: Eugene Lazin <evgeny@vectorized.io>
---
 src/v/kafka/server/replicated_partition.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/src/v/kafka/server/replicated_partition.h b/src/v/kafka/server/replicated_partition.h
index 5c4379762f680..187a2f4e89ce5 100644
--- a/src/v/kafka/server/replicated_partition.h
+++ b/src/v/kafka/server/replicated_partition.h
@@ -37,6 +37,13 @@ class replicated_partition final : public kafka::partition_proxy::impl {
     const model::ntp& ntp() const final { return _partition->ntp(); }
 
     model::offset start_offset() const final {
+        if (
+          _partition->is_read_replica_mode_enabled()
+          && _partition->cloud_data_available()) {
+            // Always assume remote read in this case.
+            return _partition->start_cloud_offset();
+        }
+
         auto local_kafka_start_offset = _translator->from_log_offset(
           _partition->start_offset());
         if (
@@ -49,10 +56,25 @@ class replicated_partition final : public kafka::partition_proxy::impl {
     }
 
     model::offset high_watermark() const final {
+        if (_partition->is_read_replica_mode_enabled()) {
+            if (_partition->cloud_data_available()) {
+                return model::next_offset(_partition->last_cloud_offset());
+            } else {
+                return model::offset(0);
+            }
+        }
         return _translator->from_log_offset(_partition->high_watermark());
     }
 
     model::offset last_stable_offset() const final {
+        if (_partition->is_read_replica_mode_enabled()) {
+            if (_partition->cloud_data_available()) {
+                // There is no difference between HWM and LO in this mode
+                return model::next_offset(_partition->last_cloud_offset());
+            } else {
+                return model::offset(0);
+            }
+        }
         return _translator->from_log_offset(_partition->last_stable_offset());
     }
 

From 1f9c8ec78f973d32116ad3698a944603376f0931 Mon Sep 17 00:00:00 2001
From: Elena Anyusheva <lena@vectorized.io>
Date: Mon, 27 Jun 2022 14:44:06 +0200
Subject: [PATCH 042/201] kafka: create cloud_reader for read_replica

cloud_reader will be used for read replica topics.

Co-authored-by: Eugene Lazin <evgeny@vectorized.io>
---
 src/v/kafka/server/replicated_partition.cc | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/v/kafka/server/replicated_partition.cc b/src/v/kafka/server/replicated_partition.cc
index faf03c03d452a..7955092cc77a3 100644
--- a/src/v/kafka/server/replicated_partition.cc
+++ b/src/v/kafka/server/replicated_partition.cc
@@ -38,6 +38,15 @@ replicated_partition::replicated_partition(
 ss::future<storage::translating_reader> replicated_partition::make_reader(
   storage::log_reader_config cfg,
   std::optional<model::timeout_clock::time_point> deadline) {
+    if (
+      _partition->is_read_replica_mode_enabled()
+      && _partition->cloud_data_available()) {
+        // No need to translate the offsets in this case since all fetch
+        // requests in read replica are served via remote_partition which
+        // does its own translation.
+        co_return co_await _partition->make_cloud_reader(cfg);
+    }
+
     auto local_kafka_start_offset = _translator->from_log_offset(
       _partition->start_offset());
     if (

From 283b3abac75586998a72b0450e10b8f8e27eb0dc Mon Sep 17 00:00:00 2001
From: Elena Anyusheva <lena@vectorized.io>
Date: Mon, 27 Jun 2022 16:13:15 +0200
Subject: [PATCH 043/201] archival: add ntp_archiver for read_replica topic

read replica topic will use ntp_archiver to periodically pull remote
manifest and sync local manifest to it.

Co-authored-by: Eugene Lazin <evgeny@vectorized.io>
---
 src/v/archival/service.cc | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/v/archival/service.cc b/src/v/archival/service.cc
index b0d474c5a089d..6fc4179eff32b 100644
--- a/src/v/archival/service.cc
+++ b/src/v/archival/service.cc
@@ -284,9 +284,13 @@ scheduler_service_impl::create_archivers(std::vector<model::ntp> to_create) {
       std::move(to_create), concurrency, [this](const model::ntp& ntp) {
           auto log = _partition_manager.local().log(ntp);
           auto part = _partition_manager.local().get(ntp);
-          if (log.has_value() && part && part->is_elected_leader()
-              && (part->get_ntp_config().is_archival_enabled()
-                  || config::shard_local_cfg().cloud_storage_enable_remote_write())) {
+          if (!log.has_value() || !part || !part->is_elected_leader()) {
+              return ss::now();
+          }
+          if (
+            part->get_ntp_config().is_archival_enabled()
+            || part->get_ntp_config().is_read_replica_mode_enabled()
+            || config::shard_local_cfg().cloud_storage_enable_remote_write()) {
               auto archiver = ss::make_lw_shared<ntp_archiver>(
                 log->config(),
                 _partition_manager.local(),

From f27446d15dc76282ef3880d3d93d24992b2b41eb Mon Sep 17 00:00:00 2001
From: Elena Anyusheva <lena@vectorized.io>
Date: Tue, 5 Jul 2022 17:06:33 +0200
Subject: [PATCH 044/201] config, archival: add config for manifest sync
 timeout

---
 src/v/archival/ntp_archiver_service.cc | 3 +++
 src/v/archival/ntp_archiver_service.h  | 1 +
 src/v/config/configuration.cc          | 7 +++++++
 src/v/config/configuration.h           | 2 ++
 4 files changed, 13 insertions(+)

diff --git a/src/v/archival/ntp_archiver_service.cc b/src/v/archival/ntp_archiver_service.cc
index 7c8c301914b19..4587d42f907ff 100644
--- a/src/v/archival/ntp_archiver_service.cc
+++ b/src/v/archival/ntp_archiver_service.cc
@@ -67,6 +67,9 @@ ntp_archiver::ntp_archiver(
   , _manifest_upload_timeout(conf.manifest_upload_timeout)
   , _upload_loop_initial_backoff(conf.upload_loop_initial_backoff)
   , _upload_loop_max_backoff(conf.upload_loop_max_backoff)
+  , _sync_manifest_timeout(
+      config::shard_local_cfg()
+        .cloud_storage_readreplica_manifest_sync_timeout_ms.bind())
   , _upload_sg(conf.upload_scheduling_group)
   , _io_priority(conf.upload_io_priority) {
     vassert(
diff --git a/src/v/archival/ntp_archiver_service.h b/src/v/archival/ntp_archiver_service.h
index 1cad7145c8227..af28ff17f2a54 100644
--- a/src/v/archival/ntp_archiver_service.h
+++ b/src/v/archival/ntp_archiver_service.h
@@ -190,6 +190,7 @@ class ntp_archiver {
     ss::semaphore _mutex{1};
     ss::lowres_clock::duration _upload_loop_initial_backoff;
     ss::lowres_clock::duration _upload_loop_max_backoff;
+    config::binding<std::chrono::milliseconds> _sync_manifest_timeout;
     simple_time_jitter<ss::lowres_clock> _backoff_jitter{100ms};
     size_t _concurrency{4};
     ss::lowres_clock::time_point _last_upload_time;
diff --git a/src/v/config/configuration.cc b/src/v/config/configuration.cc
index bce4f0f6891bc..a6b2c0274dbc8 100644
--- a/src/v/config/configuration.cc
+++ b/src/v/config/configuration.cc
@@ -1029,6 +1029,13 @@ configuration::configuration()
       "remote storage (sec)",
       {.visibility = visibility::tunable},
       std::nullopt)
+  , cloud_storage_readreplica_manifest_sync_timeout_ms(
+      *this,
+      "cloud_storage_readreplica_manifest_sync_timeout_ms",
+      "Timeout to check if new data is available for partition in S3 for read "
+      "replica",
+      {.needs_restart = needs_restart::no, .visibility = visibility::tunable},
+      30s)
   , cloud_storage_upload_ctrl_update_interval_ms(
       *this,
       "cloud_storage_upload_ctrl_update_interval_ms",
diff --git a/src/v/config/configuration.h b/src/v/config/configuration.h
index 8cb40c1c28d2c..10fa20a4e7d00 100644
--- a/src/v/config/configuration.h
+++ b/src/v/config/configuration.h
@@ -218,6 +218,8 @@ struct configuration final : public config_store {
       cloud_storage_max_connection_idle_time_ms;
     property<std::optional<std::chrono::seconds>>
       cloud_storage_segment_max_upload_interval_sec;
+    property<std::chrono::milliseconds>
+      cloud_storage_readreplica_manifest_sync_timeout_ms;
 
     // Archival upload controller
     property<std::chrono::milliseconds>

From d76975af617eea02a161ba59010d3f9f518e4736 Mon Sep 17 00:00:00 2001
From: Elena Anyusheva <lena@vectorized.io>
Date: Mon, 27 Jun 2022 18:50:39 +0200
Subject: [PATCH 045/201] archival: add sync_manifest loop for read replica

ntp_archiver_service will periodically download the partition manifest
and sync the local manifest to it.

Co-authored-by: Eugene Lazin <evgeny@vectorized.io>
---
 src/v/archival/ntp_archiver_service.cc | 84 +++++++++++++++++++++++
 src/v/archival/ntp_archiver_service.h  | 14 ++++
 src/v/archival/service.cc              | 94 ++++++++++++++------------
 3 files changed, 150 insertions(+), 42 deletions(-)

diff --git a/src/v/archival/ntp_archiver_service.cc b/src/v/archival/ntp_archiver_service.cc
index 4587d42f907ff..bdb481ae53430 100644
--- a/src/v/archival/ntp_archiver_service.cc
+++ b/src/v/archival/ntp_archiver_service.cc
@@ -84,6 +84,30 @@ ntp_archiver::ntp_archiver(
       _start_term);
 }
 
+void ntp_archiver::run_sync_manifest_loop() {
+    vassert(
+      !_sync_manifest_loop_started,
+      "sync manifest loop for ntp {} already started",
+      _ntp);
+    _sync_manifest_loop_started = true;
+
+    // NOTE: not using ssx::spawn_with_gate_then here because we want to log
+    // inside the gate (so that _rtclog is guaranteed to be alive).
+    ssx::spawn_with_gate(_gate, [this] {
+        return sync_manifest_loop()
+          .handle_exception_type([](const ss::abort_requested_exception&) {})
+          .handle_exception_type([](const ss::sleep_aborted&) {})
+          .handle_exception_type([](const ss::gate_closed_exception&) {})
+          .handle_exception([this](std::exception_ptr e) {
+              vlog(_rtclog.error, "sync manifest loop error: {}", e);
+          })
+          .finally([this] {
+              vlog(_rtclog.debug, "sync manifest loop stopped");
+              _sync_manifest_loop_stopped = true;
+          });
+    });
+}
+
 void ntp_archiver::run_upload_loop() {
     vassert(
       !_upload_loop_started, "upload loop for ntp {} already started", _ntp);
@@ -153,12 +177,72 @@ ss::future<> ntp_archiver::upload_loop() {
     }
 }
 
+ss::future<> ntp_archiver::sync_manifest_loop() {
+    while (sync_manifest_loop_can_continue()) {
+        cloud_storage::download_result result = co_await sync_manifest();
+
+        if (result != cloud_storage::download_result::success) {
+            // The logic in class `remote` already does retries: if we get here,
+            // it means the download failed after several retries, indicating
+            // something non-transient may be wrong. Hence error severity.
+            vlog(
+              _rtclog.error,
+              "Failed to download manifest {}",
+              _manifest.get_manifest_path());
+        } else {
+            vlog(
+              _rtclog.debug,
+              "Successfully downloaded manifest {}",
+              _manifest.get_manifest_path());
+        }
+        co_await ss::sleep_abortable(_sync_manifest_timeout(), _as);
+    }
+}
+
+ss::future<cloud_storage::download_result> ntp_archiver::sync_manifest() {
+    cloud_storage::download_result r = co_await download_manifest();
+    if (r == cloud_storage::download_result::success) {
+        vlog(_rtclog.debug, "Downloading manifest in read-replica mode");
+        if (_partition->archival_meta_stm()) {
+            vlog(
+              _rtclog.debug,
+              "Updating the archival_meta_stm in read-replica mode");
+            auto deadline = ss::lowres_clock::now() + _manifest_upload_timeout;
+            auto error = co_await _partition->archival_meta_stm()->add_segments(
+              _manifest, deadline, _as);
+            if (
+              error != cluster::errc::success
+              && error != cluster::errc::not_leader) {
+                vlog(
+                  _rtclog.warn,
+                  "archival metadata STM update failed: {}",
+                  error);
+            }
+            auto last_offset
+              = _partition->archival_meta_stm()->manifest().get_last_offset();
+            vlog(_rtclog.debug, "manifest last_offset: {}", last_offset);
+        }
+    } else {
+        vlog(
+          _rtclog.error,
+          "Failed to download partition manifest in read-replica mode");
+    }
+    co_return r;
+}
+
 bool ntp_archiver::upload_loop_can_continue() const {
     return !_as.abort_requested() && !_gate.is_closed()
            && _partition->is_elected_leader()
            && _partition->term() == _start_term;
 }
 
+bool ntp_archiver::sync_manifest_loop_can_continue() const {
+    // todo: think about it
+    return !_as.abort_requested() && !_gate.is_closed()
+           && _partition->is_elected_leader()
+           && _partition->term() == _start_term;
+}
+
 ss::future<> ntp_archiver::stop() {
     _as.request_abort();
     return _gate.close();
diff --git a/src/v/archival/ntp_archiver_service.h b/src/v/archival/ntp_archiver_service.h
index af28ff17f2a54..ca45e7c5868a0 100644
--- a/src/v/archival/ntp_archiver_service.h
+++ b/src/v/archival/ntp_archiver_service.h
@@ -73,6 +73,8 @@ class ntp_archiver {
     /// storage. Can be started only once.
     void run_upload_loop();
 
+    void run_sync_manifest_loop();
+
     /// Stop archiver.
     ///
     /// \return future that will become ready when all async operation will be
@@ -80,6 +82,9 @@ class ntp_archiver {
     ss::future<> stop();
 
     bool upload_loop_stopped() const { return _upload_loop_stopped; }
+    bool sync_manifest_loop_stopped() const {
+        return _sync_manifest_loop_stopped;
+    }
 
     /// Get NTP
     const model::ntp& get_ntp() const;
@@ -112,6 +117,8 @@ class ntp_archiver {
     ss::future<batch_result> upload_next_candidates(
       std::optional<model::offset> last_stable_offset_override = std::nullopt);
 
+    ss::future<cloud_storage::download_result> sync_manifest();
+
     uint64_t estimate_backlog_size();
 
     /// \brief Probe remote storage and truncate the manifest if needed
@@ -166,7 +173,11 @@ class ntp_archiver {
     /// Launch the upload loop fiber.
     ss::future<> upload_loop();
 
+    /// Launch the sync manifest loop fiber.
+    ss::future<> sync_manifest_loop();
+
     bool upload_loop_can_continue() const;
+    bool sync_manifest_loop_can_continue() const;
 
     ntp_level_probe _probe;
     model::ntp _ntp;
@@ -198,6 +209,9 @@ class ntp_archiver {
     ss::io_priority_class _io_priority;
     bool _upload_loop_started = false;
     bool _upload_loop_stopped = false;
+
+    bool _sync_manifest_loop_started = false;
+    bool _sync_manifest_loop_stopped = false;
 };
 
 } // namespace archival
diff --git a/src/v/archival/service.cc b/src/v/archival/service.cc
index 6fc4179eff32b..6060f5d0532f3 100644
--- a/src/v/archival/service.cc
+++ b/src/v/archival/service.cc
@@ -231,48 +231,56 @@ ss::future<> scheduler_service_impl::add_ntp_archiver(
     if (_gate.is_closed()) {
         return ss::now();
     }
-    return archiver->download_manifest().then(
-      [this, archiver](cloud_storage::download_result result) {
-          auto ntp = archiver->get_ntp();
-          switch (result) {
-          case cloud_storage::download_result::success:
-              vlog(
-                _rtclog.info,
-                "Found manifest for partition {}",
-                archiver->get_ntp());
-              _probe.start_archiving_ntp();
-
-              _archivers.emplace(archiver->get_ntp(), archiver);
-              archiver->run_upload_loop();
-
-              return ss::now();
-          case cloud_storage::download_result::notfound:
-              vlog(
-                _rtclog.info,
-                "Start archiving new partition {}",
-                archiver->get_ntp());
-              // Start topic manifest upload
-              // asynchronously
-              if (ntp.tp.partition == 0) {
-                  // Upload manifest once per topic. GCS has strict
-                  // limits for single object updates.
-                  (void)upload_topic_manifest(
-                    model::topic_namespace(ntp.ns, ntp.tp.topic),
-                    archiver->get_revision_id());
-              }
-              _probe.start_archiving_ntp();
-
-              _archivers.emplace(archiver->get_ntp(), archiver);
-              archiver->run_upload_loop();
+    return archiver->download_manifest().then([this, archiver](
+                                                cloud_storage::download_result
+                                                  result) {
+        auto ntp = archiver->get_ntp();
+        auto part = _partition_manager.local().get(ntp);
+        switch (result) {
+        case cloud_storage::download_result::success:
+            vlog(_rtclog.info, "Found manifest for partition {}", ntp);
+
+            if (part->get_ntp_config().is_read_replica_mode_enabled()) {
+                archiver->run_sync_manifest_loop();
+            } else {
+                _probe.start_archiving_ntp();
+                archiver->run_upload_loop();
+            }
+            _archivers.emplace(ntp, archiver);
+
+            return ss::now();
+        case cloud_storage::download_result::notfound:
+            if (part->get_ntp_config().is_read_replica_mode_enabled()) {
+                vlog(
+                  _rtclog.info,
+                  "Couldn't download manifest for partition {} in read replica",
+                  ntp);
+                archiver->run_sync_manifest_loop();
+            } else {
+                vlog(_rtclog.info, "Start archiving new partition {}", ntp);
+                // Start topic manifest upload
+                // asynchronously
+                if (ntp.tp.partition == 0) {
+                    // Upload manifest once per topic. GCS has strict
+                    // limits for single object updates.
+                    (void)upload_topic_manifest(
+                      model::topic_namespace(ntp.ns, ntp.tp.topic),
+                      archiver->get_revision_id());
+                }
+                _probe.start_archiving_ntp();
+
+                archiver->run_upload_loop();
+            }
+            _archivers.emplace(ntp, archiver);
 
-              return ss::now();
-          case cloud_storage::download_result::failed:
-          case cloud_storage::download_result::timedout:
-              vlog(_rtclog.warn, "Manifest download failed");
-              return ss::make_exception_future<>(ss::timed_out_error());
-          }
-          return ss::now();
-      });
+            return ss::now();
+        case cloud_storage::download_result::failed:
+        case cloud_storage::download_result::timedout:
+            vlog(_rtclog.warn, "Manifest download failed");
+            return ss::make_exception_future<>(ss::timed_out_error());
+        }
+        return ss::now();
+    });
 }
 
 ss::future<>
@@ -329,7 +337,9 @@ ss::future<> scheduler_service_impl::reconcile_archivers() {
     // find archivers that have already stopped
     for (const auto& [ntp, archiver] : _archivers) {
         auto p = pm.get(ntp);
-        if (!p || archiver->upload_loop_stopped()) {
+        // add_ntp_archiver starts only one of the two loops per archiver, so
+        // either flag being set means this archiver has finished.
+        if (!p || archiver->upload_loop_stopped() || archiver->sync_manifest_loop_stopped()) {
             to_remove.push_back(ntp);
         }
     }

From 4fae430d44fc69699516dc2265446917ba78e106 Mon Sep 17 00:00:00 2001
From: Elena Anyusheva <lena@vectorized.io>
Date: Fri, 8 Jul 2022 15:54:01 +0200
Subject: [PATCH 046/201] ducky: use si_settings in e2e test

---
 tests/rptest/tests/end_to_end.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/rptest/tests/end_to_end.py b/tests/rptest/tests/end_to_end.py
index ecdda96790149..594be6b244d3b 100644
--- a/tests/rptest/tests/end_to_end.py
+++ b/tests/rptest/tests/end_to_end.py
@@ -86,7 +86,8 @@ def start_redpanda(self,
         self.redpanda = RedpandaService(self.test_context,
                                         num_nodes,
                                         extra_rp_conf=self._extra_rp_conf,
-                                        extra_node_conf=self._extra_node_conf)
+                                        extra_node_conf=self._extra_node_conf,
+                                        si_settings=self.si_settings)
         self.redpanda.start()
         self._client = DefaultClient(self.redpanda)
 
@@ -153,8 +154,8 @@ def has_finished_consuming():
 
         wait_until(has_finished_consuming,
                    timeout_sec=timeout_sec,
-                   err_msg="Consumer failed to consume up to offsets %s after waiting %ds." %\
-                   (str(last_acked_offsets), timeout_sec))
+                   err_msg="Consumer failed to consume up to offsets %s after waiting %ds, last consumed offsets: %s." %\
+                   (str(last_acked_offsets), timeout_sec, list(self.last_consumed_offsets)))
 
     def _collect_all_logs(self):
         for s in self.test_context.services:

From 4afe69eac99187c130d5f7acba6457a52bdff957 Mon Sep 17 00:00:00 2001
From: Elena Anyusheva <lena@vectorized.io>
Date: Fri, 8 Jul 2022 17:28:17 +0200
Subject: [PATCH 047/201] ducky: add
 cloud_storage_readreplica_manifest_sync_timeout_ms

Add cloud_storage_readreplica_manifest_sync_timeout_ms to SISettings.
This config controls how often a read replica topic pulls data from S3.
---
 tests/rptest/services/redpanda.py | 39 +++++++++++++++++--------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/tests/rptest/services/redpanda.py b/tests/rptest/services/redpanda.py
index ecbafdd30c0b5..864baed73c64c 100644
--- a/tests/rptest/services/redpanda.py
+++ b/tests/rptest/services/redpanda.py
@@ -222,23 +222,24 @@ class SISettings:
     GLOBAL_S3_REGION_KEY = "s3_region"
 
     def __init__(
-            self,
-            *,
-            log_segment_size: int = 16 * 1000000,
-            cloud_storage_access_key: str = 'panda-user',
-            cloud_storage_secret_key: str = 'panda-secret',
-            cloud_storage_region: str = 'panda-region',
-            cloud_storage_bucket: Optional[str] = None,
-            cloud_storage_api_endpoint: str = 'minio-s3',
-            cloud_storage_api_endpoint_port: int = 9000,
-            cloud_storage_cache_size: int = 160 * 1000000,
-            cloud_storage_enable_remote_read: bool = True,
-            cloud_storage_enable_remote_write: bool = True,
-            cloud_storage_reconciliation_interval_ms: Optional[int] = None,
-            cloud_storage_max_connections: Optional[int] = None,
-            cloud_storage_disable_tls: bool = True,
-            cloud_storage_segment_max_upload_interval_sec: Optional[int] = None
-    ):
+        self,
+        *,
+        log_segment_size: int = 16 * 1000000,
+        cloud_storage_access_key: str = 'panda-user',
+        cloud_storage_secret_key: str = 'panda-secret',
+        cloud_storage_region: str = 'panda-region',
+        cloud_storage_bucket: Optional[str] = None,
+        cloud_storage_api_endpoint: str = 'minio-s3',
+        cloud_storage_api_endpoint_port: int = 9000,
+        cloud_storage_cache_size: int = 160 * 1000000,
+        cloud_storage_enable_remote_read: bool = True,
+        cloud_storage_enable_remote_write: bool = True,
+        cloud_storage_reconciliation_interval_ms: Optional[int] = None,
+        cloud_storage_max_connections: Optional[int] = None,
+        cloud_storage_disable_tls: bool = True,
+        cloud_storage_segment_max_upload_interval_sec: Optional[int] = None,
+        cloud_storage_readreplica_manifest_sync_timeout_ms: Optional[
+            int] = None):
         self.log_segment_size = log_segment_size
         self.cloud_storage_access_key = cloud_storage_access_key
         self.cloud_storage_secret_key = cloud_storage_secret_key
@@ -253,6 +254,7 @@ def __init__(
         self.cloud_storage_max_connections = cloud_storage_max_connections
         self.cloud_storage_disable_tls = cloud_storage_disable_tls
         self.cloud_storage_segment_max_upload_interval_sec = cloud_storage_segment_max_upload_interval_sec
+        self.cloud_storage_readreplica_manifest_sync_timeout_ms = cloud_storage_readreplica_manifest_sync_timeout_ms
         self.endpoint_url = f'http://{self.cloud_storage_api_endpoint}:{self.cloud_storage_api_endpoint_port}'
 
     def load_context(self, logger, test_context):
@@ -309,6 +311,9 @@ def update_rp_conf(self, conf) -> dict[str, Any]:
         if self.cloud_storage_max_connections:
             conf[
                 'cloud_storage_max_connections'] = self.cloud_storage_max_connections
+        if self.cloud_storage_readreplica_manifest_sync_timeout_ms:
+            conf[
+                'cloud_storage_readreplica_manifest_sync_timeout_ms'] = self.cloud_storage_readreplica_manifest_sync_timeout_ms
         if self.cloud_storage_segment_max_upload_interval_sec:
             conf[
                 'cloud_storage_segment_max_upload_interval_sec'] = self.cloud_storage_segment_max_upload_interval_sec

From 4b42c7eca8a8bfa0e17cd72a694ce982584626d4 Mon Sep 17 00:00:00 2001
From: Denis Rystsov <denis@vectorized.io>
Date: Fri, 1 Jul 2022 22:54:49 -0700
Subject: [PATCH 048/201] rm_stm: shift offset translation to rm_stm

Cache kafka offsets (rather than log offsets) in the seq cache to avoid
out-of-range errors when translating offsets beyond the eviction point.
---
 src/v/cluster/partition.cc               |  10 +-
 src/v/cluster/rm_stm.cc                  | 118 +++++++++++++++--------
 src/v/cluster/rm_stm.h                   |  50 ++++++----
 src/v/cluster/tests/idempotency_tests.cc |   2 +-
 4 files changed, 118 insertions(+), 62 deletions(-)

diff --git a/src/v/cluster/partition.cc b/src/v/cluster/partition.cc
index 4004498c2d47d..0547135d06177 100644
--- a/src/v/cluster/partition.cc
+++ b/src/v/cluster/partition.cc
@@ -175,14 +175,12 @@ kafka_stages partition::replicate_in_stages(
         }
     }
 
-    ss::lw_shared_ptr<raft::replicate_stages> res;
     if (_rm_stm) {
-        res = _rm_stm->replicate_in_stages(bid, std::move(r), opts);
-    } else {
-        res = _raft->replicate_in_stages(std::move(r), opts);
+        return _rm_stm->replicate_in_stages(bid, std::move(r), opts);
     }
 
-    auto replicate_finished = res->replicate_finished.then(
+    auto res = _raft->replicate_in_stages(std::move(r), opts);
+    auto replicate_finished = res.replicate_finished.then(
       [this](result<raft::replicate_result> r) {
           if (!r) {
               return ret_t(r.error());
@@ -193,7 +191,7 @@ kafka_stages partition::replicate_in_stages(
           return ret_t(kafka_result{new_offset});
       });
     return kafka_stages(
-      std::move(res->request_enqueued), std::move(replicate_finished));
+      std::move(res.request_enqueued), std::move(replicate_finished));
 }
 
 ss::future<> partition::start() {
diff --git a/src/v/cluster/rm_stm.cc b/src/v/cluster/rm_stm.cc
index f1435e79a4975..328ebfca2b670 100644
--- a/src/v/cluster/rm_stm.cc
+++ b/src/v/cluster/rm_stm.cc
@@ -785,7 +785,7 @@ ss::future<tx_errc> rm_stm::do_abort_tx(
     co_return tx_errc::none;
 }
 
-raft::replicate_stages rm_stm::replicate_in_stages(
+kafka_stages rm_stm::replicate_in_stages(
   model::batch_identity bid,
   model::record_batch_reader r,
   raft::replicate_options opts) {
@@ -804,10 +804,10 @@ raft::replicate_stages rm_stm::replicate_in_stages(
                 enqueued->set_value();
             }
         });
-    return raft::replicate_stages(std::move(f), std::move(replicate_finished));
+    return kafka_stages(std::move(f), std::move(replicate_finished));
 }
 
-ss::future<result<raft::replicate_result>> rm_stm::replicate(
+ss::future<result<kafka_result>> rm_stm::replicate(
   model::batch_identity bid,
   model::record_batch_reader r,
   raft::replicate_options opts) {
@@ -824,7 +824,7 @@ rm_stm::transfer_leadership(std::optional<model::node_id> target) {
       });
 }
 
-ss::future<result<raft::replicate_result>> rm_stm::do_replicate(
+ss::future<result<kafka_result>> rm_stm::do_replicate(
   model::batch_identity bid,
   model::record_batch_reader b,
   raft::replicate_options opts,
@@ -854,6 +854,11 @@ ss::future<> rm_stm::stop() {
     return raft::state_machine::stop();
 }
 
+ss::future<> rm_stm::start() {
+    _translator = _c->get_offset_translator_state();
+    return persisted_stm::start();
+}
+
 rm_stm::transaction_info::status_t
 rm_stm::get_tx_status(model::producer_identity pid) const {
     if (_mem_state.preparing.contains(pid)) {
@@ -947,7 +952,7 @@ bool rm_stm::check_seq(model::batch_identity bid) {
         return false;
     }
 
-    seq.update(bid.last_seq, model::offset{-1});
+    seq.update(bid.last_seq, kafka::offset{-1});
 
     seq.pid = bid.pid;
     seq.last_write_timestamp = last_write_timestamp;
@@ -957,7 +962,7 @@ bool rm_stm::check_seq(model::batch_identity bid) {
     return true;
 }
 
-std::optional<model::offset>
+std::optional<kafka::offset>
 rm_stm::known_seq(model::batch_identity bid) const {
     auto pid_seq = _log_state.seq_table.find(bid.pid);
     if (pid_seq == _log_state.seq_table.end()) {
@@ -982,7 +987,7 @@ std::optional<int32_t> rm_stm::tail_seq(model::producer_identity pid) const {
     return pid_seq->second.seq;
 }
 
-void rm_stm::set_seq(model::batch_identity bid, model::offset last_offset) {
+void rm_stm::set_seq(model::batch_identity bid, kafka::offset last_offset) {
     auto pid_seq = _log_state.seq_table.find(bid.pid);
     if (pid_seq != _log_state.seq_table.end()) {
         if (pid_seq->second.seq == bid.last_seq) {
@@ -995,14 +1000,14 @@ void rm_stm::reset_seq(model::batch_identity bid) {
     _log_state.seq_table.erase(bid.pid);
     auto& seq = _log_state.seq_table[bid.pid];
     seq.seq = bid.last_seq;
-    seq.last_offset = model::offset{-1};
+    seq.last_offset = kafka::offset{-1};
     seq.pid = bid.pid;
     seq.last_write_timestamp = model::timestamp::now().value();
     _oldest_session = std::min(
       _oldest_session, model::timestamp(seq.last_write_timestamp));
 }
 
-ss::future<result<raft::replicate_result>>
+ss::future<result<kafka_result>>
 rm_stm::replicate_tx(model::batch_identity bid, model::record_batch_reader br) {
     if (!check_tx_permitted()) {
         co_return errc::generic_tx_error;
@@ -1056,7 +1061,7 @@ rm_stm::replicate_tx(model::batch_identity bid, model::record_batch_reader br) {
         // this isn't the first attempt in the tx we should try dedupe
         auto cached_offset = known_seq(bid);
         if (cached_offset) {
-            if (cached_offset.value() < model::offset{0}) {
+            if (cached_offset.value() < kafka::offset{0}) {
                 vlog(
                   clusterlog.warn,
                   "Status of the original attempt is unknown (still is "
@@ -1070,8 +1075,7 @@ rm_stm::replicate_tx(model::batch_identity bid, model::record_batch_reader br) {
                 // to propagate it to the app layer
                 co_return errc::generic_tx_error;
             }
-            co_return raft::replicate_result{
-              .last_offset = cached_offset.value()};
+            co_return kafka_result{.last_offset = cached_offset.value()};
         }
 
         if (!check_seq(bid)) {
@@ -1107,26 +1111,28 @@ rm_stm::replicate_tx(model::batch_identity bid, model::record_batch_reader br) {
     expiration_it->second.last_update = clock_type::now();
     expiration_it->second.is_expiration_requested = false;
 
-    auto replicated = r.value();
+    auto old_offset = r.value().last_offset;
+    auto new_offset = from_log_offset(old_offset);
 
-    set_seq(bid, replicated.last_offset);
+    set_seq(bid, new_offset);
 
-    auto last_offset = model::offset(replicated.last_offset());
     if (!_mem_state.tx_start.contains(bid.pid)) {
-        auto base_offset = model::offset(
-          last_offset() - (bid.record_count - 1));
+        auto base_offset = model::offset(old_offset() - (bid.record_count - 1));
         _mem_state.tx_start.emplace(bid.pid, base_offset);
         _mem_state.tx_starts.insert(base_offset);
         _mem_state.estimated.erase(bid.pid);
     }
-    co_return replicated;
+
+    co_return kafka_result{.last_offset = new_offset};
 }
 
-ss::future<result<raft::replicate_result>> rm_stm::replicate_seq(
+ss::future<result<kafka_result>> rm_stm::replicate_seq(
   model::batch_identity bid,
   model::record_batch_reader br,
   raft::replicate_options opts,
   ss::lw_shared_ptr<available_promise<>> enqueued) {
+    using ret_t = result<kafka_result>;
+
     if (!co_await sync(_sync_timeout)) {
         // it's ok not to set enqueued on early return because
         // the safety check in replicate_in_stages sets it automatically
@@ -1185,7 +1191,7 @@ ss::future<result<raft::replicate_result>> rm_stm::replicate_seq(
     // checking among the responded requests
     auto cached_offset = known_seq(bid);
     if (cached_offset) {
-        co_return raft::replicate_result{.last_offset = cached_offset.value()};
+        co_return kafka_result{.last_offset = cached_offset.value()};
     }
 
     // checking if the request is already being processed
@@ -1193,8 +1199,8 @@ ss::future<result<raft::replicate_result>> rm_stm::replicate_seq(
         if (inflight->last_seq == bid.last_seq && inflight->is_processing) {
             // found an inflight request, parking the current request
             // until the former is resolved
-            auto promise = ss::make_lw_shared<
-              available_promise<result<raft::replicate_result>>>();
+            auto promise
+              = ss::make_lw_shared<available_promise<result<kafka_result>>>();
             inflight->parked.push_back(promise);
             u.return_all();
             co_return co_await promise->get_future();
@@ -1279,20 +1285,26 @@ ss::future<result<raft::replicate_result>> rm_stm::replicate_seq(
     // we don't need session->lock because we never interleave
     // access to is_processing and offset with sync point (await)
     request->is_processing = false;
-    request->r = r;
+    if (r) {
+        auto old_offset = r.value().last_offset;
+        auto new_offset = from_log_offset(old_offset);
+        request->r = ret_t(kafka_result{new_offset});
+    } else {
+        request->r = ret_t(r.error());
+    }
     for (auto& pending : request->parked) {
-        pending->set_value(r);
+        pending->set_value(request->r);
     }
     request->parked.clear();
 
-    if (!r) {
+    if (!request->r) {
         // if r was failed at the consensus level (not because has_failed)
         // it should guarantee that all follow up replication requests fail
         // too but just in case stepping down to minimize the risk
         if (_c->is_leader() && _c->term() == synced_term) {
             co_await _c->step_down();
         }
-        co_return r;
+        co_return request->r;
     }
 
     // requests get into session->cache in seq order so when we iterate
@@ -1324,13 +1336,15 @@ ss::future<result<raft::replicate_result>> rm_stm::replicate_seq(
         _inflight_requests.erase(bid.pid);
     }
 
-    co_return r;
+    co_return request->r;
 }
 
-ss::future<result<raft::replicate_result>> rm_stm::replicate_msg(
+ss::future<result<kafka_result>> rm_stm::replicate_msg(
   model::record_batch_reader br,
   raft::replicate_options opts,
   ss::lw_shared_ptr<available_promise<>> enqueued) {
+    using ret_t = result<kafka_result>;
+
     if (!co_await sync(_sync_timeout)) {
         co_return errc::not_leader;
     }
@@ -1338,7 +1352,14 @@ ss::future<result<raft::replicate_result>> rm_stm::replicate_msg(
     auto ss = _c->replicate_in_stages(_insync_term, std::move(br), opts);
     co_await std::move(ss.request_enqueued);
     enqueued->set_value();
-    co_return co_await std::move(ss.replicate_finished);
+    auto r = co_await std::move(ss.replicate_finished);
+
+    if (!r) {
+        co_return ret_t(r.error());
+    }
+    auto old_offset = r.value().last_offset;
+    auto new_offset = from_log_offset(old_offset);
+    co_return ret_t(kafka_result{new_offset});
 }
 
 model::offset rm_stm::last_stable_offset() {
@@ -1785,12 +1806,13 @@ ss::future<> rm_stm::apply_control(
 void rm_stm::apply_data(model::batch_identity bid, model::offset last_offset) {
     if (bid.has_idempotent()) {
         auto [seq_it, inserted] = _log_state.seq_table.try_emplace(bid.pid);
+        auto translated = from_log_offset(last_offset);
         if (inserted) {
             seq_it->second.pid = bid.pid;
             seq_it->second.seq = bid.last_seq;
-            seq_it->second.last_offset = last_offset;
+            seq_it->second.last_offset = translated;
         } else {
-            seq_it->second.update(bid.last_seq, last_offset);
+            seq_it->second.update(bid.last_seq, translated);
         }
         seq_it->second.last_write_timestamp = bid.first_timestamp.value();
         _oldest_session = std::min(_oldest_session, bid.first_timestamp);
@@ -1849,11 +1871,21 @@ rm_stm::apply_snapshot(stm_snapshot_header hdr, iobuf&& tx_ss_buf) {
             seq_entry seq;
             seq.pid = seq_v1.pid;
             seq.seq = seq_v1.seq;
-            seq.last_offset = seq_v1.last_offset;
+            try {
+                seq.last_offset = from_log_offset(seq_v1.last_offset);
+            } catch (...) {
+                // ignoring outside the translation range errors
+                continue;
+            }
             seq.seq_cache.reserve(seq_v1.seq_cache.size());
             for (auto& item : seq_v1.seq_cache) {
-                seq.seq_cache.push_back(
-                  seq_cache_entry{.seq = item.seq, .offset = item.offset});
+                try {
+                    seq.seq_cache.push_back(seq_cache_entry{
+                      .seq = item.seq, .offset = from_log_offset(item.offset)});
+                } catch (...) {
+                    // ignoring outside the translation range errors
+                    continue;
+                }
             }
             seq.last_write_timestamp = seq_v1.last_write_timestamp;
             data.seqs.push_back(std::move(seq));
@@ -1870,7 +1902,7 @@ rm_stm::apply_snapshot(stm_snapshot_header hdr, iobuf&& tx_ss_buf) {
             auto seq = seq_entry{
               .pid = seq_v0.pid,
               .seq = seq_v0.seq,
-              .last_offset = model::offset{-1},
+              .last_offset = kafka::offset{-1},
               .last_write_timestamp = seq_v0.last_write_timestamp};
             data.seqs.push_back(std::move(seq));
         }
@@ -1989,12 +2021,22 @@ ss::future<stm_snapshot> rm_stm::take_snapshot() {
             seq_entry_v1 seqs;
             seqs.pid = entry.pid;
             seqs.seq = entry.seq;
-            seqs.last_offset = entry.last_offset;
+            try {
+                seqs.last_offset = to_log_offset(entry.last_offset);
+            } catch (...) {
+                // ignoring outside the translation range errors
+                continue;
+            }
             seqs.last_write_timestamp = entry.last_write_timestamp;
             seqs.seq_cache.reserve(seqs.seq_cache.size());
             for (auto& item : entry.seq_cache) {
-                seqs.seq_cache.push_back(
-                  seq_cache_entry_v1{.seq = item.seq, .offset = item.offset});
+                try {
+                    seqs.seq_cache.push_back(seq_cache_entry_v1{
+                      .seq = item.seq, .offset = to_log_offset(item.offset)});
+                } catch (...) {
+                    // ignoring outside the translation range errors
+                    continue;
+                }
             }
             tx_ss.seqs.push_back(std::move(seqs));
         }
diff --git a/src/v/cluster/rm_stm.h b/src/v/cluster/rm_stm.h
index fcb0ea4b46563..0be0ae8915982 100644
--- a/src/v/cluster/rm_stm.h
+++ b/src/v/cluster/rm_stm.h
@@ -21,6 +21,7 @@
 #include "raft/logger.h"
 #include "raft/state_machine.h"
 #include "raft/types.h"
+#include "storage/offset_translator_state.h"
 #include "storage/snapshot.h"
 #include "utils/available_promise.h"
 #include "utils/expiring_promise.h"
@@ -74,14 +75,14 @@ class rm_stm final : public persisted_stm {
 
     struct seq_cache_entry {
         int32_t seq{-1};
-        model::offset offset;
+        kafka::offset offset;
     };
 
     struct seq_entry {
         static const int seq_cache_size = 5;
         model::producer_identity pid;
         int32_t seq{-1};
-        model::offset last_offset{-1};
+        kafka::offset last_offset{-1};
         ss::circular_buffer<seq_cache_entry> seq_cache;
         model::timestamp::type last_write_timestamp;
 
@@ -99,7 +100,7 @@ class rm_stm final : public persisted_stm {
             return ret;
         }
 
-        void update(int32_t new_seq, model::offset new_offset) {
+        void update(int32_t new_seq, kafka::offset new_offset) {
             if (new_seq < seq) {
                 return;
             }
@@ -109,7 +110,7 @@ class rm_stm final : public persisted_stm {
                 return;
             }
 
-            if (seq >= 0 && last_offset >= model::offset{0}) {
+            if (seq >= 0 && last_offset >= kafka::offset{0}) {
                 auto entry = seq_cache_entry{.seq = seq, .offset = last_offset};
                 seq_cache.push_back(entry);
                 while (seq_cache.size() >= seq_entry::seq_cache_size) {
@@ -169,12 +170,12 @@ class rm_stm final : public persisted_stm {
     ss::future<std::vector<rm_stm::tx_range>>
       aborted_transactions(model::offset, model::offset);
 
-    raft::replicate_stages replicate_in_stages(
+    kafka_stages replicate_in_stages(
       model::batch_identity,
       model::record_batch_reader,
       raft::replicate_options);
 
-    ss::future<result<raft::replicate_result>> replicate(
+    ss::future<result<kafka_result>> replicate(
       model::batch_identity,
       model::record_batch_reader,
       raft::replicate_options);
@@ -184,6 +185,8 @@ class rm_stm final : public persisted_stm {
 
     ss::future<> stop() override;
 
+    ss::future<> start() override;
+
     void testing_only_disable_auto_abort() { _is_autoabort_enabled = false; }
 
     void testing_only_enable_transactions() { _is_tx_enabled = true; }
@@ -273,27 +276,27 @@ class rm_stm final : public persisted_stm {
     ss::future<> save_abort_snapshot(abort_snapshot);
 
     bool check_seq(model::batch_identity);
-    std::optional<model::offset> known_seq(model::batch_identity) const;
-    void set_seq(model::batch_identity, model::offset);
+    std::optional<kafka::offset> known_seq(model::batch_identity) const;
+    void set_seq(model::batch_identity, kafka::offset);
     void reset_seq(model::batch_identity);
     std::optional<int32_t> tail_seq(model::producer_identity) const;
 
-    ss::future<result<raft::replicate_result>> do_replicate(
+    ss::future<result<kafka_result>> do_replicate(
       model::batch_identity,
       model::record_batch_reader,
       raft::replicate_options,
       ss::lw_shared_ptr<available_promise<>>);
 
-    ss::future<result<raft::replicate_result>>
+    ss::future<result<kafka_result>>
       replicate_tx(model::batch_identity, model::record_batch_reader);
 
-    ss::future<result<raft::replicate_result>> replicate_seq(
+    ss::future<result<kafka_result>> replicate_seq(
       model::batch_identity,
       model::record_batch_reader,
       raft::replicate_options,
       ss::lw_shared_ptr<available_promise<>>);
 
-    ss::future<result<raft::replicate_result>> replicate_msg(
+    ss::future<result<kafka_result>> replicate_msg(
       model::record_batch_reader,
       raft::replicate_options,
       ss::lw_shared_ptr<available_promise<>>);
@@ -423,10 +426,9 @@ class rm_stm final : public persisted_stm {
     // original request is replicated.
     struct inflight_request {
         int32_t last_seq{-1};
-        result<raft::replicate_result> r = errc::success;
+        result<kafka_result> r = errc::success;
         bool is_processing;
-        std::vector<
-          ss::lw_shared_ptr<available_promise<result<raft::replicate_result>>>>
+        std::vector<ss::lw_shared_ptr<available_promise<result<kafka_result>>>>
           parked;
     };
 
@@ -466,8 +468,7 @@ class rm_stm final : public persisted_stm {
             tail_seq = -1;
         }
 
-        std::optional<result<raft::replicate_result>>
-        known_seq(int32_t last_seq) const {
+        std::optional<result<kafka_result>> known_seq(int32_t last_seq) const {
             for (auto& seq : cache) {
                 if (seq->last_seq == last_seq && !seq->is_processing) {
                     return seq->r;
@@ -487,6 +488,20 @@ class rm_stm final : public persisted_stm {
         return lock_it->second;
     }
 
+    kafka::offset from_log_offset(model::offset old_offset) {
+        if (old_offset > model::offset{-1}) {
+            return kafka::offset(_translator->from_log_offset(old_offset)());
+        }
+        return kafka::offset(old_offset());
+    }
+
+    model::offset to_log_offset(kafka::offset new_offset) {
+        if (new_offset > model::offset{-1}) {
+            return _translator->to_log_offset(model::offset(new_offset()));
+        }
+        return model::offset(new_offset());
+    }
+
     transaction_info::status_t
     get_tx_status(model::producer_identity pid) const;
     std::optional<expiration_info>
@@ -519,6 +534,7 @@ class rm_stm final : public persisted_stm {
     bool _is_tx_enabled{false};
     ss::sharded<cluster::tx_gateway_frontend>& _tx_gateway_frontend;
     storage::snapshot_manager _abort_snapshot_mgr;
+    ss::lw_shared_ptr<const storage::offset_translator_state> _translator;
 };
 
 } // namespace cluster
diff --git a/src/v/cluster/tests/idempotency_tests.cc b/src/v/cluster/tests/idempotency_tests.cc
index 1d3187181691b..a9c1d6a3b252b 100644
--- a/src/v/cluster/tests/idempotency_tests.cc
+++ b/src/v/cluster/tests/idempotency_tests.cc
@@ -150,7 +150,7 @@ FIXTURE_TEST(test_rm_stm_caches_last_5_offsets, mux_state_machine_fixture) {
     wait_for_confirmed_leader();
     wait_for_meta_initialized();
 
-    std::vector<model::offset> offsets;
+    std::vector<kafka::offset> offsets;
 
     auto count = 5;
 

From 065fb54aeb28f38aecd67ccc2d26a6dbc5c9ef9f Mon Sep 17 00:00:00 2001
From: Denis Rystsov <denis@vectorized.io>
Date: Mon, 4 Jul 2022 19:16:14 -0700
Subject: [PATCH 049/201] rm_stm: remove dead code

---
 src/v/cluster/rm_stm.h | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/src/v/cluster/rm_stm.h b/src/v/cluster/rm_stm.h
index 0be0ae8915982..aee55034860a6 100644
--- a/src/v/cluster/rm_stm.h
+++ b/src/v/cluster/rm_stm.h
@@ -409,18 +409,6 @@ class rm_stm final : public persisted_stm {
         }
     };
 
-    struct request_id {
-        model::producer_identity pid;
-        int32_t seq;
-
-        auto operator<=>(const request_id&) const = default;
-
-        template<typename H>
-        friend H AbslHashValue(H h, const request_id& bid) {
-            return H::combine(std::move(h), bid.pid, bid.seq);
-        }
-    };
-
     // When a request is retried while the first appempt is still
     // being replicated the retried request is parked until the
     // original request is replicated.

From 90007628f63115bc322fc1731c582ec4b2fac237 Mon Sep 17 00:00:00 2001
From: Denis Rystsov <denis@vectorized.io>
Date: Tue, 5 Jul 2022 21:54:40 -0700
Subject: [PATCH 050/201] rm_stm: add feature_table as a dependency

---
 src/v/cluster/partition.cc               |  6 ++--
 src/v/cluster/partition.h                |  5 ++-
 src/v/cluster/partition_manager.cc       | 12 +++++--
 src/v/cluster/partition_manager.h        |  5 ++-
 src/v/cluster/rm_stm.cc                  |  6 ++--
 src/v/cluster/rm_stm.h                   |  5 ++-
 src/v/cluster/tests/idempotency_tests.cc | 43 ++++++++++++++++++++----
 src/v/cluster/tests/rm_stm_tests.cc      | 37 ++++++++++++++++----
 src/v/redpanda/application.cc            |  3 +-
 9 files changed, 98 insertions(+), 24 deletions(-)

diff --git a/src/v/cluster/partition.cc b/src/v/cluster/partition.cc
index 0547135d06177..090616884b616 100644
--- a/src/v/cluster/partition.cc
+++ b/src/v/cluster/partition.cc
@@ -32,10 +32,12 @@ partition::partition(
   consensus_ptr r,
   ss::sharded<cluster::tx_gateway_frontend>& tx_gateway_frontend,
   ss::sharded<cloud_storage::remote>& cloud_storage_api,
-  ss::sharded<cloud_storage::cache>& cloud_storage_cache)
+  ss::sharded<cloud_storage::cache>& cloud_storage_cache,
+  ss::sharded<feature_table>& feature_table)
   : _raft(r)
   , _probe(std::make_unique<replicated_partition_probe>(*this))
   , _tx_gateway_frontend(tx_gateway_frontend)
+  , _feature_table(feature_table)
   , _is_tx_enabled(config::shard_local_cfg().enable_transactions.value())
   , _is_idempotence_enabled(
       config::shard_local_cfg().enable_idempotence.value()) {
@@ -70,7 +72,7 @@ partition::partition(
 
         if (has_rm_stm) {
             _rm_stm = ss::make_shared<cluster::rm_stm>(
-              clusterlog, _raft.get(), _tx_gateway_frontend);
+              clusterlog, _raft.get(), _tx_gateway_frontend, _feature_table);
             stm_manager->add_stm(_rm_stm);
         }
 
diff --git a/src/v/cluster/partition.h b/src/v/cluster/partition.h
index 93f6b19b4387e..060153c9cc0e0 100644
--- a/src/v/cluster/partition.h
+++ b/src/v/cluster/partition.h
@@ -13,6 +13,7 @@
 
 #include "cloud_storage/remote_partition.h"
 #include "cluster/archival_metadata_stm.h"
+#include "cluster/feature_table.h"
 #include "cluster/id_allocator_stm.h"
 #include "cluster/partition_probe.h"
 #include "cluster/rm_stm.h"
@@ -44,7 +45,8 @@ class partition {
       consensus_ptr r,
       ss::sharded<cluster::tx_gateway_frontend>&,
       ss::sharded<cloud_storage::remote>&,
-      ss::sharded<cloud_storage::cache>&);
+      ss::sharded<cloud_storage::cache>&,
+      ss::sharded<feature_table>&);
 
     raft::group_id group() const { return _raft->group(); }
     ss::future<> start();
@@ -290,6 +292,7 @@ class partition {
     ss::abort_source _as;
     partition_probe _probe;
     ss::sharded<cluster::tx_gateway_frontend>& _tx_gateway_frontend;
+    ss::sharded<feature_table>& _feature_table;
     bool _is_tx_enabled{false};
     bool _is_idempotence_enabled{false};
     ss::lw_shared_ptr<cloud_storage::remote_partition> _cloud_storage_partition;
diff --git a/src/v/cluster/partition_manager.cc b/src/v/cluster/partition_manager.cc
index 406baf98795c8..263b3f1857ef5 100644
--- a/src/v/cluster/partition_manager.cc
+++ b/src/v/cluster/partition_manager.cc
@@ -48,13 +48,15 @@ partition_manager::partition_manager(
   ss::sharded<cluster::tx_gateway_frontend>& tx_gateway_frontend,
   ss::sharded<cloud_storage::partition_recovery_manager>& recovery_mgr,
   ss::sharded<cloud_storage::remote>& cloud_storage_api,
-  ss::sharded<cloud_storage::cache>& cloud_storage_cache)
+  ss::sharded<cloud_storage::cache>& cloud_storage_cache,
+  ss::sharded<feature_table>& feature_table)
   : _storage(storage.local())
   , _raft_manager(raft)
   , _tx_gateway_frontend(tx_gateway_frontend)
   , _partition_recovery_mgr(recovery_mgr)
   , _cloud_storage_api(cloud_storage_api)
-  , _cloud_storage_cache(cloud_storage_cache) {}
+  , _cloud_storage_cache(cloud_storage_cache)
+  , _feature_table(feature_table) {}
 
 partition_manager::ntp_table_container
 partition_manager::get_topic_partition_table(
@@ -120,7 +122,11 @@ ss::future<consensus_ptr> partition_manager::manage(
         group, std::move(initial_nodes), log);
 
     auto p = ss::make_lw_shared<partition>(
-      c, _tx_gateway_frontend, _cloud_storage_api, _cloud_storage_cache);
+      c,
+      _tx_gateway_frontend,
+      _cloud_storage_api,
+      _cloud_storage_cache,
+      _feature_table);
 
     _ntp_table.emplace(log.config().ntp(), p);
     _raft_table.emplace(group, p);
diff --git a/src/v/cluster/partition_manager.h b/src/v/cluster/partition_manager.h
index fa66f5ea31a09..8f45302e4abc8 100644
--- a/src/v/cluster/partition_manager.h
+++ b/src/v/cluster/partition_manager.h
@@ -14,6 +14,7 @@
 #include "cloud_storage/cache_service.h"
 #include "cloud_storage/partition_recovery_manager.h"
 #include "cloud_storage/remote.h"
+#include "cluster/feature_table.h"
 #include "cluster/ntp_callbacks.h"
 #include "cluster/partition.h"
 #include "model/metadata.h"
@@ -37,7 +38,8 @@ class partition_manager {
       ss::sharded<cluster::tx_gateway_frontend>&,
       ss::sharded<cloud_storage::partition_recovery_manager>&,
       ss::sharded<cloud_storage::remote>&,
-      ss::sharded<cloud_storage::cache>&);
+      ss::sharded<cloud_storage::cache>&,
+      ss::sharded<feature_table>&);
 
     using manage_cb_t
       = ss::noncopyable_function<void(ss::lw_shared_ptr<partition>)>;
@@ -190,6 +192,7 @@ class partition_manager {
       _partition_recovery_mgr;
     ss::sharded<cloud_storage::remote>& _cloud_storage_api;
     ss::sharded<cloud_storage::cache>& _cloud_storage_cache;
+    ss::sharded<feature_table>& _feature_table;
     ss::gate _gate;
     bool _block_new_leadership{false};
 
diff --git a/src/v/cluster/rm_stm.cc b/src/v/cluster/rm_stm.cc
index 328ebfca2b670..3658c61db0327 100644
--- a/src/v/cluster/rm_stm.cc
+++ b/src/v/cluster/rm_stm.cc
@@ -215,7 +215,8 @@ struct tx_snapshot_v1 {
 rm_stm::rm_stm(
   ss::logger& logger,
   raft::consensus* c,
-  ss::sharded<cluster::tx_gateway_frontend>& tx_gateway_frontend)
+  ss::sharded<cluster::tx_gateway_frontend>& tx_gateway_frontend,
+  ss::sharded<feature_table>& feature_table)
   : persisted_stm("tx.snapshot", logger, c)
   , _oldest_session(model::timestamp::now())
   , _sync_timeout(config::shard_local_cfg().rm_sync_timeout_ms.value())
@@ -234,7 +235,8 @@ rm_stm::rm_stm(
   , _abort_snapshot_mgr(
       "abort.idx",
       std::filesystem::path(c->log_config().work_directory()),
-      ss::default_priority_class()) {
+      ss::default_priority_class())
+  , _feature_table(feature_table) {
     if (!_is_tx_enabled) {
         _is_autoabort_enabled = false;
     }
diff --git a/src/v/cluster/rm_stm.h b/src/v/cluster/rm_stm.h
index aee55034860a6..6e6c09c542e25 100644
--- a/src/v/cluster/rm_stm.h
+++ b/src/v/cluster/rm_stm.h
@@ -11,6 +11,7 @@
 
 #pragma once
 
+#include "cluster/feature_table.h"
 #include "cluster/persisted_stm.h"
 #include "cluster/tx_utils.h"
 #include "cluster/types.h"
@@ -151,7 +152,8 @@ class rm_stm final : public persisted_stm {
     explicit rm_stm(
       ss::logger&,
       raft::consensus*,
-      ss::sharded<cluster::tx_gateway_frontend>&);
+      ss::sharded<cluster::tx_gateway_frontend>&,
+      ss::sharded<feature_table>&);
 
     ss::future<checked<model::term_id, tx_errc>> begin_tx(
       model::producer_identity, model::tx_seq, std::chrono::milliseconds);
@@ -523,6 +525,7 @@ class rm_stm final : public persisted_stm {
     ss::sharded<cluster::tx_gateway_frontend>& _tx_gateway_frontend;
     storage::snapshot_manager _abort_snapshot_mgr;
     ss::lw_shared_ptr<const storage::offset_translator_state> _translator;
+    ss::sharded<feature_table>& _feature_table;
 };
 
 } // namespace cluster
diff --git a/src/v/cluster/tests/idempotency_tests.cc b/src/v/cluster/tests/idempotency_tests.cc
index a9c1d6a3b252b..f79f59eaccd06 100644
--- a/src/v/cluster/tests/idempotency_tests.cc
+++ b/src/v/cluster/tests/idempotency_tests.cc
@@ -8,6 +8,7 @@
 // by the Apache License, Version 2.0
 
 #include "cluster/errc.h"
+#include "cluster/feature_table.h"
 #include "cluster/rm_stm.h"
 #include "finjector/hbadger.h"
 #include "model/fundamental.h"
@@ -36,7 +37,10 @@ FIXTURE_TEST(
     start_raft();
 
     ss::sharded<cluster::tx_gateway_frontend> tx_gateway_frontend;
-    cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend);
+    ss::sharded<cluster::feature_table> feature_table;
+    feature_table.start().get0();
+    cluster::rm_stm stm(
+      logger, _raft.get(), tx_gateway_frontend, feature_table);
     stm.testing_only_disable_auto_abort();
 
     stm.start().get0();
@@ -81,6 +85,7 @@ FIXTURE_TEST(
                   raft::replicate_options(raft::consistency_level::quorum_ack))
                 .get0();
     BOOST_REQUIRE((bool)r2);
+    feature_table.stop().get0();
 }
 
 FIXTURE_TEST(
@@ -88,7 +93,10 @@ FIXTURE_TEST(
     start_raft();
 
     ss::sharded<cluster::tx_gateway_frontend> tx_gateway_frontend;
-    cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend);
+    ss::sharded<cluster::feature_table> feature_table;
+    feature_table.start().get0();
+    cluster::rm_stm stm(
+      logger, _raft.get(), tx_gateway_frontend, feature_table);
     stm.testing_only_disable_auto_abort();
 
     stm.start().get0();
@@ -135,13 +143,17 @@ FIXTURE_TEST(
     BOOST_REQUIRE((bool)r2);
 
     BOOST_REQUIRE(r1.value().last_offset < r2.value().last_offset);
+    feature_table.stop().get0();
 }
 
 FIXTURE_TEST(test_rm_stm_caches_last_5_offsets, mux_state_machine_fixture) {
     start_raft();
 
     ss::sharded<cluster::tx_gateway_frontend> tx_gateway_frontend;
-    cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend);
+    ss::sharded<cluster::feature_table> feature_table;
+    feature_table.start().get0();
+    cluster::rm_stm stm(
+      logger, _raft.get(), tx_gateway_frontend, feature_table);
     stm.testing_only_disable_auto_abort();
 
     stm.start().get0();
@@ -200,13 +212,17 @@ FIXTURE_TEST(test_rm_stm_caches_last_5_offsets, mux_state_machine_fixture) {
         BOOST_REQUIRE((bool)r1);
         BOOST_REQUIRE(r1.value().last_offset == offsets[i]);
     }
+    feature_table.stop().get0();
 }
 
 FIXTURE_TEST(test_rm_stm_doesnt_cache_6th_offset, mux_state_machine_fixture) {
     start_raft();
 
     ss::sharded<cluster::tx_gateway_frontend> tx_gateway_frontend;
-    cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend);
+    ss::sharded<cluster::feature_table> feature_table;
+    feature_table.start().get0();
+    cluster::rm_stm stm(
+      logger, _raft.get(), tx_gateway_frontend, feature_table);
     stm.testing_only_disable_auto_abort();
 
     stm.start().get0();
@@ -260,13 +276,17 @@ FIXTURE_TEST(test_rm_stm_doesnt_cache_6th_offset, mux_state_machine_fixture) {
           r1
           == failure_type<cluster::errc>(cluster::errc::sequence_out_of_order));
     }
+    feature_table.stop().get0();
 }
 
 FIXTURE_TEST(test_rm_stm_prevents_gaps, mux_state_machine_fixture) {
     start_raft();
 
     ss::sharded<cluster::tx_gateway_frontend> tx_gateway_frontend;
-    cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend);
+    ss::sharded<cluster::feature_table> feature_table;
+    feature_table.start().get0();
+    cluster::rm_stm stm(
+      logger, _raft.get(), tx_gateway_frontend, feature_table);
     stm.testing_only_disable_auto_abort();
 
     stm.start().get0();
@@ -312,6 +332,7 @@ FIXTURE_TEST(test_rm_stm_prevents_gaps, mux_state_machine_fixture) {
                 .get0();
     BOOST_REQUIRE(
       r2 == failure_type<cluster::errc>(cluster::errc::sequence_out_of_order));
+    feature_table.stop().get0();
 }
 
 FIXTURE_TEST(
@@ -319,7 +340,10 @@ FIXTURE_TEST(
     start_raft();
 
     ss::sharded<cluster::tx_gateway_frontend> tx_gateway_frontend;
-    cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend);
+    ss::sharded<cluster::feature_table> feature_table;
+    feature_table.start().get0();
+    cluster::rm_stm stm(
+      logger, _raft.get(), tx_gateway_frontend, feature_table);
     stm.testing_only_disable_auto_abort();
 
     stm.start().get0();
@@ -349,13 +373,17 @@ FIXTURE_TEST(
                .get0();
     BOOST_REQUIRE(
       r == failure_type<cluster::errc>(cluster::errc::sequence_out_of_order));
+    feature_table.stop().get0();
 }
 
 FIXTURE_TEST(test_rm_stm_passes_immediate_retry, mux_state_machine_fixture) {
     start_raft();
 
     ss::sharded<cluster::tx_gateway_frontend> tx_gateway_frontend;
-    cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend);
+    ss::sharded<cluster::feature_table> feature_table;
+    feature_table.start().get0();
+    cluster::rm_stm stm(
+      logger, _raft.get(), tx_gateway_frontend, feature_table);
     stm.testing_only_disable_auto_abort();
 
     stm.start().get0();
@@ -404,4 +432,5 @@ FIXTURE_TEST(test_rm_stm_passes_immediate_retry, mux_state_machine_fixture) {
     BOOST_REQUIRE((bool)r1);
     BOOST_REQUIRE((bool)r2);
     BOOST_REQUIRE(r1.value().last_offset == r2.value().last_offset);
+    feature_table.stop().get0();
 }
diff --git a/src/v/cluster/tests/rm_stm_tests.cc b/src/v/cluster/tests/rm_stm_tests.cc
index c53da9af13c5c..f40bb497b9d94 100644
--- a/src/v/cluster/tests/rm_stm_tests.cc
+++ b/src/v/cluster/tests/rm_stm_tests.cc
@@ -8,6 +8,7 @@
 // by the Apache License, Version 2.0
 
 #include "cluster/errc.h"
+#include "cluster/feature_table.h"
 #include "cluster/rm_stm.h"
 #include "finjector/hbadger.h"
 #include "model/fundamental.h"
@@ -65,7 +66,10 @@ FIXTURE_TEST(test_tx_happy_tx, mux_state_machine_fixture) {
     start_raft();
 
     ss::sharded<cluster::tx_gateway_frontend> tx_gateway_frontend;
-    cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend);
+    ss::sharded<cluster::feature_table> feature_table;
+    feature_table.start().get0();
+    cluster::rm_stm stm(
+      logger, _raft.get(), tx_gateway_frontend, feature_table);
     stm.testing_only_disable_auto_abort();
     stm.testing_only_enable_transactions();
 
@@ -129,6 +133,7 @@ FIXTURE_TEST(test_tx_happy_tx, mux_state_machine_fixture) {
     BOOST_REQUIRE_EQUAL(aborted_txs.size(), 0);
 
     BOOST_REQUIRE_LT(tx_offset, stm.last_stable_offset());
+    feature_table.stop().get0();
 }
 
 // tests:
@@ -138,7 +143,10 @@ FIXTURE_TEST(test_tx_aborted_tx_1, mux_state_machine_fixture) {
     start_raft();
 
     ss::sharded<cluster::tx_gateway_frontend> tx_gateway_frontend;
-    cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend);
+    ss::sharded<cluster::feature_table> feature_table;
+    feature_table.start().get0();
+    cluster::rm_stm stm(
+      logger, _raft.get(), tx_gateway_frontend, feature_table);
     stm.testing_only_disable_auto_abort();
     stm.testing_only_enable_transactions();
 
@@ -204,6 +212,7 @@ FIXTURE_TEST(test_tx_aborted_tx_1, mux_state_machine_fixture) {
       }));
 
     BOOST_REQUIRE_LT(tx_offset, stm.last_stable_offset());
+    feature_table.stop().get0();
 }
 
 // tests:
@@ -213,7 +222,10 @@ FIXTURE_TEST(test_tx_aborted_tx_2, mux_state_machine_fixture) {
     start_raft();
 
     ss::sharded<cluster::tx_gateway_frontend> tx_gateway_frontend;
-    cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend);
+    ss::sharded<cluster::feature_table> feature_table;
+    feature_table.start().get0();
+    cluster::rm_stm stm(
+      logger, _raft.get(), tx_gateway_frontend, feature_table);
     stm.testing_only_disable_auto_abort();
     stm.testing_only_enable_transactions();
 
@@ -285,6 +297,7 @@ FIXTURE_TEST(test_tx_aborted_tx_2, mux_state_machine_fixture) {
       }));
 
     BOOST_REQUIRE_LT(tx_offset, stm.last_stable_offset());
+    feature_table.stop().get0();
 }
 
 // transactional writes of an unknown tx are rejected
@@ -292,7 +305,10 @@ FIXTURE_TEST(test_tx_unknown_produce, mux_state_machine_fixture) {
     start_raft();
 
     ss::sharded<cluster::tx_gateway_frontend> tx_gateway_frontend;
-    cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend);
+    ss::sharded<cluster::feature_table> feature_table;
+    feature_table.start().get0();
+    cluster::rm_stm stm(
+      logger, _raft.get(), tx_gateway_frontend, feature_table);
     stm.testing_only_disable_auto_abort();
     stm.testing_only_enable_transactions();
 
@@ -322,6 +338,7 @@ FIXTURE_TEST(test_tx_unknown_produce, mux_state_machine_fixture) {
                    raft::replicate_options(raft::consistency_level::quorum_ack))
                  .get0();
     BOOST_REQUIRE(offset_r == invalid_producer_epoch);
+    feature_table.stop().get0();
 }
 
 // begin fences off old transactions
@@ -329,7 +346,10 @@ FIXTURE_TEST(test_tx_begin_fences_produce, mux_state_machine_fixture) {
     start_raft();
 
     ss::sharded<cluster::tx_gateway_frontend> tx_gateway_frontend;
-    cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend);
+    ss::sharded<cluster::feature_table> feature_table;
+    feature_table.start().get0();
+    cluster::rm_stm stm(
+      logger, _raft.get(), tx_gateway_frontend, feature_table);
     stm.testing_only_disable_auto_abort();
     stm.testing_only_enable_transactions();
 
@@ -379,6 +399,7 @@ FIXTURE_TEST(test_tx_begin_fences_produce, mux_state_machine_fixture) {
                    raft::replicate_options(raft::consistency_level::quorum_ack))
                  .get0();
     BOOST_REQUIRE(!(bool)offset_r);
+    feature_table.stop().get0();
 }
 
 // transactional writes of an aborted tx are rejected
@@ -386,7 +407,10 @@ FIXTURE_TEST(test_tx_post_aborted_produce, mux_state_machine_fixture) {
     start_raft();
 
     ss::sharded<cluster::tx_gateway_frontend> tx_gateway_frontend;
-    cluster::rm_stm stm(logger, _raft.get(), tx_gateway_frontend);
+    ss::sharded<cluster::feature_table> feature_table;
+    feature_table.start().get0();
+    cluster::rm_stm stm(
+      logger, _raft.get(), tx_gateway_frontend, feature_table);
     stm.testing_only_disable_auto_abort();
     stm.testing_only_enable_transactions();
 
@@ -438,4 +462,5 @@ FIXTURE_TEST(test_tx_post_aborted_produce, mux_state_machine_fixture) {
                    raft::replicate_options(raft::consistency_level::quorum_ack))
                  .get0();
     BOOST_REQUIRE(offset_r == invalid_producer_epoch);
+    feature_table.stop().get0();
 }
diff --git a/src/v/redpanda/application.cc b/src/v/redpanda/application.cc
index 6dba2999a00d8..1e05566105087 100644
--- a/src/v/redpanda/application.cc
+++ b/src/v/redpanda/application.cc
@@ -723,7 +723,8 @@ void application::wire_up_redpanda_services() {
       std::ref(tx_gateway_frontend),
       std::ref(partition_recovery_manager),
       std::ref(cloud_storage_api),
-      std::ref(shadow_index_cache))
+      std::ref(shadow_index_cache),
+      std::ref(_feature_table))
       .get();
     vlog(_log.info, "Partition manager started");
 

From 8e7346dba33421d9b2a0e6bca5daa0cf8e8d524f Mon Sep 17 00:00:00 2001
From: Denis Rystsov <denis@vectorized.io>
Date: Tue, 5 Jul 2022 17:24:10 -0700
Subject: [PATCH 051/201] rm_stm: put kafka offset cache behind feature manager

---
 src/v/cluster/feature_table.cc              | 4 +++-
 src/v/cluster/feature_table.h               | 7 +++++++
 src/v/cluster/rm_stm.cc                     | 7 ++++++-
 tests/rptest/tests/cluster_features_test.py | 2 +-
 4 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/v/cluster/feature_table.cc b/src/v/cluster/feature_table.cc
index 134f9a2263741..24bf521389c83 100644
--- a/src/v/cluster/feature_table.cc
+++ b/src/v/cluster/feature_table.cc
@@ -30,6 +30,8 @@ std::string_view to_string_view(feature f) {
         return "serde_raft_0";
     case feature::license:
         return "license";
+    case feature::rm_stm_kafka_cache:
+        return "rm_stm_kafka_cache";
     case feature::test_alpha:
         return "__test_alpha";
     }
@@ -58,7 +60,7 @@ std::string_view to_string_view(feature_state::state s) {
 
 // The version that this redpanda node will report: increment this
 // on protocol changes to raft0 structures, like adding new services.
-static constexpr cluster_version latest_version = cluster_version{4};
+static constexpr cluster_version latest_version = cluster_version{5};
 
 feature_table::feature_table() {
     // Intentionally undocumented environment variable, only for use
diff --git a/src/v/cluster/feature_table.h b/src/v/cluster/feature_table.h
index d40e2a7235b5b..88eddf0420923 100644
--- a/src/v/cluster/feature_table.h
+++ b/src/v/cluster/feature_table.h
@@ -27,6 +27,7 @@ enum class feature : std::uint64_t {
     mtls_authentication = 0x8,
     serde_raft_0 = 0x10,
     license = 0x20,
+    rm_stm_kafka_cache = 0x40,
 
     // Dummy features for testing only
     test_alpha = uint64_t(1) << 63,
@@ -115,6 +116,12 @@ constexpr static std::array feature_schema{
     feature::license,
     feature_spec::available_policy::always,
     feature_spec::prepare_policy::always},
+  feature_spec{
+    cluster_version{5},
+    "rm_stm_kafka_cache",
+    feature::rm_stm_kafka_cache,
+    feature_spec::available_policy::always,
+    feature_spec::prepare_policy::always},
   feature_spec{
     cluster_version{2001},
     "__test_alpha",
diff --git a/src/v/cluster/rm_stm.cc b/src/v/cluster/rm_stm.cc
index 3658c61db0327..baa3dc492167d 100644
--- a/src/v/cluster/rm_stm.cc
+++ b/src/v/cluster/rm_stm.cc
@@ -1959,7 +1959,12 @@ rm_stm::apply_snapshot(stm_snapshot_header hdr, iobuf&& tx_ss_buf) {
     _insync_offset = data.offset;
 }
 
-uint8_t rm_stm::active_snapshot_version() { return tx_snapshot_v1::version; }
+uint8_t rm_stm::active_snapshot_version() {
+    if (_feature_table.local().is_active(feature::rm_stm_kafka_cache)) {
+        return tx_snapshot::version;
+    }
+    return tx_snapshot_v1::version;
+}
 
 template<class T>
 void rm_stm::fill_snapshot_wo_seqs(T& snapshot) {
diff --git a/tests/rptest/tests/cluster_features_test.py b/tests/rptest/tests/cluster_features_test.py
index 471b576cba87d..ab7c91c43df89 100644
--- a/tests/rptest/tests/cluster_features_test.py
+++ b/tests/rptest/tests/cluster_features_test.py
@@ -42,7 +42,7 @@ def _assert_default_features(self):
         # This assertion will break each time we increment the value
         # of `latest_version` in the redpanda source.  Update it when
         # that happens.
-        assert features_response['cluster_version'] == 4
+        assert features_response['cluster_version'] == 5
 
         assert self._get_features_map(
             features_response)['central_config']['state'] == 'active'

From e5846f10846a9a4b8224b83b609cdc974297b6b7 Mon Sep 17 00:00:00 2001
From: Denis Rystsov <denis@vectorized.io>
Date: Mon, 11 Jul 2022 19:28:20 -0700
Subject: [PATCH 052/201] ducky: move wait_for_num_versions to
 redpanda_installer

---
 tests/rptest/services/redpanda_installer.py | 19 +++++++++++++++++++
 tests/rptest/tests/upgrade_test.py          | 20 +-------------------
 2 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/tests/rptest/services/redpanda_installer.py b/tests/rptest/services/redpanda_installer.py
index fe4cfd1bdc462..e9ab2c2f2cbbf 100644
--- a/tests/rptest/services/redpanda_installer.py
+++ b/tests/rptest/services/redpanda_installer.py
@@ -9,6 +9,7 @@
 
 import re
 import requests
+from ducktape.utils.util import wait_until
 
 # Match any version that may result from a redpanda binary, which may not be a
 # released version.
@@ -16,6 +17,24 @@
 VERSION_RE = re.compile(".*v(\\d+)\\.(\\d+)\\.(\\d+).*")
 
 
+def wait_for_num_versions(redpanda, num_versions):
+    def get_unique_versions():
+        node = redpanda.nodes[0]
+        brokers_list = \
+            str(node.account.ssh_output(f"{redpanda.find_binary('rpk')} redpanda admin brokers list"))
+        redpanda.logger.debug(brokers_list)
+        version_re = re.compile("v\\d+\\.\\d+\\.\\d+")
+        return set(version_re.findall(brokers_list))
+
+    # NOTE: allow retries, as the version may not be available immediately
+    # following a restart.
+    wait_until(lambda: len(get_unique_versions()) == num_versions,
+               timeout_sec=30)
+    unique_versions = get_unique_versions()
+    assert len(unique_versions) == num_versions, unique_versions
+    return unique_versions
+
+
 class RedpandaInstaller:
     """
     Provides mechanisms to install multiple Redpanda binaries on a cluster.
diff --git a/tests/rptest/tests/upgrade_test.py b/tests/rptest/tests/upgrade_test.py
index e5da152eeb6fb..7745e1ebccaf5 100644
--- a/tests/rptest/tests/upgrade_test.py
+++ b/tests/rptest/tests/upgrade_test.py
@@ -14,25 +14,7 @@
 from rptest.tests.redpanda_test import RedpandaTest
 from rptest.services.cluster import cluster
 from rptest.services.redpanda import RESTART_LOG_ALLOW_LIST
-from rptest.services.redpanda_installer import RedpandaInstaller
-
-
-def wait_for_num_versions(redpanda, num_versions):
-    def get_unique_versions():
-        node = redpanda.nodes[0]
-        brokers_list = \
-            str(node.account.ssh_output(f"{redpanda.find_binary('rpk')} redpanda admin brokers list"))
-        redpanda.logger.debug(brokers_list)
-        version_re = re.compile("v\\d+\\.\\d+\\.\\d+")
-        return set(version_re.findall(brokers_list))
-
-    # NOTE: allow retries, as the version may not be available immediately
-    # following a restart.
-    wait_until(lambda: len(get_unique_versions()) == num_versions,
-               timeout_sec=30)
-    unique_versions = get_unique_versions()
-    assert len(unique_versions) == num_versions, unique_versions
-    return unique_versions
+from rptest.services.redpanda_installer import RedpandaInstaller, wait_for_num_versions
 
 
 class UpgradeFromSpecificVersion(RedpandaTest):

From c45672a8a65277866349c353637a039cb60051f1 Mon Sep 17 00:00:00 2001
From: Denis Rystsov <denis@vectorized.io>
Date: Mon, 11 Jul 2022 19:28:59 -0700
Subject: [PATCH 053/201] ducky: add fix 5355 upgrade test

Manually validated the tests by tweaking active_snapshot_version()
to ignore the feature manager and always use the newest version, and
checked that the tests fail in that case.
---
 tests/rptest/tests/fix_5355_upgrade_test.py | 118 ++++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 tests/rptest/tests/fix_5355_upgrade_test.py

diff --git a/tests/rptest/tests/fix_5355_upgrade_test.py b/tests/rptest/tests/fix_5355_upgrade_test.py
new file mode 100644
index 0000000000000..455154950163d
--- /dev/null
+++ b/tests/rptest/tests/fix_5355_upgrade_test.py
@@ -0,0 +1,118 @@
+# Copyright 2022 Redpanda Data, Inc.
+#
+# Use of this software is governed by the Business Source License
+# included in the file licenses/BSL.md
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0
+
+import re
+
+from rptest.clients.types import TopicSpec
+from rptest.tests.redpanda_test import RedpandaTest
+from rptest.services.cluster import cluster
+from rptest.services.redpanda import RESTART_LOG_ALLOW_LIST
+from rptest.services.redpanda_installer import RedpandaInstaller, wait_for_num_versions
+from rptest.services.redpanda import RedpandaService
+
+from confluent_kafka import (Producer, KafkaException)
+from random import choice
+from string import ascii_uppercase
+
+
+def on_delivery(err, msg):
+    if err is not None:
+        raise KafkaException(err)
+
+
+class Fix5355UpgradeTest(RedpandaTest):
+    topics = [TopicSpec(name="topic1")]
+    """
+    Basic test that upgrading software works as expected.
+    """
+    def __init__(self, test_context):
+        extra_rp_conf = {
+            "default_topic_replications": 3,
+            "default_topic_partitions": 1,
+            "log_segment_size": 1048576
+        }
+        super(Fix5355UpgradeTest, self).__init__(test_context=test_context,
+                                                 num_brokers=3,
+                                                 enable_installer=True,
+                                                 extra_rp_conf=extra_rp_conf)
+        self.installer = self.redpanda._installer
+
+    def setUp(self):
+        # NOTE: `rpk redpanda admin brokers list` requires versions v22.1.x and
+        # above.
+        self.installer.install(self.redpanda.nodes, (22, 1, 3))
+        super(Fix5355UpgradeTest, self).setUp()
+
+    def fill_segment(self):
+        payload_1kb = ''.join(choice(ascii_uppercase) for i in range(1024))
+        p = Producer({
+            "bootstrap.servers": self.redpanda.brokers(),
+            "enable.idempotence": True,
+            "retries": 5
+        })
+        for i in range(0, 2 * 1024):
+            p.produce("topic1",
+                      key="key1".encode('utf-8'),
+                      value=payload_1kb.encode('utf-8'),
+                      callback=on_delivery)
+        p.flush()
+
+    def check_snapshot_exist(self):
+        for node in self.redpanda.nodes:
+            cmd = f"find {RedpandaService.DATA_DIR}"
+            out_iter = node.account.ssh_capture(cmd)
+            has_snapshot = False
+            for line in out_iter:
+                has_snapshot = has_snapshot or re.match(
+                    f"{RedpandaService.DATA_DIR}/kafka/topic1/\\d+_\\d+/tx.snapshot",
+                    line)
+            assert has_snapshot
+
+    @cluster(num_nodes=3, log_allow_list=RESTART_LOG_ALLOW_LIST)
+    def test_rollback(self):
+        """
+        the test checks that a mid-upgrade rollback isn't broken
+        """
+        first_node = self.redpanda.nodes[0]
+
+        unique_versions = wait_for_num_versions(self.redpanda, 1)
+        assert "v22.1.3" in unique_versions, unique_versions
+
+        # Upgrade one node to the head version.
+        self.installer.install([first_node], RedpandaInstaller.HEAD)
+        self.redpanda.restart_nodes([first_node])
+        unique_versions = wait_for_num_versions(self.redpanda, 2)
+        assert "v22.1.3" in unique_versions, unique_versions
+
+        self.fill_segment()
+        self.check_snapshot_exist()
+
+        # Rollback the partial upgrade and ensure we go back to the original
+        # state.
+        self.installer.install([first_node], (22, 1, 3))
+        self.redpanda.restart_nodes([first_node])
+        unique_versions = wait_for_num_versions(self.redpanda, 1)
+        assert "v22.1.3" in unique_versions, unique_versions
+
+    @cluster(num_nodes=3, log_allow_list=RESTART_LOG_ALLOW_LIST)
+    def test_upgrade(self):
+        """
+        the test checks that an upgrade isn't broken
+        """
+        unique_versions = wait_for_num_versions(self.redpanda, 1)
+        assert "v22.1.3" in unique_versions, unique_versions
+
+        self.fill_segment()
+        self.check_snapshot_exist()
+
+        # Upgrade all nodes to the head version.
+        self.installer.install(self.redpanda.nodes, RedpandaInstaller.HEAD)
+        self.redpanda.restart_nodes(self.redpanda.nodes)
+        unique_versions = wait_for_num_versions(self.redpanda, 1)
+        assert "v22.1.3" not in unique_versions, unique_versions

From 9179aeef047c82f3e09176f9e8cd071802469b72 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@redpanda.com>
Date: Tue, 12 Jul 2022 13:23:20 +0100
Subject: [PATCH 054/201] metrics: replicate selected seastar metrics

This commit replicates a few metrics exposed by seastar in order to
expose them on the 'public_metrics' endpoint. See a list of the new
metrics below:

* redpanda_io_queue_total_write_ops
    * Description: Total write operations passed in the queue
    * Labels: class (i.e. the IO priority class used for the write),
      ioshard (i.e. the shard that executes the IO), shard (i.e. the
      shard that issued the IO request), mountpoint

* redpanda_io_queue_total_read_ops
    * Description: Total read operations passed in the queue
    * Labels: class (i.e. the IO priority class used for the read),
      ioshard (i.e. the shard that executes the IO), shard (i.e. the
      shard that issued the IO request), mountpoint

* redpanda_memory_free_memory
    * Description: Free memory size in bytes
    * Labels: shard

* redpanda_memory_allocated_memory
    * Description: Allocated memory size in bytes
    * Labels: shard
---
 src/v/redpanda/application.cc | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/v/redpanda/application.cc b/src/v/redpanda/application.cc
index ddfe8fd73c618..2b5f6adc5e0e9 100644
--- a/src/v/redpanda/application.cc
+++ b/src/v/redpanda/application.cc
@@ -57,6 +57,7 @@
 #include "redpanda/admin_server.h"
 #include "resource_mgmt/io_priority.h"
 #include "rpc/simple_protocol.h"
+#include "ssx/metrics.h"
 #include "storage/backlog_controller.h"
 #include "storage/chunk_cache.h"
 #include "storage/compaction_controller.h"
@@ -312,6 +313,16 @@ void application::initialize(
 }
 
 void application::setup_metrics() {
+    if (!config::shard_local_cfg().disable_public_metrics()) {
+        seastar::metrics::replicate_metric_families(
+          seastar::metrics::default_handle(),
+          {{"io_queue_total_read_ops", ssx::metrics::public_metrics_handle},
+           {"io_queue_total_write_ops", ssx::metrics::public_metrics_handle},
+           {"memory_allocated_memory", ssx::metrics::public_metrics_handle},
+           {"memory_free_memory", ssx::metrics::public_metrics_handle}})
+          .get();
+    }
+
     if (config::shard_local_cfg().disable_metrics()) {
         return;
     }

From cb38ff1ac55bdd989191a0df20ff697a4a8f6c40 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@redpanda.com>
Date: Tue, 12 Jul 2022 19:54:04 +0100
Subject: [PATCH 055/201] cmake: update seastar tag

---
 cmake/oss.cmake.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/oss.cmake.in b/cmake/oss.cmake.in
index e74bcdb8a3482..9529c2b8da3e2 100644
--- a/cmake/oss.cmake.in
+++ b/cmake/oss.cmake.in
@@ -179,7 +179,7 @@ ExternalProject_Add(fmt
 
 ExternalProject_Add(seastar
   GIT_REPOSITORY https://github.com/redpanda-data/seastar.git
-  GIT_TAG 16d4456f86e344d6c240c431045957e111ec213f
+  GIT_TAG 8f98d69bcbd2473eb9915204bd8fd1665e609739
   INSTALL_DIR    @REDPANDA_DEPS_INSTALL_DIR@
   CMAKE_COMMAND ${CMAKE_COMMAND} -E env ${cmake_build_env} ${CMAKE_COMMAND}
   CMAKE_ARGS

From 6d9fe61c37c5b471ee45fbc6006ced3fdf019284 Mon Sep 17 00:00:00 2001
From: Alexey Zatelepin <ztlpn@vectorized.io>
Date: Tue, 12 Jul 2022 23:22:29 +0300
Subject: [PATCH 056/201] tests: fix upgrade_test

Upgrade test started failing because older versions of redpanda don't
support the partition_autobalancing_mode flag (which was added to the
default config in place of the deprecated
enable_auto_rebalance_on_node_add flag). It looks like the only tests
that need a non-default value are node_operations_fuzzy_test (which
sets it explicitly) and scaling_up_test (where we now set the value
explicitly).
---
 tests/rptest/services/redpanda.py     |  1 -
 tests/rptest/tests/scaling_up_test.py | 10 ++++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/tests/rptest/services/redpanda.py b/tests/rptest/services/redpanda.py
index ecbafdd30c0b5..f2b0cd1b37687 100644
--- a/tests/rptest/services/redpanda.py
+++ b/tests/rptest/services/redpanda.py
@@ -402,7 +402,6 @@ class RedpandaService(Service):
         'default_topic_partitions': 4,
         'enable_metrics_reporter': False,
         'superusers': [SUPERUSER_CREDENTIALS[0]],
-        'partition_autobalancing_mode': 'node_add_remove'
     }
 
     logs = {
diff --git a/tests/rptest/tests/scaling_up_test.py b/tests/rptest/tests/scaling_up_test.py
index d18a02c30d0fb..624843363df2e 100644
--- a/tests/rptest/tests/scaling_up_test.py
+++ b/tests/rptest/tests/scaling_up_test.py
@@ -23,8 +23,14 @@ class ScalingUpTest(EndToEndTest):
     """
     @cluster(num_nodes=5)
     def test_adding_nodes_to_cluster(self):
-        self.redpanda = RedpandaService(
-            self.test_context, 3, extra_rp_conf={"group_topic_partitions": 1})
+        self.redpanda = RedpandaService(self.test_context,
+                                        3,
+                                        extra_rp_conf={
+                                            "group_topic_partitions":
+                                            1,
+                                            "partition_autobalancing_mode":
+                                            "node_add_remove"
+                                        })
         # start single node cluster
         self.redpanda.start(nodes=[self.redpanda.nodes[0]])
         # create some topics

From 8f70bf2a784017e1fade815220b612807b96b7ae Mon Sep 17 00:00:00 2001
From: Alexey Zatelepin <ztlpn@vectorized.io>
Date: Wed, 13 Jul 2022 03:39:55 +0300
Subject: [PATCH 057/201] tests: increase timeouts in partition_balancer_test

Sometimes in CI the leader table doesn't get updated in a timely fashion,
leading to sporadic partition_balancer_test failures, so it looks like the
timeout of 30 seconds needs to be increased.
---
 tests/rptest/tests/partition_balancer_test.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/rptest/tests/partition_balancer_test.py b/tests/rptest/tests/partition_balancer_test.py
index 2a0262a2515bd..111d60d23a8a7 100644
--- a/tests/rptest/tests/partition_balancer_test.py
+++ b/tests/rptest/tests/partition_balancer_test.py
@@ -51,7 +51,7 @@ def all_partitions_ready():
 
             partitions = wait_until_result(
                 all_partitions_ready,
-                timeout_sec=30,
+                timeout_sec=120,
                 backoff_sec=1,
                 err_msg="failed to wait until all partitions have leaders")
 
@@ -61,7 +61,7 @@ def all_partitions_ready():
 
         return ret
 
-    def wait_until_status(self, predicate, timeout_sec=60):
+    def wait_until_status(self, predicate, timeout_sec=120):
         admin = Admin(self.redpanda)
         start = time.time()
 
@@ -82,7 +82,7 @@ def check():
             backoff_sec=2,
             err_msg="failed to wait until status condition")
 
-    def wait_until_ready(self, timeout_sec=60):
+    def wait_until_ready(self, timeout_sec=120):
         return self.wait_until_status(
             lambda status: status['status'] == 'ready',
             timeout_sec=timeout_sec)

From 20c701a4653ca7a9c6dd481f81faef0123b9f09f Mon Sep 17 00:00:00 2001
From: Alexey Biryukov <alexey@redpanda.com>
Date: Tue, 12 Jul 2022 23:50:55 -0400
Subject: [PATCH 058/201] ducktape: run GroupMetricsTest with
 group_topic_partitions=1

test_leadership_transfer relies on the kafka_group_offset metric source as
an indication of the node being a group coordinator. However, in the admin
API there is no way to know which __consumer_groups partition a specific
consumer group uses, and the testcase always works with partition 0.
To make that work, the number of __consumer_groups partitions is set to 1
explicitly.
---
 tests/rptest/tests/group_membership_test.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/rptest/tests/group_membership_test.py b/tests/rptest/tests/group_membership_test.py
index 389bc1cf3a6b8..d57fce734e79c 100644
--- a/tests/rptest/tests/group_membership_test.py
+++ b/tests/rptest/tests/group_membership_test.py
@@ -127,7 +127,8 @@ def __init__(self, ctx, *args, **kwargs):
 
         # Require internal_kafka topic to have an increased replication factor
         extra_rp_conf = dict(default_topic_replications=3,
-                             enable_leader_balancer=False)
+                             enable_leader_balancer=False,
+                             group_topic_partitions=1)
         super(GroupMetricsTest, self).__init__(test_context=ctx,
                                                num_brokers=3,
                                                extra_rp_conf=extra_rp_conf)
@@ -374,6 +375,9 @@ def select_next_leader():
                        timeout_sec=30,
                        backoff_sec=5)
 
+            self.logger.debug(
+                f"Waiting for metrics from the single node: {new_leader.account.hostname}"
+            )
             wait_until(lambda: metrics_from_single_node(new_leader),
                        timeout_sec=30,
                        backoff_sec=5)

From ddf721182a0c0bc2e419be2c0b2fcaa1a0f80cb5 Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Tue, 5 Jul 2022 08:22:12 +0200
Subject: [PATCH 059/201] admin_server: do not use maybe_yield when listing
 reconfigurations

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/redpanda/admin_server.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/v/redpanda/admin_server.cc b/src/v/redpanda/admin_server.cc
index bb75946a1927d..59e7348e40367 100644
--- a/src/v/redpanda/admin_server.cc
+++ b/src/v/redpanda/admin_server.cc
@@ -2465,10 +2465,9 @@ void admin_server::register_partition_routes() {
                   replica.core = bs.shard;
                   r.previous_replicas.push(replica);
               }
-              co_await ss::coroutine::maybe_yield();
               ret.push_back(std::move(r));
           }
-          co_return std::move(ret);
+          co_return ret;
       });
 }
 

From f3c2fba987f6aeb0d1f34ee00f416145f1cb8580 Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Tue, 28 Jun 2022 09:43:31 +0200
Subject: [PATCH 060/201] c/topics_table: expose apis to query partitions being
 moved

Exposed APIs giving caller ability to query for all the partitions being
moved in the cluster and partitions that are moving to/from a given
node.

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/cluster/cluster_utils.h | 10 +++++++
 src/v/cluster/topic_table.cc  | 53 +++++++++++++++++++++++++++++++++++
 src/v/cluster/topic_table.h   | 17 +++++++++++
 3 files changed, 80 insertions(+)

diff --git a/src/v/cluster/cluster_utils.h b/src/v/cluster/cluster_utils.h
index 4aa9380e47b7d..371cd2f4527f2 100644
--- a/src/v/cluster/cluster_utils.h
+++ b/src/v/cluster/cluster_utils.h
@@ -271,4 +271,14 @@ inline std::vector<model::broker_shard> subtract_replica_sets(
       });
     return ret;
 }
+
+// check if replica set contains a node
+inline bool contains_node(
+  const std::vector<model::broker_shard>& replicas, model::node_id id) {
+    return std::find_if(
+             replicas.begin(),
+             replicas.end(),
+             [id](const model::broker_shard& bs) { return bs.node_id == id; })
+           != replicas.end();
+}
 } // namespace cluster
diff --git a/src/v/cluster/topic_table.cc b/src/v/cluster/topic_table.cc
index 9a2f1558bfedc..185f322ae2362 100644
--- a/src/v/cluster/topic_table.cc
+++ b/src/v/cluster/topic_table.cc
@@ -724,6 +724,58 @@ topic_table::get_previous_replica_set(const model::ntp& ntp) const {
     return std::nullopt;
 }
 
+std::vector<model::ntp>
+topic_table::ntps_moving_to_node(model::node_id node) const {
+    std::vector<model::ntp> ret;
+
+    for (const auto& [ntp, state] : _updates_in_progress) {
+        if (contains_node(state.previous_replicas, node)) {
+            continue;
+        }
+
+        auto current_assignment = get_partition_assignment(ntp);
+        if (unlikely(!current_assignment)) {
+            continue;
+        }
+
+        if (contains_node(current_assignment->replicas, node)) {
+            ret.push_back(ntp);
+        }
+    }
+    return ret;
+}
+
+std::vector<model::ntp>
+topic_table::ntps_moving_from_node(model::node_id node) const {
+    std::vector<model::ntp> ret;
+
+    for (const auto& [ntp, state] : _updates_in_progress) {
+        if (!contains_node(state.previous_replicas, node)) {
+            continue;
+        }
+
+        auto current_assignment = get_partition_assignment(ntp);
+        if (unlikely(!current_assignment)) {
+            continue;
+        }
+
+        if (!contains_node(current_assignment->replicas, node)) {
+            ret.push_back(ntp);
+        }
+    }
+    return ret;
+}
+
+std::vector<model::ntp> topic_table::all_updates_in_progress() const {
+    std::vector<model::ntp> ret;
+    ret.reserve(_updates_in_progress.size());
+    for (const auto& [ntp, _] : _updates_in_progress) {
+        ret.push_back(ntp);
+    }
+
+    return ret;
+}
+
 std::ostream&
 operator<<(std::ostream& o, topic_table::in_progress_state update) {
     switch (update) {
@@ -736,4 +788,5 @@ operator<<(std::ostream& o, topic_table::in_progress_state update) {
     }
     __builtin_unreachable();
 }
+
 } // namespace cluster
diff --git a/src/v/cluster/topic_table.h b/src/v/cluster/topic_table.h
index 1ea2ac237c786..a3cd4b8b83a41 100644
--- a/src/v/cluster/topic_table.h
+++ b/src/v/cluster/topic_table.h
@@ -208,6 +208,23 @@ class topic_table {
     std::optional<std::vector<model::broker_shard>>
     get_previous_replica_set(const model::ntp&) const;
 
+    const absl::node_hash_map<model::ntp, in_progress_update>&
+    in_progress_updates() const {
+        return _updates_in_progress;
+    }
+
+    /**
+     * Lists all NTPs that replicas are being move to a node
+     */
+    std::vector<model::ntp> ntps_moving_to_node(model::node_id) const;
+
+    /**
+     * Lists all NTPs that replicas are being move from a node
+     */
+    std::vector<model::ntp> ntps_moving_from_node(model::node_id) const;
+
+    std::vector<model::ntp> all_updates_in_progress() const;
+
 private:
     struct waiter {
         explicit waiter(uint64_t id)

From 204562b300c488929cf895abf14913698b8e5822 Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Fri, 8 Jul 2022 11:56:17 +0200
Subject: [PATCH 061/201] c/topics_table: fixed setting revision of pending
 update

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/cluster/topic_table.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/v/cluster/topic_table.cc b/src/v/cluster/topic_table.cc
index 185f322ae2362..9c4fc1f9432c8 100644
--- a/src/v/cluster/topic_table.cc
+++ b/src/v/cluster/topic_table.cc
@@ -175,6 +175,7 @@ topic_table::apply(move_partition_replicas_cmd cmd, model::offset o) {
       in_progress_update{
         .previous_replicas = current_assignment_it->replicas,
         .state = in_progress_state::update_requested,
+        .update_revision = model::revision_id(o),
       });
     auto previous_assignment = *current_assignment_it;
     // replace partition replica set
@@ -190,6 +191,7 @@ topic_table::apply(move_partition_replicas_cmd cmd, model::offset o) {
               in_progress_update{
                 .previous_replicas = current_assignment_it->replicas,
                 .state = in_progress_state::update_requested,
+                .update_revision = model::revision_id(o),
               });
             vassert(
               success,

From 986bf086711f36200e3678f2599b18a91cba2281 Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Fri, 8 Jul 2022 12:22:28 +0200
Subject: [PATCH 062/201] c/members_backend: fixed propagating update offset

Previously update offset wasn't propagated for decommission/recommission
update types

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/cluster/members_manager.cc | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/v/cluster/members_manager.cc b/src/v/cluster/members_manager.cc
index ba0d8df250512..ac09f17569bc3 100644
--- a/src/v/cluster/members_manager.cc
+++ b/src/v/cluster/members_manager.cc
@@ -226,12 +226,15 @@ members_manager::apply_update(model::record_batch b) {
       [this, update_offset](decommission_node_cmd cmd) mutable {
           auto id = cmd.key;
           return dispatch_updates_to_cores(update_offset, cmd)
-            .then([this, id](std::error_code error) {
+            .then([this, id, update_offset](std::error_code error) {
                 auto f = ss::now();
                 if (!error) {
                     _allocator.local().decommission_node(id);
                     f = _update_queue.push_eventually(node_update{
-                      .id = id, .type = node_update_type::decommissioned});
+                      .id = id,
+                      .type = node_update_type::decommissioned,
+                      .offset = update_offset,
+                    });
                 }
                 return f.then([error] { return error; });
             });
@@ -239,23 +242,27 @@ members_manager::apply_update(model::record_batch b) {
       [this, update_offset](recommission_node_cmd cmd) mutable {
           auto id = cmd.key;
           return dispatch_updates_to_cores(update_offset, cmd)
-            .then([this, id](std::error_code error) {
+            .then([this, id, update_offset](std::error_code error) {
                 auto f = ss::now();
                 if (!error) {
                     _allocator.local().recommission_node(id);
                     f = _update_queue.push_eventually(node_update{
-                      .id = id, .type = node_update_type::recommissioned});
+                      .id = id,
+                      .type = node_update_type::recommissioned,
+                      .offset = update_offset});
                 }
                 return f.then([error] { return error; });
             });
       },
-      [this](finish_reallocations_cmd cmd) mutable {
+      [this, update_offset](finish_reallocations_cmd cmd) mutable {
           // we do not have to dispatch this command to members table since this
           // command is only used by a backend to signal successfully finished
           // node reallocations
           return _update_queue
             .push_eventually(node_update{
-              .id = cmd.key, .type = node_update_type::reallocation_finished})
+              .id = cmd.key,
+              .type = node_update_type::reallocation_finished,
+              .offset = update_offset})
             .then([] { return make_error_code(errc::success); });
       },
       [this, update_offset](maintenance_mode_cmd cmd) {

From 6a1d9f4a48e7c70fa35825dafa99e1f87eee8857 Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Wed, 29 Jun 2022 12:01:14 +0200
Subject: [PATCH 063/201] c/members_backend: store revision of previous node
 decommissioning

In order to cancel only those partition movements which were requested
by node decommissioning we need to know which partition movements were
requested before the node was decommissioned.

In order to calculate the boundary and be able to cancel only
decommission related partition movements when recommissioning a node we
store revision of last node decommission request.

All the controller commands, both partition movements and node
operations, can be sequenced using the controller log offset. The offset
allows us to set a boundary between the moves which were scheduled before
the node was decommissioned and the ones which are a result of
decommissioning.

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/cluster/members_backend.cc | 18 +++++++++++++++++-
 src/v/cluster/members_backend.h  |  9 +++++++++
 src/v/cluster/members_manager.h  |  6 ++++++
 3 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/v/cluster/members_backend.cc b/src/v/cluster/members_backend.cc
index 65c4f57d03ab1..cab3bb272c542 100644
--- a/src/v/cluster/members_backend.cc
+++ b/src/v/cluster/members_backend.cc
@@ -132,6 +132,8 @@ void members_backend::handle_single_update(
         return;
     case update_t::decommissioned:
         stop_node_addition(update.id);
+        _decommission_command_revision.emplace(
+          update.id, model::revision_id(update.offset));
         _updates.emplace_back(update);
         _new_updates.signal();
         return;
@@ -335,7 +337,21 @@ ss::future<> members_backend::reconcile() {
     // if nothing to do, wait
     co_await _new_updates.wait([this] { return !_updates.empty(); });
     auto u = co_await _lock.get_units();
-
+    // remove stored revisions of previous decommissioning nodes, this will only
+    // happen when update is finished and it is either decommissioning or
+    // recommissioning of a node
+    for (const auto& meta : _updates) {
+        const bool is_decommission
+          = meta.update.type
+            == members_manager::node_update_type::decommissioned;
+        const bool is_recommission
+          = meta.update.type
+            == members_manager::node_update_type::recommissioned;
+
+        if (meta.finished && (is_decommission || is_recommission)) {
+            _decommission_command_revision.erase(meta.update.id);
+        }
+    }
     // remove finished updates
     std::erase_if(
       _updates, [](const update_meta& meta) { return meta.finished; });
diff --git a/src/v/cluster/members_backend.h b/src/v/cluster/members_backend.h
index aaae37e8b834b..b805e273a9213 100644
--- a/src/v/cluster/members_backend.h
+++ b/src/v/cluster/members_backend.h
@@ -10,6 +10,7 @@
 
 #include <seastar/core/condition-variable.hh>
 
+#include <absl/container/flat_hash_map.h>
 #include <absl/container/node_hash_set.h>
 
 #include <chrono>
@@ -118,6 +119,14 @@ class members_backend {
     ss::timer<> _retry_timer;
     ss::condition_variable _new_updates;
     ss::metrics::metric_groups _metrics;
+    /**
+     * store revision of node decommissioning update, decommissioning command
+     * revision is stored when node is being decommissioned, it is used to
+     * determine which partition movements were scheduled before the node was
+     * decommissioned, recommissioning process will not abort those movements.
+     */
+    absl::flat_hash_map<model::node_id, model::revision_id>
+      _decommission_command_revision;
 };
 std::ostream&
 operator<<(std::ostream&, const members_backend::reallocation_state&);
diff --git a/src/v/cluster/members_manager.h b/src/v/cluster/members_manager.h
index 37c019dce75a4..6743f709e6864 100644
--- a/src/v/cluster/members_manager.h
+++ b/src/v/cluster/members_manager.h
@@ -53,6 +53,12 @@ class members_manager {
         model::node_id id;
         node_update_type type;
         model::offset offset;
+
+        bool is_commissioning() const {
+            return type == members_manager::node_update_type::decommissioned
+                   || type == members_manager::node_update_type::recommissioned;
+        }
+
         friend std::ostream& operator<<(std::ostream&, const node_update&);
     };
 

From db0b129b70d9bf3cde24ff9b3304233670931717 Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Wed, 29 Jun 2022 12:43:18 +0200
Subject: [PATCH 064/201] c/members_backend: handle node recommissioning

Added handling of the `recommission` command in members backend. Node
recommissioning cancels an ongoing decommissioning process. It stops all
the partition movements from the node which were scheduled by the
decommissioning process and prevents the node from being removed from
the cluster.

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/cluster/members_backend.cc | 62 ++++++++++++++++++++++++++++++--
 src/v/cluster/members_backend.h  |  3 ++
 2 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/src/v/cluster/members_backend.cc b/src/v/cluster/members_backend.cc
index cab3bb272c542..881316eb4bb45 100644
--- a/src/v/cluster/members_backend.cc
+++ b/src/v/cluster/members_backend.cc
@@ -118,9 +118,9 @@ void members_backend::handle_single_update(
     vlog(clusterlog.debug, "membership update received: {}", update);
     switch (update.type) {
     case update_t::recommissioned:
-        // if node was recommissioned simply remove all decommissioning
-        // updates
         handle_recommissioned(update);
+        _updates.emplace_back(update);
+        _new_updates.signal();
         return;
     case update_t::reallocation_finished:
         handle_reallocation_finished(update.id);
@@ -173,6 +173,9 @@ void members_backend::calculate_reallocations(update_meta& meta) {
     case members_manager::node_update_type::added:
         calculate_reallocations_after_node_added(meta);
         return;
+    case members_manager::node_update_type::recommissioned:
+        calculate_reallocations_after_recommissioned(meta);
+        return;
     default:
         return;
     }
@@ -333,6 +336,61 @@ void members_backend::calculate_reallocations_after_node_added(
     }
 }
 
+std::vector<model::ntp> members_backend::ntps_moving_from_node_older_than(
+  model::node_id node, model::revision_id revision) const {
+    std::vector<model::ntp> ret;
+
+    for (const auto& [ntp, state] : _topics.local().in_progress_updates()) {
+        if (state.update_revision < revision) {
+            continue;
+        }
+        if (!contains_node(state.previous_replicas, node)) {
+            continue;
+        }
+
+        auto current_assignment = _topics.local().get_partition_assignment(ntp);
+        if (unlikely(!current_assignment)) {
+            continue;
+        }
+
+        if (!contains_node(current_assignment->replicas, node)) {
+            ret.push_back(ntp);
+        }
+    }
+    return ret;
+}
+
+void members_backend::calculate_reallocations_after_recommissioned(
+  update_meta& meta) const {
+    auto it = _decommission_command_revision.find(meta.update.id);
+    vassert(
+      it != _decommission_command_revision.end(),
+      "members backend should hold a revision of nodes being decommissioned, "
+      "node_id: {}",
+      meta.update.id);
+    auto ntps = ntps_moving_from_node_older_than(meta.update.id, it->second);
+    // reallocate all partitions for which any of replicas is placed on
+    // decommissioned node
+    meta.partition_reallocations.reserve(ntps.size());
+    for (auto& ntp : ntps) {
+        partition_reallocation reallocation(ntp);
+        reallocation.state = reallocation_state::request_cancel;
+        auto current_assignment = _topics.local().get_partition_assignment(ntp);
+        auto previous_replica_set = _topics.local().get_previous_replica_set(
+          ntp);
+        if (
+          !current_assignment.has_value()
+          || !previous_replica_set.has_value()) {
+            continue;
+        }
+        reallocation.current_replica_set = std::move(
+          current_assignment->replicas);
+        reallocation.new_replica_set = std::move(*previous_replica_set);
+
+        meta.partition_reallocations.push_back(std::move(reallocation));
+    }
+}
+
 ss::future<> members_backend::reconcile() {
     // if nothing to do, wait
     co_await _new_updates.wait([this] { return !_updates.empty(); });
diff --git a/src/v/cluster/members_backend.h b/src/v/cluster/members_backend.h
index b805e273a9213..918ac77423619 100644
--- a/src/v/cluster/members_backend.h
+++ b/src/v/cluster/members_backend.h
@@ -99,6 +99,9 @@ class members_backend {
     void reassign_replicas(partition_assignment&, partition_reallocation&);
     void calculate_reallocations_after_node_added(update_meta&) const;
     void calculate_reallocations_after_decommissioned(update_meta&) const;
+    void calculate_reallocations_after_recommissioned(update_meta&) const;
+    std::vector<model::ntp> ntps_moving_from_node_older_than(
+      model::node_id, model::revision_id) const;
     void setup_metrics();
     ss::sharded<topics_frontend>& _topics_frontend;
     ss::sharded<topic_table>& _topics;

From 79af6b1e5881aadd9c7c2dcce9aa875c7dbe1d64 Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Tue, 12 Jul 2022 11:10:46 +0200
Subject: [PATCH 065/201] c/topics_table: store revision of partition replicas

When reverting a partition operation we cannot use the revision of the
operation canceling a move. When the configuration update that is being
canceled removes a replica from the partition replica set, reverting
that operation adds the replica back to the partition configuration. In
order for the configurations to be equivalent, we must use exactly the
same revision which was used by the replica that was removed.

Replica revision information is stored in raft configurations, however
relying on the raft configurations themselves is impossible as they may
get deleted or truncated (as a part of raft operations). Hence the need
to store replica node revisions in the topic table.

This PR introduces a map that stores per partition replica revisions.
The state is built in memory from existing information applied to the
topics table.

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/cluster/partition_balancer_planner.cc |  4 +-
 src/v/cluster/topic_table.cc                | 86 ++++++++++++++++-----
 src/v/cluster/topic_table.h                 | 49 +++++++++++-
 src/v/kafka/server/handlers/metadata.cc     |  2 +-
 4 files changed, 116 insertions(+), 25 deletions(-)

diff --git a/src/v/cluster/partition_balancer_planner.cc b/src/v/cluster/partition_balancer_planner.cc
index cf77d6117eb05..bdbb73a280365 100644
--- a/src/v/cluster/partition_balancer_planner.cc
+++ b/src/v/cluster/partition_balancer_planner.cc
@@ -305,7 +305,7 @@ void partition_balancer_planner::get_unavailable_nodes_reassignments(
                     continue;
                 }
                 auto new_allocation_units = get_reallocation(
-                  a, t.second, partition_size.value(), false, rrs);
+                  a, t.second.metadata, partition_size.value(), false, rrs);
                 if (new_allocation_units) {
                     result.reassignments.emplace_back(ntp_reassignments{
                       .ntp = ntp,
@@ -384,7 +384,7 @@ void partition_balancer_planner::get_full_node_reassignments(
             }
             auto new_allocation_units = get_reallocation(
               *current_assignments,
-              topic_metadata,
+              topic_metadata.metadata,
               ntp_size_it->first,
               true,
               rrs);
diff --git a/src/v/cluster/topic_table.cc b/src/v/cluster/topic_table.cc
index 9c4fc1f9432c8..9ad44ca5efceb 100644
--- a/src/v/cluster/topic_table.cc
+++ b/src/v/cluster/topic_table.cc
@@ -25,16 +25,16 @@
 namespace cluster {
 
 template<typename Func>
-std::vector<std::invoke_result_t<Func, const topic_metadata&>>
+std::vector<std::invoke_result_t<Func, const topic_table::topic_metadata_item&>>
 topic_table::transform_topics(Func&& f) const {
-    std::vector<std::invoke_result_t<Func, const topic_metadata>> ret;
+    std::vector<std::invoke_result_t<Func, const topic_metadata_item&>> ret;
     ret.reserve(_topics.size());
     std::transform(
       std::cbegin(_topics),
       std::cend(_topics),
       std::back_inserter(ret),
       [f = std::forward<Func>(f)](
-        const std::pair<model::topic_namespace, topic_metadata>& p) {
+        const std::pair<model::topic_namespace, topic_metadata_item>& p) {
           return f(p.second);
       });
     return ret;
@@ -47,22 +47,30 @@ topic_table::apply(create_topic_cmd cmd, model::offset offset) {
         return ss::make_ready_future<std::error_code>(
           errc::topic_already_exists);
     }
-    // calculate delta
-    for (auto& pas : cmd.value.assignments) {
-        auto ntp = model::ntp(cmd.key.ns, cmd.key.tp, pas.id);
-        _pending_deltas.emplace_back(
-          std::move(ntp), pas, offset, delta::op_type::add);
-    }
 
     std::optional<model::initial_revision_id> remote_revision
       = cmd.value.cfg.properties.remote_topic_properties ? std::make_optional(
           cmd.value.cfg.properties.remote_topic_properties->remote_revision)
                                                          : std::nullopt;
+    auto md = topic_metadata_item{
+      .metadata = topic_metadata(
+        std::move(cmd.value), model::revision_id(offset()), remote_revision)};
+    // calculate delta
+    md.replica_revisions.reserve(cmd.value.assignments.size());
+    for (auto& pas : md.get_assignments()) {
+        auto ntp = model::ntp(cmd.key.ns, cmd.key.tp, pas.id);
+        for (auto& r : pas.replicas) {
+            md.replica_revisions[pas.id][r.node_id] = model::revision_id(
+              offset);
+        }
+        _pending_deltas.emplace_back(
+          std::move(ntp), pas, offset, delta::op_type::add);
+    }
 
-    _topics.insert(
-      {cmd.key,
-       topic_metadata(
-         std::move(cmd.value), model::revision_id(offset()), remote_revision)});
+    _topics.insert({
+      cmd.key,
+      std::move(md),
+    });
     notify_waiters();
     return ss::make_ready_future<std::error_code>(errc::success);
 }
@@ -132,6 +140,10 @@ topic_table::apply(create_partition_cmd cmd, model::offset offset) {
         tp->second.get_assignments().emplace(p_as);
         // propagate deltas
         auto ntp = model::ntp(cmd.key.ns, cmd.key.tp, p_as.id);
+        for (auto& bs : p_as.replicas) {
+            tp->second.replica_revisions[p_as.id][bs.node_id]
+              = model::revision_id(offset);
+        }
         _pending_deltas.emplace_back(
           std::move(ntp), std::move(p_as), offset, delta::op_type::add);
     }
@@ -169,6 +181,11 @@ topic_table::apply(move_partition_replicas_cmd cmd, model::offset o) {
     if (are_replica_sets_equal(current_assignment_it->replicas, cmd.value)) {
         return ss::make_ready_future<std::error_code>(errc::success);
     }
+    auto revisions_it = tp->second.replica_revisions.find(cmd.key.tp.partition);
+    vassert(
+      revisions_it != tp->second.replica_revisions.end(),
+      "partition {}, replica revisions map must exists as partition is present",
+      cmd.key);
 
     _updates_in_progress.emplace(
       cmd.key,
@@ -176,10 +193,29 @@ topic_table::apply(move_partition_replicas_cmd cmd, model::offset o) {
         .previous_replicas = current_assignment_it->replicas,
         .state = in_progress_state::update_requested,
         .update_revision = model::revision_id(o),
+        // snapshot replicas revisions
+        .replicas_revisions = revisions_it->second,
       });
     auto previous_assignment = *current_assignment_it;
     // replace partition replica set
     current_assignment_it->replicas = cmd.value;
+    /**
+     * Update partition replica revisions. Assign new revision to added replicas
+     * and erase replicas which are removed from replica set
+     */
+    auto added_replicas = subtract_replica_sets(
+      current_assignment_it->replicas, previous_assignment.replicas);
+
+    for (auto& r : added_replicas) {
+        revisions_it->second[r.node_id] = model::revision_id(o);
+    }
+
+    auto removed_replicas = subtract_replica_sets(
+      previous_assignment.replicas, current_assignment_it->replicas);
+
+    for (auto& removed : removed_replicas) {
+        revisions_it->second.erase(removed.node_id);
+    }
 
     /// Update all non_replicable topics to have the same 'in-progress' state
     auto found = _topics_hierarchy.find(model::topic_namespace_view(cmd.key));
@@ -352,6 +388,13 @@ topic_table::apply(cancel_moving_partition_replicas_cmd cmd, model::offset o) {
     auto replicas = current_assignment_it->replicas;
     // replace replica set with set from in progress operation
     current_assignment_it->replicas = in_progress_it->second.previous_replicas;
+    auto revisions_it = tp->second.replica_revisions.find(cmd.key.tp.partition);
+    vassert(
+      revisions_it != tp->second.replica_revisions.end(),
+      "partition {} replica revisions map must exists",
+      cmd.key);
+
+    revisions_it->second = in_progress_it->second.replicas_revisions;
 
     /// Update all non_replicable topics to have the same 'in-progress' state
     auto found = _topics_hierarchy.find(model::topic_namespace_view(cmd.key));
@@ -555,10 +598,14 @@ topic_table::apply(create_non_replicable_topic_cmd cmd, model::offset o) {
           success,
           "Duplicate non_replicable_topic detected when it shouldn't exist");
     }
+    auto md = topic_metadata(
+      std::move(cfg), std::move(p_as), model::revision_id(o()), source.tp);
+
     _topics.insert(
       {new_non_rep_topic,
-       topic_metadata(
-         std::move(cfg), std::move(p_as), model::revision_id(o()), source.tp)});
+       topic_metadata_item{
+         .metadata = std::move(md),
+       }});
     notify_waiters();
     co_return make_error_code(errc::success);
 }
@@ -626,8 +673,9 @@ topic_table::wait_for_changes(ss::abort_source& as) {
 }
 
 std::vector<model::topic_namespace> topic_table::all_topics() const {
-    return transform_topics(
-      [](const topic_metadata& tp) { return tp.get_configuration().tp_ns; });
+    return transform_topics([](const topic_metadata_item& tp) {
+        return tp.get_configuration().tp_ns;
+    });
 }
 
 size_t topic_table::all_topics_count() const { return _topics.size(); }
@@ -635,14 +683,14 @@ size_t topic_table::all_topics_count() const { return _topics.size(); }
 std::optional<topic_metadata>
 topic_table::get_topic_metadata(model::topic_namespace_view tp) const {
     if (auto it = _topics.find(tp); it != _topics.end()) {
-        return it->second;
+        return it->second.metadata;
     }
     return {};
 }
 std::optional<std::reference_wrapper<const topic_metadata>>
 topic_table::get_topic_metadata_ref(model::topic_namespace_view tp) const {
     if (auto it = _topics.find(tp); it != _topics.end()) {
-        return it->second;
+        return it->second.metadata;
     }
     return {};
 }
diff --git a/src/v/cluster/topic_table.h b/src/v/cluster/topic_table.h
index a3cd4b8b83a41..0e6f7dd9c8692 100644
--- a/src/v/cluster/topic_table.h
+++ b/src/v/cluster/topic_table.h
@@ -41,17 +41,60 @@ class topic_table {
         cancel_requested,
         force_cancel_requested
     };
+    /**
+     * Replicas revision map is used to track revision of brokers in a replica
+     * set. When a node is added into replica set it gets the revision assigned
+     */
+    using replicas_revision_map
+      = absl::flat_hash_map<model::node_id, model::revision_id>;
 
     struct in_progress_update {
         std::vector<model::broker_shard> previous_replicas;
         in_progress_state state;
         model::revision_id update_revision;
+        replicas_revision_map replicas_revisions;
     };
+
+    struct topic_metadata_item {
+        topic_metadata metadata;
+        // replicas revisions for each partition
+        absl::node_hash_map<model::partition_id, replicas_revision_map>
+          replica_revisions;
+
+        bool is_topic_replicable() const {
+            return metadata.is_topic_replicable();
+        }
+
+        assignments_set& get_assignments() {
+            return metadata.get_assignments();
+        }
+
+        const assignments_set& get_assignments() const {
+            return metadata.get_assignments();
+        }
+        model::revision_id get_revision() const {
+            return metadata.get_revision();
+        }
+        std::optional<model::initial_revision_id> get_remote_revision() const {
+            return metadata.get_remote_revision();
+        }
+        const model::topic& get_source_topic() const {
+            return metadata.get_source_topic();
+        }
+
+        const topic_configuration& get_configuration() const {
+            return metadata.get_configuration();
+        }
+        topic_configuration& get_configuration() {
+            return metadata.get_configuration();
+        }
+    };
+
     using delta = topic_table_delta;
 
-    using underlying_t = absl::flat_hash_map<
+    using underlying_t = absl::node_hash_map<
       model::topic_namespace,
-      topic_metadata,
+      topic_metadata_item,
       model::topic_namespace_hash,
       model::topic_namespace_eq>;
     using hierarchy_t = absl::node_hash_map<
@@ -238,7 +281,7 @@ class topic_table {
     void notify_waiters();
 
     template<typename Func>
-    std::vector<std::invoke_result_t<Func, const topic_metadata&>>
+    std::vector<std::invoke_result_t<Func, const topic_metadata_item&>>
     transform_topics(Func&&) const;
 
     underlying_t _topics;
diff --git a/src/v/kafka/server/handlers/metadata.cc b/src/v/kafka/server/handlers/metadata.cc
index a372d5b6a4fec..43813d0e5931a 100644
--- a/src/v/kafka/server/handlers/metadata.cc
+++ b/src/v/kafka/server/handlers/metadata.cc
@@ -229,7 +229,7 @@ get_topic_metadata(request_context& ctx, metadata_request& request) {
                   authz_quiet{true})) {
                 continue;
             }
-            res.push_back(make_topic_response(ctx, request, md));
+            res.push_back(make_topic_response(ctx, request, md.metadata));
         }
 
         return ss::make_ready_future<std::vector<metadata_response::topic>>(

From 6d21be64a236d76dc94d79013ffa7af5dddf5928 Mon Sep 17 00:00:00 2001
From: Elena Anyusheva <lena@vectorized.io>
Date: Tue, 12 Jul 2022 16:03:49 +0200
Subject: [PATCH 066/201] ducky: add e2e test for read replica

This test creates the original topic, enables remote.write for it, waits
until the data is in S3, then creates another cluster, creates a read replica
topic there and consumes the data from the S3-backed topic.
---
 tests/rptest/services/redpanda.py           |   6 +-
 tests/rptest/tests/read_replica_e2e_test.py | 133 ++++++++++++++++++++
 2 files changed, 137 insertions(+), 2 deletions(-)
 create mode 100644 tests/rptest/tests/read_replica_e2e_test.py

diff --git a/tests/rptest/services/redpanda.py b/tests/rptest/services/redpanda.py
index 864baed73c64c..a184cd6f4fd65 100644
--- a/tests/rptest/services/redpanda.py
+++ b/tests/rptest/services/redpanda.py
@@ -578,7 +578,7 @@ def get_node_memory_mb(self):
             memory_kb = int(line.strip().split()[1])
             return memory_kb / 1024
 
-    def start(self, nodes=None, clean_nodes=True):
+    def start(self, nodes=None, clean_nodes=True, start_si=True):
         """Start the service on all nodes."""
         to_start = nodes if nodes is not None else self.nodes
         assert all((node in self.nodes for node in to_start))
@@ -653,7 +653,7 @@ def start(self, nodes=None, clean_nodes=True):
                                          request_timeout_ms=30000,
                                          api_version_auto_timeout_ms=3000)
 
-        if self._si_settings is not None:
+        if start_si and self._si_settings is not None:
             self.start_si()
 
     def write_tls_certs(self):
@@ -931,6 +931,8 @@ def start_si(self):
             logger=self.logger,
         )
 
+        self.logger.debug(
+            f"Creating S3 bucket: {self._si_settings.cloud_storage_bucket}")
         self._s3client.create_bucket(self._si_settings.cloud_storage_bucket)
 
     def list_buckets(self) -> dict[str, Union[list, dict]]:
diff --git a/tests/rptest/tests/read_replica_e2e_test.py b/tests/rptest/tests/read_replica_e2e_test.py
new file mode 100644
index 0000000000000..651d8e1c26abb
--- /dev/null
+++ b/tests/rptest/tests/read_replica_e2e_test.py
@@ -0,0 +1,133 @@
+# Copyright 2022 Redpanda Data, Inc.
+#
+# Use of this software is governed by the Business Source License
+# included in the file licenses/BSL.md
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0
+from rptest.services.cluster import cluster
+
+from rptest.clients.default import DefaultClient
+from rptest.services.redpanda import SISettings
+from rptest.clients.rpk import RpkTool
+from rptest.clients.types import TopicSpec
+from ducktape.mark import matrix
+
+import json
+
+from rptest.services.redpanda import RedpandaService
+from rptest.tests.end_to_end import EndToEndTest
+from rptest.services.verifiable_producer import VerifiableProducer, is_int_with_prefix
+from rptest.services.verifiable_consumer import VerifiableConsumer
+from rptest.util import (
+    wait_until, )
+
+
+class TestReadReplicaService(EndToEndTest):
+    log_segment_size = 1048576  # 1MiB
+    topic_name = "panda-topic"
+    s3_bucket_name = "panda-bucket"
+    si_settings = SISettings(
+        cloud_storage_bucket=s3_bucket_name,
+        cloud_storage_reconciliation_interval_ms=500,
+        cloud_storage_max_connections=5,
+        log_segment_size=log_segment_size,
+        cloud_storage_readreplica_manifest_sync_timeout_ms=500,
+        cloud_storage_segment_max_upload_interval_sec=5)
+
+    def __init__(self, test_context):
+        super(TestReadReplicaService, self).__init__(test_context=test_context)
+        self.second_cluster = None
+
+    def create_read_replica_topic(self):
+        self.second_cluster = RedpandaService(self.test_context,
+                                              num_brokers=3,
+                                              si_settings=self.si_settings)
+        self.second_cluster.start(start_si=False)
+
+        rpk_second_cluster = RpkTool(self.second_cluster)
+        conf = {
+            'redpanda.remote.readreplica': 'true',
+            'redpanda.remote.readreplica.bucket': self.s3_bucket_name,
+        }
+        rpk_second_cluster.create_topic(self.topic_name, config=conf)
+
+    def start_consumer(self):
+        self.consumer = VerifiableConsumer(
+            self.test_context,
+            num_nodes=1,
+            redpanda=self.second_cluster,
+            topic=self.topic_name,
+            group_id='consumer_test_group',
+            on_record_consumed=self.on_record_consumed)
+        self.consumer.start()
+
+    def start_producer(self):
+        self.producer = VerifiableProducer(
+            self.test_context,
+            num_nodes=1,
+            redpanda=self.redpanda,
+            topic=self.topic_name,
+            throughput=1000,
+            message_validator=is_int_with_prefix)
+        self.producer.start()
+
+    @cluster(num_nodes=8)
+    @matrix(partition_count=[10], min_records=[10000])
+    def test_simple_end_to_end(self, partition_count, min_records):
+        # Create original topic, produce data to it
+        self.start_redpanda(3, si_settings=self.si_settings)
+        spec = TopicSpec(name=self.topic_name,
+                         partition_count=partition_count,
+                         replication_factor=3)
+
+        DefaultClient(self.redpanda).create_topic(spec)
+
+        self.start_producer()
+        wait_until(lambda: self.producer.num_acked > min_records,
+                       timeout_sec=30,
+                       err_msg="Producer failed to produce messages for %ds." %\
+                       30)
+        self.logger.info("Stopping producer after writing up to offsets %s" %\
+                        str(self.producer.last_acked_offsets))
+        self.producer.stop()
+
+        # Make original topic upload data to S3
+        rpk = RpkTool(self.redpanda)
+        rpk.alter_topic_config(spec.name, 'redpanda.remote.write', 'true')
+
+        # Make sure all produced data is uploaded to S3
+        def s3_has_all_data():
+            objects = list(
+                self.redpanda._s3client.list_objects(self.s3_bucket_name))
+            total_uploaded = 0
+            for o in objects:
+                if o.Key.endswith(
+                        "/manifest.json") and self.topic_name in o.Key:
+                    data = self.redpanda._s3client.get_object_data(
+                        self.s3_bucket_name, o.Key)
+                    manifest = json.loads(data)
+                    last_upl_offset = manifest['last_offset']
+                    total_uploaded += last_upl_offset
+                    self.logger.info(
+                        f"Found manifest at {o.Key}, last_offset is {last_upl_offset}"
+                    )
+            self.logger.info(
+                f"Total uploaded: {total_uploaded}, num_acked: {self.producer.num_acked}"
+            )
+            return total_uploaded >= self.producer.num_acked
+
+        wait_until(
+            s3_has_all_data,
+            timeout_sec=
+            30,  #should be uploaded since cloud_storage_segment_max_upload_interval_sec=5
+            backoff_sec=5,
+            err_msg=
+            f"Not all data is uploaded to S3 bucket, is S3 bucket: {list(self.redpanda._s3client.list_objects(self.s3_bucket_name))}"
+        )
+
+        # Create read replica topic, consume from it and validate
+        self.create_read_replica_topic()
+        self.start_consumer()
+        self.run_validation()

From f6e39b5689f5ca1c897d13e42e1e32b5bbf1e414 Mon Sep 17 00:00:00 2001
From: Elena Anyusheva <lena@vectorized.io>
Date: Tue, 12 Jul 2022 17:49:14 +0200
Subject: [PATCH 067/201] test: delete kafka/create_topic test for read replica

Since we now have an e2e test for read replica, this test no longer
belongs here. The test cases for creating a read replica with an invalid
config are kept, since request validation happens in the Kafka layer.
---
 src/v/kafka/server/tests/CMakeLists.txt       |   1 -
 .../kafka/server/tests/create_topics_test.cc  |  65 +------
 .../kafka/server/tests/s3_imposter_fixture.cc | 171 ------------------
 .../kafka/server/tests/s3_imposter_fixture.h  |  93 ----------
 4 files changed, 4 insertions(+), 326 deletions(-)
 delete mode 100644 src/v/kafka/server/tests/s3_imposter_fixture.cc
 delete mode 100644 src/v/kafka/server/tests/s3_imposter_fixture.h

diff --git a/src/v/kafka/server/tests/CMakeLists.txt b/src/v/kafka/server/tests/CMakeLists.txt
index 875fc931d94be..3630ba7e10ff6 100644
--- a/src/v/kafka/server/tests/CMakeLists.txt
+++ b/src/v/kafka/server/tests/CMakeLists.txt
@@ -14,7 +14,6 @@ rp_test(
 )
 
 set(srcs
-  s3_imposter_fixture.cc
   consumer_groups_test.cc
   member_test.cc
   group_test.cc
diff --git a/src/v/kafka/server/tests/create_topics_test.cc b/src/v/kafka/server/tests/create_topics_test.cc
index afb0532b41eec..8aac8de3fbf7a 100644
--- a/src/v/kafka/server/tests/create_topics_test.cc
+++ b/src/v/kafka/server/tests/create_topics_test.cc
@@ -12,23 +12,16 @@
 #include "kafka/server/handlers/topics/types.h"
 #include "redpanda/tests/fixture.h"
 #include "resource_mgmt/io_priority.h"
-#include "s3_imposter_fixture.h"
 
 #include <seastar/core/smp.hh>
 #include <seastar/core/sstring.hh>
 
 #include <algorithm>
 #include <limits>
-#include <optional>
-
-inline ss::logger test_log("test"); // NOLINT
 
 // rougly equivalent to the test harness:
 //   https://github.com/apache/kafka/blob/8e16158/core/src/test/scala/unit/kafka/server/AbstractCreateTopicsRequestTest.scala
-class create_topic_fixture
-  : public s3_imposter_fixture
-  , public enable_cloud_storage_fixture
-  , public redpanda_thread_fixture {
+class create_topic_fixture : public redpanda_thread_fixture {
 public:
     kafka::create_topics_request make_req(
       std::vector<kafka::creatable_topic> topics, bool validate_only = false) {
@@ -97,18 +90,11 @@ class create_topic_fixture
 
     void test_create_topic(
       kafka::create_topics_request req,
-      std::optional<int> partition_count = std::nullopt,
-      std::optional<int> revision_id = std::nullopt,
       kafka::api_version version = kafka::api_version(2)) {
         auto client = make_kafka_client().get0();
         client.connect().get();
         auto resp = client.dispatch(req, version).get0();
 
-        // todo: here
-        for (auto req : get_requests()) {
-            vlog(test_log.info, "{} {}", req._method, req._url);
-        }
-
         BOOST_REQUIRE_MESSAGE(
           std::all_of(
             std::cbegin(resp.data.topics),
@@ -119,7 +105,7 @@ class create_topic_fixture
           fmt::format("expected no errors. received response: {}", resp));
 
         for (auto& topic : req.data.topics) {
-            verify_metadata(client, req, topic, partition_count, revision_id);
+            verify_metadata(client, req, topic);
 
             auto it = std::find_if(
               resp.data.topics.begin(),
@@ -140,11 +126,6 @@ class create_topic_fixture
         client.stop().then([&client] { client.shutdown(); }).get();
     }
 
-    void test_create_read_replica_topic(
-      kafka::create_topics_request req, int partition_count, int revision_id) {
-        test_create_topic(req, partition_count, revision_id);
-    }
-
     void verify_response(
       const kafka::creatable_topic& req,
       const kafka::creatable_topic_result& topic_res,
@@ -223,9 +204,7 @@ class create_topic_fixture
     void verify_metadata(
       kafka::client::transport& client,
       kafka::create_topics_request& create_req,
-      kafka::creatable_topic& request_topic,
-      std::optional<int> partition_count = std::nullopt,
-      std::optional<int> revision_id = std::nullopt) {
+      kafka::creatable_topic& request_topic) {
         // query the server for this topic's metadata
         kafka::metadata_request metadata_req;
         metadata_req.data.topics
@@ -248,9 +227,7 @@ class create_topic_fixture
           "expected topic not returned from metadata query");
 
         int partitions;
-        if (partition_count) {
-            partitions = partition_count.value();
-        } else if (!request_topic.assignments.empty()) {
+        if (!request_topic.assignments.empty()) {
             partitions = request_topic.assignments.size();
         } else {
             partitions = request_topic.num_partitions;
@@ -375,38 +352,6 @@ FIXTURE_TEST(create_non_replicable_topics, create_topic_fixture) {
     BOOST_CHECK(resp[1].tp_ns.tp() == "topic2");
 }
 
-FIXTURE_TEST(read_replica, create_topic_fixture) {
-    ss::sstring manifest_url = ssx::sformat(
-      "/f0000000/meta/kafka/test-topic/topic_manifest.json");
-
-    std::string_view manifest_payload = R"json({
-        "version": 1,
-        "namespace": "kafka",
-        "topic": "test-topic",
-        "partition_count": 32,
-        "replication_factor": 3,
-        "revision_id": 10,
-        "compression": null,
-        "cleanup_policy_bitflags": null,
-        "compaction_strategy": null,
-        "timestamp_type": null,
-        "segment_size": null
-    })json";
-
-    set_expectations_and_listen({expectation{
-      .url = manifest_url, .body = ss::sstring(manifest_payload)}});
-
-    auto topic = make_topic(
-      "test-topic",
-      std::nullopt,
-      std::nullopt,
-      std::map<ss::sstring, ss::sstring>{
-        {"redpanda.remote.readreplica", "true"},
-        {"redpanda.remote.readreplica.bucket", "panda-bucket"}});
-
-    test_create_read_replica_topic(make_req({topic}), 32, 10);
-}
-
 FIXTURE_TEST(s3bucket_is_missing, create_topic_fixture) {
     auto topic = make_topic(
       "topic1",
@@ -494,7 +439,5 @@ FIXTURE_TEST(test_v5_validate_configs_resp, create_topic_fixture) {
         {make_topic("topicC", 3, 1, config_map),
          make_topic("topicD", 3, 1, config_map)},
         false),
-      std::nullopt,
-      std::nullopt,
       kafka::api_version(5));
 }
diff --git a/src/v/kafka/server/tests/s3_imposter_fixture.cc b/src/v/kafka/server/tests/s3_imposter_fixture.cc
deleted file mode 100644
index 5a45621dfa6f8..0000000000000
--- a/src/v/kafka/server/tests/s3_imposter_fixture.cc
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright 2022 Redpanda Data, Inc.
- *
- * Licensed as a Redpanda Enterprise file under the Redpanda Community
- * License (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://github.com/redpanda-data/redpanda/blob/master/licenses/rcl.md
- */
-
-#include "s3_imposter_fixture.h"
-
-#include "bytes/iobuf.h"
-#include "bytes/iobuf_parser.h"
-#include "config/configuration.h"
-#include "seastarx.h"
-#include "test_utils/async.h"
-
-#include <seastar/core/coroutine.hh>
-#include <seastar/core/iostream.hh>
-#include <seastar/core/temporary_buffer.hh>
-#include <seastar/http/function_handlers.hh>
-#include <seastar/net/socket_defs.hh>
-#include <seastar/util/defer.hh>
-
-#include <boost/core/noncopyable.hpp>
-#include <boost/property_tree/json_parser.hpp>
-#include <boost/test/tools/old/interface.hpp>
-#include <boost/test/unit_test.hpp>
-
-using namespace std::chrono_literals;
-
-inline ss::logger fixt_log("fixture"); // NOLINT
-
-static constexpr int16_t httpd_port_number = 4430;
-static constexpr const char* httpd_host_name = "127.0.0.1";
-
-s3_imposter_fixture::s3_imposter_fixture() {
-    _server = ss::make_shared<ss::httpd::http_server_control>();
-    _server->start().get();
-    ss::ipv4_addr ip_addr = {httpd_host_name, httpd_port_number};
-    _server_addr = ss::socket_address(ip_addr);
-}
-
-s3_imposter_fixture::~s3_imposter_fixture() { _server->stop().get(); }
-
-const std::vector<ss::httpd::request>&
-s3_imposter_fixture::get_requests() const {
-    return _requests;
-}
-
-const std::multimap<ss::sstring, ss::httpd::request>&
-s3_imposter_fixture::get_targets() const {
-    return _targets;
-}
-
-void s3_imposter_fixture::set_expectations_and_listen(
-  const std::vector<s3_imposter_fixture::expectation>& expectations) {
-    _server
-      ->set_routes([this, &expectations](ss::httpd::routes& r) {
-          set_routes(r, expectations);
-      })
-      .get();
-    _server->listen(_server_addr).get();
-}
-
-void s3_imposter_fixture::set_routes(
-  ss::httpd::routes& r,
-  const std::vector<s3_imposter_fixture::expectation>& expectations) {
-    using namespace ss::httpd;
-    struct content_handler {
-        content_handler(
-          const std::vector<expectation>& exp, s3_imposter_fixture& imp)
-          : fixture(imp) {
-            for (const auto& e : exp) {
-                expectations[e.url] = e;
-            }
-        }
-        ss::sstring handle(const_req request, reply& repl) {
-            static const ss::sstring error_payload
-              = R"xml(<?xml version="1.0" encoding="UTF-8"?>
-                        <Error>
-                            <Code>NoSuchKey</Code>
-                            <Message>Object not found</Message>
-                            <Resource>resource</Resource>
-                            <RequestId>requestid</RequestId>
-                        </Error>)xml";
-            fixture._requests.push_back(request);
-            fixture._targets.insert(std::make_pair(request._url, request));
-            vlog(
-              fixt_log.trace,
-              "S3 imposter request {} - {} - {}",
-              request._url,
-              request.content_length,
-              request._method);
-            if (request._method == "GET") {
-                auto it = expectations.find(request._url);
-                if (it == expectations.end() || !it->second.body.has_value()) {
-                    vlog(fixt_log.trace, "Reply GET request with error");
-                    repl.set_status(reply::status_type::not_found);
-                    return error_payload;
-                }
-                return *it->second.body;
-            } else if (request._method == "PUT") {
-                expectations[request._url] = {
-                  .url = request._url, .body = request.content};
-                return "";
-            } else if (request._method == "DELETE") {
-                auto it = expectations.find(request._url);
-                if (it == expectations.end() || !it->second.body.has_value()) {
-                    vlog(fixt_log.trace, "Reply DELETE request with error");
-                    repl.set_status(reply::status_type::not_found);
-                    return error_payload;
-                }
-                repl.set_status(reply::status_type::no_content);
-                it->second.body = std::nullopt;
-                return "";
-            } else if (request._method == "HEAD") {
-                auto it = expectations.find(request._url);
-                if (it == expectations.end() || !it->second.body.has_value()) {
-                    vlog(fixt_log.trace, "Reply HEAD request with error");
-                    repl.add_header("x-amz-request-id", "placeholder-id");
-                    repl.set_status(reply::status_type::not_found);
-                } else {
-                    repl.add_header("ETag", "placeholder-etag");
-                    repl.add_header(
-                      "Content-Length",
-                      ssx::sformat("{}", it->second.body->size()));
-                    repl.set_status(reply::status_type::ok);
-                }
-                vlog(
-                  fixt_log.trace,
-                  "S3 imposter response: {}",
-                  repl.response_line());
-                return "";
-            }
-            BOOST_FAIL("Unexpected request");
-            return "";
-        }
-        std::map<ss::sstring, expectation> expectations;
-        s3_imposter_fixture& fixture;
-    };
-    auto hd = ss::make_shared<content_handler>(expectations, *this);
-    _handler = std::make_unique<function_handler>(
-      [hd](const_req req, reply& repl) { return hd->handle(req, repl); },
-      "txt");
-    r.add_default_handler(_handler.get());
-}
-
-enable_cloud_storage_fixture::enable_cloud_storage_fixture() {
-    ss::smp::invoke_on_all([]() {
-        auto& cfg = config::shard_local_cfg();
-        cfg.cloud_storage_enabled.set_value(true);
-        cfg.cloud_storage_disable_tls.set_value(true);
-        cfg.cloud_storage_api_endpoint.set_value(
-          std::optional<ss::sstring>{httpd_host_name});
-        cfg.cloud_storage_api_endpoint_port.set_value(httpd_port_number);
-        cfg.cloud_storage_access_key.set_value(
-          std::optional<ss::sstring>{"access-key"});
-        cfg.cloud_storage_secret_key.set_value(
-          std::optional<ss::sstring>{"secret-key"});
-        cfg.cloud_storage_region.set_value(
-          std::optional<ss::sstring>{"us-east-1"});
-        cfg.cloud_storage_bucket.set_value(
-          std::optional<ss::sstring>{"test-bucket"});
-    }).get0();
-}
-
-enable_cloud_storage_fixture::~enable_cloud_storage_fixture() {
-    config::shard_local_cfg().cloud_storage_enabled.set_value(false);
-}
diff --git a/src/v/kafka/server/tests/s3_imposter_fixture.h b/src/v/kafka/server/tests/s3_imposter_fixture.h
deleted file mode 100644
index 73f90fd61ba71..0000000000000
--- a/src/v/kafka/server/tests/s3_imposter_fixture.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright 2022 Redpanda Data, Inc.
- *
- * Licensed as a Redpanda Enterprise file under the Redpanda Community
- * License (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://github.com/redpanda-data/redpanda/blob/master/licenses/rcl.md
- */
-
-#pragma once
-
-#include "seastarx.h"
-#include "ssx/sformat.h"
-
-#include <seastar/core/future.hh>
-#include <seastar/core/shared_ptr.hh>
-#include <seastar/core/sstring.hh>
-#include <seastar/http/httpd.hh>
-
-#include <chrono>
-#include <exception>
-#include <map>
-#include <vector>
-
-// TODO(https://github.com/redpanda-data/redpanda/issues/5240):
-// Move s3_imposter_fixture to the common place and use one implementation
-// throughout the code base
-
-/// Emulates S3 REST API for testing purposes.
-/// The imposter is a simple KV-store that contains a set of expectations.
-/// Expectations are accessible by url via GET, PUT, and DELETE http calls.
-/// Expectations are provided before impster starts to listen. They have
-/// two field - url and optional body. If body is set to nullopt, attemtp
-/// to read it using GET or delete it using DELETE requests will trigger an
-/// http response with error code 404 and xml formatted error message.
-/// If the body of the expectation is set by the user or PUT request it can
-/// be retrieved using the GET request or deleted using the DELETE request.
-class s3_imposter_fixture {
-public:
-    s3_imposter_fixture();
-    ~s3_imposter_fixture();
-
-    s3_imposter_fixture(const s3_imposter_fixture&) = delete;
-    s3_imposter_fixture& operator=(const s3_imposter_fixture&) = delete;
-    s3_imposter_fixture(s3_imposter_fixture&&) = delete;
-    s3_imposter_fixture& operator=(s3_imposter_fixture&&) = delete;
-
-    struct expectation {
-        ss::sstring url;
-        std::optional<ss::sstring> body;
-    };
-
-    /// Set expectaitions on REST API calls that supposed to be made
-    /// Only the requests that described in this call will be possible
-    /// to make. This method can only be called once per test run.
-    ///
-    /// \param expectations is a collection of access points that allow GET,
-    /// PUT, and DELETE requests, each expectation has url and body. The body
-    /// will be returned by GET call if it's set or trigger error if its null.
-    /// The expectations are statefull. If the body of the expectation was set
-    /// to null but there was PUT call that sent some data, subsequent GET call
-    /// will retrieve this data.
-    void
-    set_expectations_and_listen(const std::vector<expectation>& expectations);
-
-    /// Access all http requests ordered by time
-    const std::vector<ss::httpd::request>& get_requests() const;
-
-    /// Access all http requests ordered by target url
-    const std::multimap<ss::sstring, ss::httpd::request>& get_targets() const;
-
-    // static s3::configuration get_configuration();
-
-private:
-    void set_routes(
-      ss::httpd::routes& r, const std::vector<expectation>& expectations);
-
-    ss::socket_address _server_addr;
-    ss::shared_ptr<ss::httpd::http_server_control> _server;
-
-    std::unique_ptr<ss::httpd::handler_base> _handler;
-    /// Contains saved requests
-    std::vector<ss::httpd::request> _requests;
-    /// Contains all accessed target urls
-    std::multimap<ss::sstring, ss::httpd::request> _targets;
-};
-
-class enable_cloud_storage_fixture {
-public:
-    enable_cloud_storage_fixture();
-    ~enable_cloud_storage_fixture();
-};

From b3130f60f48d9e48e2f18c527a988d56a27ae2be Mon Sep 17 00:00:00 2001
From: NyaliaLui <nyalia@redpanda.com>
Date: Fri, 8 Jul 2022 16:41:39 -0400
Subject: [PATCH 068/201] net: add rpc error count to new endpoint

This patch adds rpc error count metrics to the new prometheus endpoint
(exposed at "public_metrics"). The following metrics were added:

* redpanda_rpc_request_errors_total
       * Description: Number of rpc errors
       * Labels: server
       * Aggregation: shard
---
 src/v/net/probes.cc                      | 25 ++++++++++++++++++++++--
 src/v/net/server.cc                      |  8 +++++++-
 src/v/net/server.h                       |  3 +++
 src/v/net/server_probe.h                 |  5 ++++-
 src/v/raft/tests/raft_group_fixture.h    |  1 +
 src/v/redpanda/application.cc            |  5 +++++
 src/v/rpc/test/rpc_integration_fixture.h |  1 +
 7 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/src/v/net/probes.cc b/src/v/net/probes.cc
index 69a226f9a2eac..41aef57e63478 100644
--- a/src/v/net/probes.cc
+++ b/src/v/net/probes.cc
@@ -11,6 +11,7 @@
 #include "net/client_probe.h"
 #include "net/server_probe.h"
 #include "prometheus/prometheus_sanitize.h"
+#include "ssx/metrics.h"
 #include "ssx/sformat.h"
 
 #include <seastar/core/metrics.hh>
@@ -20,13 +21,13 @@
 
 namespace net {
 void server_probe::setup_metrics(
-  ss::metrics::metric_groups& mgs, const char* proto) {
+  ss::metrics::metric_groups& mgs, std::string_view proto) {
     namespace sm = ss::metrics;
     auto aggregate_labels = config::shard_local_cfg().aggregate_metrics()
                               ? std::vector<sm::label>{sm::shard_label}
                               : std::vector<sm::label>{};
     mgs.add_group(
-      prometheus_sanitize::metrics_name(proto),
+      prometheus_sanitize::metrics_name(ss::sstring{proto}),
       {
         sm::make_gauge(
           "active_connections",
@@ -110,6 +111,26 @@ void server_probe::setup_metrics(
       });
 }
 
+void server_probe::setup_public_metrics(
+  ss::metrics::metric_groups& mgs, std::string_view proto) {
+    namespace sm = ss::metrics;
+
+    if (proto.ends_with("_rpc")) {
+        proto.remove_suffix(4);
+    }
+
+    auto server_label = sm::label("server");
+
+    mgs.add_group(
+      "rpc",
+      {sm::make_counter(
+         "request_errors_total",
+         [this] { return _service_errors; },
+         sm::description("Number of rpc errors"),
+         {server_label(proto)})
+         .aggregate({sm::shard_label})});
+}
+
 std::ostream& operator<<(std::ostream& o, const server_probe& p) {
     o << "{"
       << "connects: " << p._connects << ", "
diff --git a/src/v/net/server.cc b/src/v/net/server.cc
index 1f28c2bfd7ae1..956a99d3b9665 100644
--- a/src/v/net/server.cc
+++ b/src/v/net/server.cc
@@ -16,6 +16,7 @@
 #include "rpc/service.h"
 #include "seastar/core/coroutine.hh"
 #include "ssx/future-util.h"
+#include "ssx/metrics.h"
 #include "ssx/sformat.h"
 #include "vassert.h"
 #include "vlog.h"
@@ -31,7 +32,8 @@ namespace net {
 
 server::server(server_configuration c)
   : cfg(std::move(c))
-  , _memory(cfg.max_service_memory_per_core) {}
+  , _memory(cfg.max_service_memory_per_core)
+  , _public_metrics(ssx::metrics::public_metrics_handle) {}
 
 server::server(ss::sharded<server_configuration>* s)
   : server(s->local()) {}
@@ -45,6 +47,10 @@ void server::start() {
         _probe.setup_metrics(_metrics, cfg.name.c_str());
     }
 
+    if (!cfg.disable_public_metrics) {
+        _probe.setup_public_metrics(_public_metrics, cfg.name.c_str());
+    }
+
     if (cfg.connection_rate_bindings) {
         connection_rate_bindings.emplace(cfg.connection_rate_bindings.value());
 
diff --git a/src/v/net/server.h b/src/v/net/server.h
index 6c966d057c713..c7f2474b9ad25 100644
--- a/src/v/net/server.h
+++ b/src/v/net/server.h
@@ -98,6 +98,8 @@ struct server_configuration {
     std::optional<int> tcp_send_buf;
     std::optional<size_t> stream_recv_buf;
     net::metrics_disabled disable_metrics = net::metrics_disabled::no;
+    net::public_metrics_disabled disable_public_metrics
+      = net::public_metrics_disabled::no;
     ss::sstring name;
     std::optional<config_connection_rate_bindings> connection_rate_bindings;
     // we use the same default as seastar for load balancing algorithm
@@ -203,6 +205,7 @@ class server {
     hdr_hist _hist;
     server_probe _probe;
     ss::metrics::metric_groups _metrics;
+    ss::metrics::metric_groups _public_metrics;
 
     std::optional<config_connection_rate_bindings> connection_rate_bindings;
     std::optional<connection_rate<>> _connection_rates;
diff --git a/src/v/net/server_probe.h b/src/v/net/server_probe.h
index 4eabbddb3f9e4..8b45b55813cc1 100644
--- a/src/v/net/server_probe.h
+++ b/src/v/net/server_probe.h
@@ -52,7 +52,10 @@ class server_probe {
 
     void waiting_for_conection_rate() { ++_connections_wait_rate; }
 
-    void setup_metrics(ss::metrics::metric_groups& mgs, const char* name);
+    void setup_metrics(ss::metrics::metric_groups& mgs, std::string_view proto);
+
+    void setup_public_metrics(
+      ss::metrics::metric_groups& mgs, std::string_view proto);
 
 private:
     uint64_t _requests_completed = 0;
diff --git a/src/v/raft/tests/raft_group_fixture.h b/src/v/raft/tests/raft_group_fixture.h
index e69b4d705d6c2..63649e9ff8a87 100644
--- a/src/v/raft/tests/raft_group_fixture.h
+++ b/src/v/raft/tests/raft_group_fixture.h
@@ -184,6 +184,7 @@ struct raft_node {
         scfg.addrs.emplace_back(net::resolve_dns(broker.rpc_address()).get());
         scfg.max_service_memory_per_core = 1024 * 1024 * 1024;
         scfg.disable_metrics = net::metrics_disabled::yes;
+        scfg.disable_public_metrics = net::public_metrics_disabled::yes;
         server.start(std::move(scfg)).get0();
         raft_manager.start().get0();
         raft_manager
diff --git a/src/v/redpanda/application.cc b/src/v/redpanda/application.cc
index 6dba2999a00d8..62835af67a239 100644
--- a/src/v/redpanda/application.cc
+++ b/src/v/redpanda/application.cc
@@ -927,6 +927,8 @@ void application::wire_up_redpanda_services() {
               c.max_service_memory_per_core = memory_groups::rpc_total_memory();
               c.disable_metrics = net::metrics_disabled(
                 config::shard_local_cfg().disable_metrics());
+              c.disable_public_metrics = net::public_metrics_disabled(
+                config::shard_local_cfg().disable_public_metrics());
               c.listen_backlog
                 = config::shard_local_cfg().rpc_server_listen_backlog;
               c.tcp_recv_buf
@@ -1116,6 +1118,8 @@ void application::wire_up_redpanda_services() {
 
               c.disable_metrics = net::metrics_disabled(
                 config::shard_local_cfg().disable_metrics());
+              c.disable_public_metrics = net::public_metrics_disabled(
+                config::shard_local_cfg().disable_public_metrics());
 
               net::config_connection_rate_bindings bindings{
                 .config_general_rate
@@ -1287,6 +1291,7 @@ void application::start_redpanda(::stop_signal& app_signal) {
           if (!config::shard_local_cfg().disable_metrics()) {
               proto->setup_metrics();
           }
+
           s.set_protocol(std::move(proto));
       })
       .get();
diff --git a/src/v/rpc/test/rpc_integration_fixture.h b/src/v/rpc/test/rpc_integration_fixture.h
index 40bd817d3bafc..81d3c4976e5ec 100644
--- a/src/v/rpc/test/rpc_integration_fixture.h
+++ b/src/v/rpc/test/rpc_integration_fixture.h
@@ -140,6 +140,7 @@ class rpc_sharded_integration_fixture : public rpc_base_integration_fixture {
       ss::tls::reload_callback&& cb = {}) override {
         net::server_configuration scfg("unit_test_rpc_sharded");
         scfg.disable_metrics = net::metrics_disabled::yes;
+        scfg.disable_public_metrics = net::public_metrics_disabled::yes;
         auto resolved = net::resolve_dns(_listen_address).get();
         scfg.addrs.emplace_back(
           resolved,

From ebe3fa2ae8faf51e6f234d4d49720e10fb7af8a9 Mon Sep 17 00:00:00 2001
From: NyaliaLui <nyalia@redpanda.com>
Date: Fri, 8 Jul 2022 16:46:11 -0400
Subject: [PATCH 069/201] rpc: add latency measurements to the new endpoint

This patch adds rpc latency metrics to the new prometheus endpoint
(exposed at "public_metrics"). The following metrics were added:

* redpanda_rpc_request_latency_seconds
  * Description: Internal and Kafka RPC latency
  * Labels: server
  * Aggregation: shard
---
 src/v/net/server.cc | 25 +++++++++++++++++++++++++
 src/v/net/server.h  |  1 +
 2 files changed, 26 insertions(+)

diff --git a/src/v/net/server.cc b/src/v/net/server.cc
index 956a99d3b9665..7cce70e67fb48 100644
--- a/src/v/net/server.cc
+++ b/src/v/net/server.cc
@@ -48,6 +48,7 @@ void server::start() {
     }
 
     if (!cfg.disable_public_metrics) {
+        setup_public_metrics();
         _probe.setup_public_metrics(_public_metrics, cfg.name.c_str());
     }
 
@@ -324,6 +325,30 @@ void server::setup_metrics() {
          sm::description(ssx::sformat("{}: Latency ", cfg.name)))});
 }
 
+void server::setup_public_metrics() {
+    namespace sm = ss::metrics;
+    if (!_proto) {
+        return;
+    }
+
+    std::string_view server_name(cfg.name);
+
+    if (server_name.ends_with("_rpc")) {
+        server_name.remove_suffix(4);
+    }
+
+    auto server_label = sm::label("server");
+
+    _public_metrics.add_group(
+      prometheus_sanitize::metrics_name("rpc:request"),
+      {sm::make_histogram(
+         "latency_seconds",
+         sm::description("RPC latency"),
+         {server_label(server_name)},
+         [this] { return ssx::metrics::report_default_histogram(_hist); })
+         .aggregate({sm::shard_label})});
+}
+
 std::ostream& operator<<(std::ostream& o, const server_configuration& c) {
     o << "{";
     for (auto& a : c.addrs) {
diff --git a/src/v/net/server.h b/src/v/net/server.h
index c7f2474b9ad25..6c88cccadc421 100644
--- a/src/v/net/server.h
+++ b/src/v/net/server.h
@@ -195,6 +195,7 @@ class server {
     friend resources;
     ss::future<> accept(listener&);
     void setup_metrics();
+    void setup_public_metrics();
 
     std::unique_ptr<protocol> _proto;
     ss::semaphore _memory;

From 3e89e4f7ea29f646590415402af4b8dd78413f46 Mon Sep 17 00:00:00 2001
From: Elena Anyusheva <lena@vectorized.io>
Date: Wed, 13 Jul 2022 13:36:49 +0200
Subject: [PATCH 070/201] kafka: merge remote.readreplica and bucket

The change is UX-only: the feature implementation and internal types
do not change.
---
 src/v/kafka/server/handlers/create_topics.cc  |  9 ++--
 src/v/kafka/server/handlers/topics/types.cc   | 13 ++---
 src/v/kafka/server/handlers/topics/types.h    |  2 -
 .../kafka/server/handlers/topics/validators.h | 41 ----------------
 .../kafka/server/tests/create_topics_test.cc  | 47 +------------------
 tests/rptest/tests/read_replica_e2e_test.py   |  3 +-
 6 files changed, 10 insertions(+), 105 deletions(-)

diff --git a/src/v/kafka/server/handlers/create_topics.cc b/src/v/kafka/server/handlers/create_topics.cc
index 67b6ee2e667f0..e088df7371a57 100644
--- a/src/v/kafka/server/handlers/create_topics.cc
+++ b/src/v/kafka/server/handlers/create_topics.cc
@@ -33,7 +33,7 @@
 
 namespace kafka {
 
-static constexpr std::array<std::string_view, 12> supported_configs{
+static constexpr std::array<std::string_view, 11> supported_configs{
   topic_property_compression,
   topic_property_cleanup_policy,
   topic_property_timestamp_type,
@@ -44,8 +44,7 @@ static constexpr std::array<std::string_view, 12> supported_configs{
   topic_property_recovery,
   topic_property_remote_write,
   topic_property_remote_read,
-  topic_property_read_replica,
-  topic_property_read_replica_bucket};
+  topic_property_read_replica};
 
 bool is_supported(std::string_view name) {
     return std::any_of(
@@ -65,9 +64,7 @@ using validators = make_validator_types<
   compaction_strategy_validator,
   timestamp_type_validator,
   cleanup_policy_validator,
-  remote_read_and_write_are_not_supported_for_read_replica,
-  s3_bucket_is_required_for_read_replica,
-  s3_bucket_is_supported_only_for_read_replica>;
+  remote_read_and_write_are_not_supported_for_read_replica>;
 
 static std::vector<creatable_topic_configs>
 properties_to_result_configs(config_map_t config_map) {
diff --git a/src/v/kafka/server/handlers/topics/types.cc b/src/v/kafka/server/handlers/topics/types.cc
index 8eda38c9996b6..80594c920795c 100644
--- a/src/v/kafka/server/handlers/topics/types.cc
+++ b/src/v/kafka/server/handlers/topics/types.cc
@@ -152,10 +152,11 @@ to_cluster_type(const creatable_topic& t) {
     cfg.properties.recovery = get_bool_value(
       config_entries, topic_property_recovery);
     cfg.properties.shadow_indexing = get_shadow_indexing_mode(config_entries);
-    cfg.properties.read_replica = get_bool_value(
-      config_entries, topic_property_read_replica);
     cfg.properties.read_replica_bucket = get_string_value(
-      config_entries, topic_property_read_replica_bucket);
+      config_entries, topic_property_read_replica);
+    if (cfg.properties.read_replica_bucket.has_value()) {
+        cfg.properties.read_replica = true;
+    }
     /// Final topic_property not decoded here is \ref remote_topic_properties,
     /// is more of an implementation detail no need to ever show user
 
@@ -246,12 +247,8 @@ config_map_t from_cluster_type(const cluster::topic_properties& properties) {
             break;
         }
     }
-    if (properties.read_replica) {
-        config_entries[topic_property_read_replica] = from_config_type(
-          *properties.read_replica);
-    }
     if (properties.read_replica_bucket) {
-        config_entries[topic_property_read_replica_bucket] = from_config_type(
+        config_entries[topic_property_read_replica] = from_config_type(
           *properties.read_replica_bucket);
     }
     /// Final topic_property not encoded here is \ref remote_topic_properties,
diff --git a/src/v/kafka/server/handlers/topics/types.h b/src/v/kafka/server/handlers/topics/types.h
index b2a66303ebe43..0c57d1bd458a7 100644
--- a/src/v/kafka/server/handlers/topics/types.h
+++ b/src/v/kafka/server/handlers/topics/types.h
@@ -53,8 +53,6 @@ static constexpr std::string_view topic_property_remote_read
   = "redpanda.remote.read";
 static constexpr std::string_view topic_property_read_replica
   = "redpanda.remote.readreplica";
-static constexpr std::string_view topic_property_read_replica_bucket
-  = "redpanda.remote.readreplica.bucket";
 
 // Data-policy property
 static constexpr std::string_view topic_property_data_policy_function_name
diff --git a/src/v/kafka/server/handlers/topics/validators.h b/src/v/kafka/server/handlers/topics/validators.h
index cb346517beff0..12a92e67a89bb 100644
--- a/src/v/kafka/server/handlers/topics/validators.h
+++ b/src/v/kafka/server/handlers/topics/validators.h
@@ -163,47 +163,6 @@ struct remote_read_and_write_are_not_supported_for_read_replica {
     }
 };
 
-struct s3_bucket_is_required_for_read_replica {
-    static constexpr error_code ec = error_code::invalid_config;
-    static constexpr const char* error_message
-      = "s3 bucket should be provided for read replica topic";
-
-    static bool is_valid(const creatable_topic& c) {
-        auto config_entries = config_map(c.configs);
-        auto end = config_entries.end();
-        bool is_read_replica
-          = (config_entries.find(topic_property_read_replica) != end);
-        bool s3_bucket_provided
-          = (config_entries.find(topic_property_read_replica_bucket) != end);
-
-        if (is_read_replica && !s3_bucket_provided) {
-            return false;
-        }
-        return true;
-    }
-};
-
-struct s3_bucket_is_supported_only_for_read_replica {
-    static constexpr error_code ec = error_code::invalid_config;
-    static constexpr const char* error_message
-      = "s3 bucket is supported only when redpanda.remote.readreplica is "
-        "enabled";
-
-    static bool is_valid(const creatable_topic& c) {
-        auto config_entries = config_map(c.configs);
-        auto end = config_entries.end();
-        bool is_read_replica
-          = (config_entries.find(topic_property_read_replica) != end);
-        bool s3_bucket_provided
-          = (config_entries.find(topic_property_read_replica_bucket) != end);
-
-        if (!is_read_replica && s3_bucket_provided) {
-            return false;
-        }
-        return true;
-    }
-};
-
 struct compression_type_validator_details {
     using validated_type = model::compression;
 
diff --git a/src/v/kafka/server/tests/create_topics_test.cc b/src/v/kafka/server/tests/create_topics_test.cc
index 8aac8de3fbf7a..41e043e9a03e9 100644
--- a/src/v/kafka/server/tests/create_topics_test.cc
+++ b/src/v/kafka/server/tests/create_topics_test.cc
@@ -352,58 +352,13 @@ FIXTURE_TEST(create_non_replicable_topics, create_topic_fixture) {
     BOOST_CHECK(resp[1].tp_ns.tp() == "topic2");
 }
 
-FIXTURE_TEST(s3bucket_is_missing, create_topic_fixture) {
-    auto topic = make_topic(
-      "topic1",
-      std::nullopt,
-      std::nullopt,
-      std::map<ss::sstring, ss::sstring>{
-        {"redpanda.remote.readreplica", "true"}});
-
-    auto req = make_req({topic});
-
-    auto client = make_kafka_client().get0();
-    client.connect().get();
-    auto resp = client.dispatch(req, kafka::api_version(2)).get0();
-
-    BOOST_CHECK(
-      resp.data.topics[0].error_code == kafka::error_code::invalid_config);
-    BOOST_CHECK(
-      resp.data.topics[0].error_message
-      == "s3 bucket should be provided for read replica topic");
-    BOOST_CHECK(resp.data.topics[0].name == "topic1");
-}
-
-FIXTURE_TEST(s3bucket_but_not_read_replica, create_topic_fixture) {
-    auto topic = make_topic(
-      "topic1",
-      std::nullopt,
-      std::nullopt,
-      std::map<ss::sstring, ss::sstring>{
-        {"redpanda.remote.readreplica.bucket", "panda-bucket"}});
-
-    auto req = make_req({topic});
-
-    auto client = make_kafka_client().get0();
-    client.connect().get();
-    auto resp = client.dispatch(req, kafka::api_version(2)).get0();
-
-    BOOST_CHECK(
-      resp.data.topics[0].error_code == kafka::error_code::invalid_config);
-    BOOST_CHECK(
-      resp.data.topics[0].error_message
-      == "s3 bucket is supported only when redpanda.remote.readreplica is "
-         "enabled");
-    BOOST_CHECK(resp.data.topics[0].name == "topic1");
-}
-
 FIXTURE_TEST(read_replica_and_remote_write, create_topic_fixture) {
     auto topic = make_topic(
       "topic1",
       std::nullopt,
       std::nullopt,
       std::map<ss::sstring, ss::sstring>{
-        {"redpanda.remote.readreplica", "true"},
+        {"redpanda.remote.readreplica", "panda-bucket"},
         {"redpanda.remote.write", "true"}});
 
     auto req = make_req({topic});
diff --git a/tests/rptest/tests/read_replica_e2e_test.py b/tests/rptest/tests/read_replica_e2e_test.py
index 651d8e1c26abb..81afc0e3896f5 100644
--- a/tests/rptest/tests/read_replica_e2e_test.py
+++ b/tests/rptest/tests/read_replica_e2e_test.py
@@ -48,8 +48,7 @@ def create_read_replica_topic(self):
 
         rpk_second_cluster = RpkTool(self.second_cluster)
         conf = {
-            'redpanda.remote.readreplica': 'true',
-            'redpanda.remote.readreplica.bucket': self.s3_bucket_name,
+            'redpanda.remote.readreplica': self.s3_bucket_name,
         }
         rpk_second_cluster.create_topic(self.topic_name, config=conf)
 

From 73203736808f332ac6fef53c5fcb0ed03c599061 Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Tue, 12 Jul 2022 11:16:17 +0200
Subject: [PATCH 071/201] r/configuration: add ability to assign replica
 revisions

Added an API that lets the caller replace the raft group configuration
and assign a revision to each replica. This way, when reverting a
configuration change, `cluster::controller_backend` is able to control
the replica revisions.

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/cluster/partition.h         |  7 +++
 src/v/raft/consensus.cc           | 12 +++++
 src/v/raft/consensus.h            |  5 ++
 src/v/raft/group_configuration.cc | 82 +++++++++++++++++++++++++++++++
 src/v/raft/group_configuration.h  |  6 +++
 5 files changed, 112 insertions(+)

diff --git a/src/v/cluster/partition.h b/src/v/cluster/partition.h
index 060153c9cc0e0..0cca06e4d6d20 100644
--- a/src/v/cluster/partition.h
+++ b/src/v/cluster/partition.h
@@ -159,6 +159,13 @@ class partition {
           std::move(brokers), new_revision_id);
     }
 
+    ss::future<std::error_code> update_replica_set(
+      std::vector<raft::broker_revision> brokers,
+      model::revision_id new_revision_id) {
+        return _raft->replace_configuration(
+          std::move(brokers), new_revision_id);
+    }
+
     raft::group_configuration group_configuration() const {
         return _raft->config();
     }
diff --git a/src/v/raft/consensus.cc b/src/v/raft/consensus.cc
index b30bff1c0977b..f4345afb8aaef 100644
--- a/src/v/raft/consensus.cc
+++ b/src/v/raft/consensus.cc
@@ -994,6 +994,18 @@ ss::future<std::error_code> consensus::replace_configuration(
       });
 }
 
+ss::future<std::error_code> consensus::replace_configuration(
+  std::vector<raft::broker_revision> new_brokers,
+  model::revision_id new_revision) {
+    return change_configuration(
+      [new_brokers = std::move(new_brokers),
+       new_revision](group_configuration current) mutable {
+          current.replace(std::move(new_brokers), new_revision);
+          current.set_revision(new_revision);
+          return result<group_configuration>(std::move(current));
+      });
+}
+
 template<typename Func>
 ss::future<std::error_code>
 consensus::interrupt_configuration_change(model::revision_id revision, Func f) {
diff --git a/src/v/raft/consensus.h b/src/v/raft/consensus.h
index 87f7323d6072c..a852c819bc195 100644
--- a/src/v/raft/consensus.h
+++ b/src/v/raft/consensus.h
@@ -123,6 +123,11 @@ class consensus {
     // Replace configuration of raft group with given set of nodes
     ss::future<std::error_code>
       replace_configuration(std::vector<model::broker>, model::revision_id);
+    /**
+     * Replace configuration, uses revision provided with brokers
+     */
+    ss::future<std::error_code>
+      replace_configuration(std::vector<broker_revision>, model::revision_id);
     // Abort ongoing configuration change - may cause data loss
     ss::future<std::error_code> abort_configuration_change(model::revision_id);
     // Revert current configuration change - this is safe and will never cause
diff --git a/src/v/raft/group_configuration.cc b/src/v/raft/group_configuration.cc
index 0f8790024b19f..76fa0635dad6e 100644
--- a/src/v/raft/group_configuration.cc
+++ b/src/v/raft/group_configuration.cc
@@ -313,6 +313,88 @@ void group_configuration::replace(
     }
 }
 
+void group_configuration::replace(
+  std::vector<broker_revision> brokers, model::revision_id rev) {
+    vassert(!_old, "can not replace joint configuration - {}", *this);
+    _revision = rev;
+
+    /**
+     * If configurations are identical do nothing. For identical configuration
+     * we assume that brokers list hasn't changed (1) and current configuration
+     * contains all brokers in either voters or learners (2).
+     */
+    // (1) the requested list has the same size as _brokers and each requested
+    // broker equals an existing entry; (2) each requested (node id, revision)
+    // vnode is already contained in the current configuration
+    bool brokers_are_equal
+      = brokers.size() == _brokers.size()
+        && std::all_of(
+          brokers.begin(), brokers.end(), [this](const broker_revision& b) {
+              // we may do linear lookup in _brokers collection as number of
+              // brokers is usually very small e.g. 3 or 5
+              auto it = std::find_if(
+                _brokers.begin(),
+                _brokers.end(),
+                [&b](const model::broker& existing) {
+                    return b.broker == existing;
+                });
+
+              return _current.contains(vnode(b.broker.id(), b.rev))
+                     && it != _brokers.end();
+          });
+
+    // configurations are identical; only the revision assigned above changes
+    if (brokers_are_equal) {
+        return;
+    }
+
+    _old = _current;
+    _current.learners.clear();
+    _current.voters.clear();
+
+    for (auto& br : brokers) {
+        // check if broker is already a voter. voter will stay a voter
+        auto v_it = std::find_if(
+          _old->voters.cbegin(), _old->voters.cend(), [&br](const vnode& rni) {
+              return rni.id() == br.broker.id() && rni.revision() == br.rev;
+          });
+
+        if (v_it != _old->voters.cend()) {
+            _current.voters.push_back(*v_it);
+            continue;
+        }
+
+        // check if broker was a learner. learner will stay a learner
+        auto l_it = std::find_if(
+          _old->learners.cbegin(),
+          _old->learners.cend(),
+          [&br](const vnode& rni) {
+              return rni.id() == br.broker.id() && rni.revision() == br.rev;
+          });
+
+        if (l_it != _old->learners.cend()) {
+            _current.learners.push_back(*l_it);
+            continue;
+        }
+
+        // new broker (or known broker with a different revision): add it as a
+        _current.learners.emplace_back(br.broker.id(), br.rev);
+    }
+
+    // if both current and previous configurations are exactly the same, we do
+    // not need to enter joint consensus
+    if (
+      _current.voters == _old->voters && _current.learners == _old->learners) {
+        _old.reset();
+    }
+
+    for (auto& b : brokers) {
+        if (!contains_broker(b.broker.id())) {
+            _brokers.push_back(std::move(b.broker));
+        }
+    }
+}
+
 void group_configuration::promote_to_voter(vnode id) {
     auto it = std::find(
       _current.learners.cbegin(), _current.learners.cend(), id);
diff --git a/src/v/raft/group_configuration.h b/src/v/raft/group_configuration.h
index fda4638260671..f62fdddb21ad2 100644
--- a/src/v/raft/group_configuration.h
+++ b/src/v/raft/group_configuration.h
@@ -23,6 +23,11 @@
 
 namespace raft {
 
+struct broker_revision {
+    model::broker broker;
+    model::revision_id rev;
+};
+
 static constexpr model::revision_id no_revision{};
 class vnode : public serde::envelope<vnode, serde::version<0>> {
 public:
@@ -118,6 +123,7 @@ class group_configuration final {
     void add(std::vector<model::broker>, model::revision_id);
     void remove(const std::vector<model::node_id>&);
     void replace(std::vector<model::broker>, model::revision_id);
+    void replace(std::vector<broker_revision>, model::revision_id);
 
     /**
      * Updating broker configuration. This operation does not require entering

From 1e7d851fa109db9f2affee2faed98eaf37f52e69 Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Thu, 7 Jul 2022 12:54:06 +0200
Subject: [PATCH 072/201] c/controller_backend: fixed canceling already
 finished reconfigurations

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/cluster/controller_backend.cc | 117 ++++++++++++++++++++++------
 1 file changed, 95 insertions(+), 22 deletions(-)

diff --git a/src/v/cluster/controller_backend.cc b/src/v/cluster/controller_backend.cc
index 3d0bce521fde3..6635c21c38ea9 100644
--- a/src/v/cluster/controller_backend.cc
+++ b/src/v/cluster/controller_backend.cc
@@ -90,6 +90,31 @@ std::vector<model::broker> create_brokers_set(
     return brokers;
 }
 
+std::vector<raft::broker_revision> create_brokers_set(
+  const std::vector<model::broker_shard>& replicas,
+  const absl::flat_hash_map<model::node_id, model::revision_id>&
+    replica_revisions,
+  cluster::members_table& members) {
+    std::vector<raft::broker_revision> brokers;
+    brokers.reserve(replicas.size());
+    // pair each replica's broker metadata with the revision recorded for it
+    std::transform(
+      std::cbegin(replicas),
+      std::cend(replicas),
+      std::back_inserter(brokers),
+      [&members, &replica_revisions](const model::broker_shard& bs) {
+          auto br = members.get_broker(bs.node_id);
+          if (!br) {
+              throw std::logic_error(
+                fmt::format("Replica node {} is not available", bs.node_id));
+          }
+          // at() throws instead of dereferencing end() on a missing revision
+          return raft::broker_revision{
+            .broker = *br->get(), .rev = replica_revisions.at(bs.node_id)};
+      });
+    return brokers;
+}
+
 std::optional<ss::shard_id> get_target_shard(
   model::node_id id, const std::vector<model::broker_shard>& replicas) {
     auto it = std::find_if(
@@ -519,6 +544,29 @@ find_interrupting_operation(deltas_t::iterator current_it, deltas_t& deltas) {
           }
       });
 }
+ss::future<std::error_code> revert_configuration_update(
+  const model::ntp& ntp,
+  const std::vector<model::broker_shard>& replicas,
+  model::revision_id rev,
+  ss::lw_shared_ptr<partition> p,
+  members_table& members,
+  topic_table& topics) {
+    auto in_progress_it = topics.in_progress_updates().find(ntp);
+    // no longer in progress
+    if (in_progress_it == topics.in_progress_updates().end()) {
+        co_return errc::success;
+    }
+    auto brokers = create_brokers_set(
+      replicas, in_progress_it->second.replicas_revisions, members);
+    vlog(
+      clusterlog.debug,
+      "reverting already finished reconfiguration of {}, revision: {}. Replica "
+      "set: {} ",
+      ntp,
+      rev,
+      replicas);
+    co_return co_await p->update_replica_set(std::move(brokers), rev);
+}
 } // namespace
 
 ss::future<> controller_backend::reconcile_ntp(deltas_t& deltas) {
@@ -1197,21 +1245,25 @@ ss::future<std::error_code> controller_backend::cancel_replica_set_update(
       replicas,
       rev,
       [this, &ntp, rev, replicas](ss::lw_shared_ptr<partition> p) {
+          const auto current_cfg = p->group_configuration();
+          // we do not have to request update/cancellation twice
+          if (current_cfg.revision_id() == rev) {
+              return ss::make_ready_future<std::error_code>(
+                errc::waiting_for_recovery);
+          }
+
           const auto raft_cfg_update_finished
-            = are_configuration_replicas_up_to_date(
-              p->group_configuration(), replicas);
+            = current_cfg.type() == raft::configuration_type::simple;
 
           // raft already finished its part, we need to move replica back
           if (raft_cfg_update_finished) {
-              auto brokers = create_brokers_set(
-                replicas, _members_table.local());
-              vlog(
-                clusterlog.debug,
-                "raft reconfiguration finished, moving partition {} "
-                "configuration back to requested state: {}",
+              return revert_configuration_update(
                 ntp,
-                replicas);
-              return p->update_replica_set(std::move(brokers), rev);
+                replicas,
+                rev,
+                std::move(p),
+                _members_table.local(),
+                _topics.local());
           } else {
               vlog(
                 clusterlog.debug,
@@ -1233,21 +1285,42 @@ ss::future<std::error_code> controller_backend::force_abort_replica_set_update(
     if (!partition) {
         co_return errc::partition_not_exists;
     }
+    const auto current_cfg = partition->group_configuration();
 
-    const auto raft_cfg_update_finished = are_configuration_replicas_up_to_date(
-      partition->group_configuration(), replicas);
-    if (raft_cfg_update_finished) {
-        co_return co_await update_partition_replica_set(ntp, replicas, rev);
-    } else {
-        // wait for configuration update, only declare success
-        // when configuration was actually updated
-        auto update_ec = check_configuration_update(
-          _self, partition, replicas, rev);
+    // wait for configuration update, only declare success
+    // when configuration was actually updated
+    auto update_ec = check_configuration_update(
+      _self, partition, replicas, rev);
 
-        if (!update_ec) {
-            co_return errc::success;
-        }
+    if (!update_ec) {
+        co_return errc::success;
+    }
+
+    // we do not have to request update/cancellation twice
+    if (current_cfg.revision_id() == rev) {
+        co_return errc::waiting_for_recovery;
+    }
+
+    const auto raft_cfg_update_finished = current_cfg.type()
+                                          == raft::configuration_type::simple;
+
+    if (raft_cfg_update_finished) {
+        co_return co_await apply_configuration_change_on_leader(
+          ntp,
+          replicas,
+          rev,
+          [this, rev, &replicas, &ntp](
+            ss::lw_shared_ptr<cluster::partition> p) {
+              return revert_configuration_update(
+                ntp,
+                replicas,
+                rev,
+                std::move(p),
+                _members_table.local(),
+                _topics.local());
+          });
 
+    } else {
         auto ec = co_await partition->force_abort_replica_set_update(rev);
 
         if (ec) {

From 64ec96bbd6eb58efad5669c7bfb0ae8ae5eab426 Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Thu, 7 Jul 2022 17:45:20 +0200
Subject: [PATCH 073/201] c/topics_update_dispatcher: fixed de-allocation of
 deleted topics

The recently introduced two-stage update of the partition allocator
changed the way we update allocation state. Currently, when a partition
is being moved, allocations are added when processing the update command
and removed when processing the finished command.

When a topic is deleted while it is being updated, we must deallocate
the in-progress replicas that would normally be removed with the update
finished command.

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/cluster/topic_updates_dispatcher.cc | 71 ++++++++++++++++++-----
 src/v/cluster/topic_updates_dispatcher.h  | 10 +++-
 2 files changed, 64 insertions(+), 17 deletions(-)

diff --git a/src/v/cluster/topic_updates_dispatcher.cc b/src/v/cluster/topic_updates_dispatcher.cc
index 93f752f969283..2ee7fa3754bbf 100644
--- a/src/v/cluster/topic_updates_dispatcher.cc
+++ b/src/v/cluster/topic_updates_dispatcher.cc
@@ -12,9 +12,13 @@
 #include "cluster/cluster_utils.h"
 #include "cluster/commands.h"
 #include "cluster/partition_leaders_table.h"
+#include "cluster/topic_table.h"
+#include "model/fundamental.h"
 #include "model/metadata.h"
 #include "raft/types.h"
 
+#include <absl/container/node_hash_map.h>
+
 #include <iterator>
 #include <system_error>
 #include <vector>
@@ -37,19 +41,29 @@ topic_updates_dispatcher::apply_update(model::record_batch b) {
           return ss::visit(
             std::move(cmd),
             [this, base_offset](delete_topic_cmd del_cmd) {
-                // delete case - we need state copy to
-                auto tp_md = _topic_table.local().get_topic_metadata(
-                  del_cmd.value);
+                auto tp_ns = del_cmd.key;
+                auto topic_assignments
+                  = _topic_table.local().get_topic_assignments(del_cmd.value);
+                in_progress_map in_progress;
+
+                if (topic_assignments) {
+                    in_progress = collect_in_progress(
+                      del_cmd.key, *topic_assignments);
+                }
                 return dispatch_updates_to_cores(del_cmd, base_offset)
-                  .then([this, tp_md = std::move(tp_md)](std::error_code ec) {
-                      if (ec == errc::success) {
-                          vassert(
-                            tp_md.has_value(),
-                            "Topic had to exist before successful delete");
-                          deallocate_topic(*tp_md);
-                      }
-                      return ec;
-                  });
+                  .then(
+                    [this,
+                     topic_assignments = std::move(topic_assignments),
+                     in_progress = std::move(in_progress)](std::error_code ec) {
+                        if (ec == errc::success) {
+                            vassert(
+                              topic_assignments.has_value(),
+                              "Topic had to exist before successful delete");
+                            deallocate_topic(*topic_assignments, in_progress);
+                        }
+
+                        return ec;
+                    });
             },
             [this, base_offset](create_topic_cmd create_cmd) {
                 return dispatch_updates_to_cores(create_cmd, base_offset)
@@ -187,6 +201,22 @@ topic_updates_dispatcher::apply_update(model::record_batch b) {
             });
       });
 }
+topic_updates_dispatcher::in_progress_map
+topic_updates_dispatcher::collect_in_progress(
+  const model::topic_namespace& tp_ns,
+  const assignments_set& current_assignments) {
+    in_progress_map in_progress;
+    in_progress.reserve(current_assignments.size());
+    // collect in progress assignments
+    for (auto& p : current_assignments) {
+        auto previous = _topic_table.local().get_previous_replica_set(
+          model::ntp(tp_ns.ns, tp_ns.tp, p.id));
+        if (previous) {
+            in_progress.emplace(p.id, std::move(previous.value()));
+        }
+    }
+    return in_progress;
+}
 
 ss::future<> topic_updates_dispatcher::update_leaders_with_estimates(
   std::vector<ntp_leader> leaders) {
@@ -250,10 +280,19 @@ topic_updates_dispatcher::dispatch_updates_to_cores(Cmd cmd, model::offset o) {
       });
 }
 
-void topic_updates_dispatcher::deallocate_topic(const topic_metadata& tp_md) {
-    // we have to deallocate topics
-    for (auto& p : tp_md.get_assignments()) {
-        _partition_allocator.local().deallocate(p.replicas);
+void topic_updates_dispatcher::deallocate_topic(
+  const assignments_set& topic_assignments,
+  const in_progress_map& in_progress) {
+    for (auto& p_as : topic_assignments) {
+        _partition_allocator.local().deallocate(p_as.replicas);
+        auto it = in_progress.find(p_as.id);
+
+        // we must remove the allocation that would normally
+        // be removed with update_finished request
+        if (it != in_progress.end()) {
+            auto to_delete = subtract_replica_sets(it->second, p_as.replicas);
+            _partition_allocator.local().remove_allocations(to_delete);
+        }
     }
 }
 
diff --git a/src/v/cluster/topic_updates_dispatcher.h b/src/v/cluster/topic_updates_dispatcher.h
index e383e7c59bd40..0c5f3a7cc44fa 100644
--- a/src/v/cluster/topic_updates_dispatcher.h
+++ b/src/v/cluster/topic_updates_dispatcher.h
@@ -13,6 +13,8 @@
 #include "cluster/commands.h"
 #include "cluster/scheduling/partition_allocator.h"
 #include "cluster/topic_table.h"
+#include "cluster/types.h"
+#include "model/fundamental.h"
 #include "model/record.h"
 
 #include <seastar/core/sharded.hh>
@@ -71,6 +73,8 @@ class topic_updates_dispatcher {
     }
 
 private:
+    using in_progress_map = absl::
+      node_hash_map<model::partition_id, std::vector<model::broker_shard>>;
     template<typename Cmd>
     ss::future<std::error_code> dispatch_updates_to_cores(Cmd, model::offset);
 
@@ -78,7 +82,11 @@ class topic_updates_dispatcher {
 
     ss::future<> update_leaders_with_estimates(std::vector<ntp_leader> leaders);
     void update_allocations(std::vector<partition_assignment>);
-    void deallocate_topic(const topic_metadata&);
+
+    void deallocate_topic(const assignments_set&, const in_progress_map&);
+
+    in_progress_map
+    collect_in_progress(const model::topic_namespace&, const assignments_set&);
 
     ss::sharded<partition_allocator>& _partition_allocator;
     ss::sharded<topic_table>& _topic_table;

From 32c58e5a3449d3aadda96e91d6df5782eda9875b Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Thu, 7 Jul 2022 14:17:31 +0200
Subject: [PATCH 074/201] c/members_backend: recalculate reallocations when
 decommissioning node

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/cluster/members_backend.cc | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/v/cluster/members_backend.cc b/src/v/cluster/members_backend.cc
index 881316eb4bb45..7de4277fdf136 100644
--- a/src/v/cluster/members_backend.cc
+++ b/src/v/cluster/members_backend.cc
@@ -524,7 +524,6 @@ ss::future<> members_backend::reconcile() {
 
         const auto allocator_empty = _allocator.local().is_empty(
           meta.update.id);
-
         if (
           is_draining && all_reallocations_finished && allocator_empty
           && !updates_in_progress) {
@@ -552,6 +551,15 @@ ss::future<> members_backend::reconcile() {
               all_reallocations_finished,
               allocator_empty,
               updates_in_progress);
+            if (!allocator_empty && all_reallocations_finished) {
+                // recalculate reallocations
+                vlog(
+                  clusterlog.info,
+                  "[update: {}] decommissioning in progress. recalculating "
+                  "reallocations",
+                  meta.update);
+                calculate_reallocations(meta);
+            }
         }
     }
 }

From 5005c7260ba392f43ca32b54cc9ce2a621b99311 Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Tue, 12 Jul 2022 11:17:59 +0200
Subject: [PATCH 075/201] c/members_backend: recalculate reassignment when
 configuration change finished

When `members_backend` requests partition operation cancellation it may
already be finished. In this case we skip the operation and allow
backend to recalculate required replicas reassignments.

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/cluster/members_backend.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/v/cluster/members_backend.cc b/src/v/cluster/members_backend.cc
index 7de4277fdf136..f69505b91d605 100644
--- a/src/v/cluster/members_backend.cc
+++ b/src/v/cluster/members_backend.cc
@@ -723,6 +723,11 @@ ss::future<> members_backend::reallocate_replica_set(
               meta.current_replica_set,
               meta.new_replica_set,
               error.message());
+            if (error == errc::no_update_in_progress) {
+                // mark reallocation as finished, reallocations will be
+                // recalculated if required
+                meta.state = reallocation_state::finished;
+            }
             co_return;
         }
         // success, update state and move on

From f895b5c2585be67753241d0a785b4cd16771dfeb Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Mon, 11 Jul 2022 16:52:42 +0200
Subject: [PATCH 076/201] r/consensus: step down when after cancellation node
 is no longer a voter

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/raft/consensus.cc | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/v/raft/consensus.cc b/src/v/raft/consensus.cc
index f4345afb8aaef..4532311880323 100644
--- a/src/v/raft/consensus.cc
+++ b/src/v/raft/consensus.cc
@@ -45,6 +45,7 @@
 
 #include <algorithm>
 #include <iterator>
+#include <system_error>
 
 template<>
 struct fmt::formatter<raft::consensus::vote_state> final
@@ -1032,12 +1033,23 @@ ss::future<std::error_code>
 consensus::cancel_configuration_change(model::revision_id revision) {
     vlog(
       _ctxlog.info,
-      "requested revert of current configuration change - {}",
+      "requested cancellation of current configuration change - {}",
       config());
     return interrupt_configuration_change(
-      revision, [revision](raft::group_configuration cfg) {
-          cfg.cancel_configuration_change(revision);
-          return cfg;
+             revision,
+             [revision](raft::group_configuration cfg) {
+                 cfg.cancel_configuration_change(revision);
+                 return cfg;
+             })
+      .then([this](std::error_code ec) -> ss::future<std::error_code> {
+          if (!ec) {
+              // current leader is not a voter, step down
+              if (!config().is_voter(_self)) {
+                  auto u = co_await _op_lock.get_units();
+                  do_step_down("current leader is not voter");
+              }
+          }
+          co_return ec;
       });
 }
 

From deab76cdc69dd816b4461c06c45b0e5b89a7ad3f Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Wed, 29 Jun 2022 12:21:58 +0200
Subject: [PATCH 077/201] tests/admin: added api to recommission node

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 tests/rptest/services/admin.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/rptest/services/admin.py b/tests/rptest/services/admin.py
index 2c620bcb05a47..23ca749d8b205 100644
--- a/tests/rptest/services/admin.py
+++ b/tests/rptest/services/admin.py
@@ -439,6 +439,14 @@ def decommission_broker(self, id, node=None):
         self.redpanda.logger.debug(f"decommissioning {path}")
         return self._request('put', path, node=node)
 
+    def recommission_broker(self, id, node=None):
+        """
+        Recommission broker i.e. abort ongoing decommissioning
+        """
+        path = f"brokers/{id}/recommission"
+        self.redpanda.logger.debug(f"recommissioning {id}")
+        return self._request('put', path, node=node)
+
     def list_reconfigurations(self, node=None):
         """
         List pending reconfigurations

From e75b59fbbe5906c943e436630bfa8210c51d3ea0 Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Thu, 30 Jun 2022 13:04:26 +0200
Subject: [PATCH 078/201] tests: added nodes recommissioning tests

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 .../tests/nodes_decommissioning_test.py       | 188 ++++++++++++++++--
 1 file changed, 175 insertions(+), 13 deletions(-)

diff --git a/tests/rptest/tests/nodes_decommissioning_test.py b/tests/rptest/tests/nodes_decommissioning_test.py
index 2256f2cbf70c6..5a5a46f58eb18 100644
--- a/tests/rptest/tests/nodes_decommissioning_test.py
+++ b/tests/rptest/tests/nodes_decommissioning_test.py
@@ -44,7 +44,7 @@ def _partitions_moving(self):
     def _partitions_not_moving(self):
         admin = Admin(self.redpanda)
         reconfigurations = admin.list_reconfigurations()
-        return len(reconfigurations) > 0
+        return len(reconfigurations) == 0
 
     def _partition_to_move(self, predicate):
         rpk = RpkTool(self.redpanda)
@@ -69,6 +69,38 @@ def _node_removed(self, removed_id, node_to_query):
                 return False
         return True
 
+    def _find_replacement(self, current_replicas, to_remove):
+        new_replicas = []
+        unique_node_ids = set()
+        for r in current_replicas:
+            if r['node_id'] != to_remove:
+                unique_node_ids.add(r['node_id'])
+                new_replicas.append(r)
+
+        admin = Admin(self.redpanda)
+        brokers = admin.get_brokers()
+
+        to_add = None
+        while len(unique_node_ids) < len(current_replicas):
+            id = random.choice(brokers)['node_id']
+            if id == to_remove:
+                continue
+            to_add = id
+            unique_node_ids.add(to_add)
+
+        new_replicas.append({"node_id": to_add, "core": 0})
+        return new_replicas
+
+    def _wait_until_status(self, node_id, status, timeout_sec=15):
+        def requested_status():
+            brokers = Admin(self.redpanda).get_brokers()
+            for broker in brokers:
+                if broker['node_id'] == node_id:
+                    return broker['membership_status'] == status
+            return False
+
+        wait_until(requested_status, timeout_sec=timeout_sec, backoff_sec=1)
+
     @cluster(
         num_nodes=6,
         # A decom can look like a restart in terms of logs from peers dropping
@@ -174,18 +206,6 @@ def test_decommissioning_cancel_ongoing_movements(self):
         self.logger.info(f"decommissioning node: {to_decommission}", )
         admin.decommission_broker(to_decommission)
 
-        def check_status(node_id, status):
-            brokers = admin.get_brokers()
-            for broker in brokers:
-                if broker['node_id'] == node_id:
-                    return broker['membership_status'] == status
-
-            return False
-
-        wait_until(lambda: check_status(to_decommission, 'draining'),
-                   timeout_sec=15,
-                   backoff_sec=1)
-
         survivor_node = self._not_decommissioned_node(to_decommission)
         # adjust recovery throttle to make sure moves will finish
         rpk.cluster_config_set("raft_learner_recovery_rate", str(2 << 30))
@@ -198,3 +218,145 @@ def check_status(node_id, status):
         self.redpanda.stop_node(self.redpanda.get_node(to_decommission))
 
         self.run_validation(enable_idempotence=False, consumer_timeout_sec=90)
+
+    @cluster(num_nodes=6, log_allow_list=RESTART_LOG_ALLOW_LIST)
+    def test_recommissioning_node(self):
+        self.start_redpanda(num_nodes=4)
+        self._create_topics()
+
+        self.start_producer(1)
+        self.start_consumer(1)
+        self.await_startup()
+        admin = Admin(self.redpanda)
+
+        brokers = admin.get_brokers()
+        to_decommission = random.choice(brokers)['node_id']
+
+        # throttle recovery
+        rpk = RpkTool(self.redpanda)
+        rpk.cluster_config_set("raft_learner_recovery_rate", str(1))
+
+        self.logger.info(f"decommissioning node: {to_decommission}", )
+        admin.decommission_broker(to_decommission)
+
+        self._wait_until_status(to_decommission, 'draining')
+
+        wait_until(lambda: self._partitions_moving(),
+                   timeout_sec=15,
+                   backoff_sec=1)
+
+        # recommission broker
+        admin.recommission_broker(to_decommission)
+        self._wait_until_status(to_decommission, 'active')
+
+        wait_until(lambda: self._partitions_not_moving(),
+                   timeout_sec=15,
+                   backoff_sec=1)
+
+    @cluster(num_nodes=6, log_allow_list=RESTART_LOG_ALLOW_LIST)
+    def test_recommissioning_do_not_stop_all_moves_node(self):
+        self.start_redpanda(num_nodes=4)
+        self._create_topics()
+
+        self.start_producer(1)
+        self.start_consumer(1)
+        self.await_startup()
+        admin = Admin(self.redpanda)
+
+        brokers = admin.get_brokers()
+        to_decommission = random.choice(brokers)['node_id']
+
+        # throttle recovery
+        rpk = RpkTool(self.redpanda)
+        rpk.cluster_config_set("raft_learner_recovery_rate", str(1))
+
+        # schedule partition move from the node being decommissioned before actually calling decommission
+
+        partitions = admin.get_partitions(
+            node=self.redpanda.get_node(to_decommission))
+
+        partition_to_move = random.choice(partitions)
+        to_move_tp = partition_to_move['topic']
+        to_move_p = partition_to_move['partition_id']
+        details = admin.get_partitions(topic=to_move_tp, partition=to_move_p)
+
+        new_replicas = self._find_replacement(details['replicas'],
+                                              to_decommission)
+        self.logger.info(
+            f"moving partition {to_move_tp}/{to_move_p} - {details['replicas']} -> {new_replicas}"
+        )
+
+        admin.set_partition_replicas(topic=to_move_tp,
+                                     partition=to_move_p,
+                                     replicas=new_replicas)
+        # moving partition should be present in moving list
+        wait_until(lambda: self._partitions_moving(),
+                   timeout_sec=15,
+                   backoff_sec=1)
+
+        self.logger.info(f"decommissioning node: {to_decommission}", )
+        admin.decommission_broker(to_decommission)
+
+        self._wait_until_status(to_decommission, 'draining')
+
+        wait_until(lambda: self._partitions_moving(),
+                   timeout_sec=15,
+                   backoff_sec=1)
+
+        # recommission broker
+        admin.recommission_broker(to_decommission)
+        self._wait_until_status(to_decommission, 'active')
+
+        def one_left_moving():
+            reconfigurations = admin.list_reconfigurations()
+            return len(reconfigurations) == 1
+
+        wait_until(one_left_moving, timeout_sec=15, backoff_sec=1)
+
+    @cluster(num_nodes=7, log_allow_list=RESTART_LOG_ALLOW_LIST)
+    def test_recommissioning_one_of_decommissioned_nodes(self):
+        self.start_redpanda(num_nodes=5)
+        self._create_topics()
+
+        self.start_producer(1)
+        self.start_consumer(1)
+        self.await_startup()
+        admin = Admin(self.redpanda)
+
+        brokers = admin.get_brokers()
+        to_decommission_1 = random.choice(brokers)['node_id']
+        to_decommission_2 = to_decommission_1
+
+        while to_decommission_1 == to_decommission_2:
+            to_decommission_2 = random.choice(brokers)['node_id']
+
+        # throttle recovery
+        rpk = RpkTool(self.redpanda)
+        rpk.cluster_config_set("raft_learner_recovery_rate", str(1))
+
+        self.logger.info(f"decommissioning node: {to_decommission_1}", )
+        admin.decommission_broker(to_decommission_1)
+        self.logger.info(f"decommissioning node: {to_decommission_2}", )
+        admin.decommission_broker(to_decommission_2)
+
+        self._wait_until_status(to_decommission_1, 'draining')
+        self._wait_until_status(to_decommission_2, 'draining')
+
+        wait_until(lambda: self._partitions_moving(),
+                   timeout_sec=15,
+                   backoff_sec=1)
+
+        # recommission broker that was decommissioned first
+        admin.recommission_broker(to_decommission_1)
+        self._wait_until_status(to_decommission_1, 'active')
+
+        rpk.cluster_config_set("raft_learner_recovery_rate", str(2 << 30))
+
+        def node_removed():
+            brokers = admin.get_brokers()
+            for broker in brokers:
+                if broker['node_id'] == to_decommission_2:
+                    return False
+            return True
+
+        wait_until(node_removed, 60, 2)

From 610c5ae3356291eaaf9521de99c69316fbc2ed2d Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Tue, 12 Jul 2022 13:01:49 +0200
Subject: [PATCH 079/201] tests: prevent node decommissioning test from using
 controller partition

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 tests/rptest/tests/nodes_decommissioning_test.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/rptest/tests/nodes_decommissioning_test.py b/tests/rptest/tests/nodes_decommissioning_test.py
index 5a5a46f58eb18..d38d0d451100d 100644
--- a/tests/rptest/tests/nodes_decommissioning_test.py
+++ b/tests/rptest/tests/nodes_decommissioning_test.py
@@ -272,12 +272,8 @@ def test_recommissioning_do_not_stop_all_moves_node(self):
 
         # schedule partition move from the node being decommissioned before actually calling decommission
 
-        partitions = admin.get_partitions(
-            node=self.redpanda.get_node(to_decommission))
-
-        partition_to_move = random.choice(partitions)
-        to_move_tp = partition_to_move['topic']
-        to_move_p = partition_to_move['partition_id']
+        to_move_tp, to_move_p, _ = self._partition_to_move(
+            lambda p: to_decommission in p.replicas)
         details = admin.get_partitions(topic=to_move_tp, partition=to_move_p)
 
         new_replicas = self._find_replacement(details['replicas'],

From c95cbcc3a507c3cfff2469dfdfeb39b4f3ee7275 Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Wed, 13 Jul 2022 10:14:13 +0200
Subject: [PATCH 080/201] c/controller_backend: added comment describing
 controller backend idea

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/cluster/controller_backend.h | 186 ++++++++++++++++++++++++++++-
 1 file changed, 184 insertions(+), 2 deletions(-)

diff --git a/src/v/cluster/controller_backend.h b/src/v/cluster/controller_backend.h
index cbdb850eac00a..2febe8dd27d5f 100644
--- a/src/v/cluster/controller_backend.h
+++ b/src/v/cluster/controller_backend.h
@@ -30,8 +30,190 @@
 
 namespace cluster {
 
-/// on every core, sharded
-
+/**
+ *
+ * # Reconciliation
+ *
+ * Controller backend is responsible for making sure that the cluster state is
+ * aligned with the topic and partition state gathered in topic_table.
+ *
+ * Controller backend lives on each core on every node in the cluster. Each
+ * instance of controller backend is responsible for dealing with core & node
+ * local partition replicas (instances of `cluster::partition` object that are
+ * supposed to be instantiated on given core and given node). Controller backend
+ * manages partition replica lifecycle. It instantiates/deletes
+ * `cluster::partition` instances and registers them in shard table.
+ *
+ * Controller backend operations are driven by deltas generated in topics table.
+ * Backend waits for the new deltas using condition variable. Each delta
+ * represents an operation that must be executed for ntp e.g. create, update
+ * properties, move, etc.
+ *
+ * Each controller backend in the cluster (on each node and each core) processes
+ * all the deltas and based on the situation it either executes an operation or
+ * ignores it (command pattern).
+ *
+ * Deltas vector for each NTP is processed in separate fiber in other words
+ * deltas for different NTPs are executed concurrently but for the same NTP
+ * sequentially.
+ *
+ * Each delta has a revision assigned. The revision is based on the raft0
+ * log offset of the command that the delta is related to. The same delta has
+ * the same revision globally.
+ *
+ * Deltas are executed in order from oldest revision up to the newest.
+ *
+ *
+ * NTP_1
+ *                                              Loop until finished or cancelled
+ *
+ *                                                    ┌──────────────────┐
+ *                                                    │                  │
+ *                                                    │                  │
+ * ┌────────────┐ ┌────────────┐ ┌────────────┐       │  ┌────────────┐  │
+ * │   delta    │ │   delta    │ │   delta    │       │  │   delta    │  │
+ * │            │ │            │ │            ├──►    └─►│            ├──┘
+ * │ revision: 3│ │ revision: 2│ │ revision: 1│          │ revision: 0│
+ * └────────────┘ └────────────┘ └────────────┘          └────────────┘
+ *
+ *                            .
+ *                            .
+ *                            .
+ * NTP_N
+ *                                              Loop until finished or cancelled
+ *
+ *                                                    ┌──────────────────┐
+ *                                                    │                  │
+ *                                                    │                  │
+ * ┌────────────┐ ┌────────────┐ ┌────────────┐       │  ┌────────────┐  │
+ * │   delta    │ │   delta    │ │   delta    │       │  │   delta    │  │
+ * │            │ │            │ │            ├──►    └─►│            ├──┘
+ * │ revision: 3│ │ revision: 2│ │ revision: 1│          │ revision: 0│
+ * └────────────┘ └────────────┘ └────────────┘          └────────────┘
+ *
+ * # Revisions
+ *
+ * As the reconciliation loops are not coordinated we must be able to recognize
+ * epochs. Consider a situation in which a stream of deltas executed by the
+ * backend leads to the state which is identical from end user perspective f.e.
+ * topic with the same name and configuration was deleted and then created back
+ * again. We must be able to recognize if the instance of partition replica that
+ * has been created for the topic belongs to the original topic or the one that
+ * was re created. In order to introduce differentiation between the two not
+ * distinguishable states we use revision_id as an epoch. Revision is used
+ * whenever partition is created or its replicas are moved. This way controller
+ * backend is able to recognize if partition replicas have already been updated
+ * or if action is required.
+ *
+ * ## Revisions and raft vnode
+ *
+ * Whenever a new replica is added to raft configuration it has new revision
+ * assigned. In raft each raft group participant is described by a tuple of
+ * model::node_id and model::revision_id. This way every time the node is re
+ * added to the configuration (consider a situation in which partition with
+ * single replica is moved back and forth between two nodes f.e. 1 -> 2 -> 1
+ * -> 2...) it is recognized as a new node. This fencing mechanism prevents the
+ * up to date raft group replicas from communicating with one from previous
+ * epoch.
+ *
+ * # Partition movement
+ *
+ * Partition movement in Redpanda is based on the Raft protocol mechanism called
+ * Joint Consensus. When requested Raft implementation is able to move data
+ * between nodes in a safe and consistent way. However requesting Raft to
+ * reconfigure a raft group is not enough to complete a partition move. When
+ * partition move is requested based on the current situation some of the
+ * controller backend may have to create new partition replica instances while
+ * others have to delete the ones that are no longer part of the raft group.
+ * Additionally there may be a need to move partition instance between cores on
+ * the same node.
+ *
+ * Every time partition move is requested each reconciliation loop executes an
+ * operation based on current and requested state and poll for its completion.
+ *
+ * Partition movement finish is coordinated using a designated finish command.
+ *
+ * Partition movement finish command is replicated from one of the replicas that
+ * was changed during reconfiguration process.
+ *
+ * IMPORTANT:
+ * Partition replicas are only deleted when executing delta for operation
+ * finished command. This way when partition replica is deleted it is guaranteed
+ * to no longer be needed.
+ *
+ * Example:
+ *
+ * Consider moving partition between a set of nodes:
+ *
+ *      replicas on nodes (1,2,3) -> replicas on nodes (2,3,4)
+ *
+ * (for simplicity we ignore core assignment in this example)
+ *
+ * Assumptions:
+ *  - node 1 is a leader for the partition.
+ *
+ * Operations that have to be executed on every node:
+ *
+ * Node 1:
+ * - node 1 is a leader, leader is the only one that can replicate data so it
+ * will be asked for reconfiguration
+ * - after partition replica is no longer needed on this node it may be removed
+ *
+ * Node 2 & 3:
+ * - node 2 will wait until configuration will be up to date with requested. In
+ * case leadership from node 1 moved it will ask for reconfiguration
+ *
+ * Node 4:
+ * - node 4 will create a new instance of partition replica and wait for the
+ * configuration to be up to date.
+ * - after successful reconfiguration node 4 will dispatch finish update command
+ *
+ *
+ * When finish update command will be received by node 1 it will remove the
+ * partition replica instance.
+ *
+ *
+ * ## Interrupting partition movement
+ *
+ * Partition movement interruption may only be accepted after topic table
+ * processed move command but before the finish update command was processed. We
+ * use topics table as a single source of truth to decide if the update may
+ * still be canceled or if it has finished. This way we must be able to revert
+ * configuration change even if raft already finished reconfiguration.
+ *
+ * Partition move interruption does not mark the reconfiguration process as
+ * finished i.e. it will still be represented as in progress when queried from
+ * topic table. Move interruption will only finish when reconfiguration is
+ * finished in raft and finish move command is issued by the controller backend
+ *
+ * In general the interrupt may happen in the following situations:
+ *
+ * 1) before raft reconfiguration was requested
+ * 2) when raft reconfiguration is in progress
+ * 3) when raft reconfiguration has already finished but before finish command
+ * was replicated
+ *
+ * In all of the situations we must move back to the raft group configuration
+ * which was active before the move was scheduled. The set of actions that must
+ * be taken to finish the interruption is different based on the situation in
+ * which interruption happened.
+ *
+ * For 1) controller backend must simply update raft configuration revision to
+ * be able to decide if action related with given revision_id has been executed.
+ *
+ * For 2) controller backend will request reconfiguration cancellation on a
+ * leader and will wait until raft configuration is up to date with what was
+ * observed before the move. Any replicas that were created for the purpose of
+ * move will be removed when processing finished move command.
+ *
+ * For 3) controller backend must request reconfiguration with the same exact
+ * replica set as before the move was requested. It is important to notice that
+ * no partition replicas were yet removed as finish command wasn't yet
+ * processed. Since cancelling partition move does not create new partition
+ * replica instances (instances of `cluster::partition`) but reuse the existing
+ * one we must reuse revision id of currently existing replica instances.
+ *
+ */
 class controller_backend
   : public ss::peering_sharded_service<controller_backend> {
 public:

From ddc2cf5f6e7bddcdc33f175469fff944e20ddaa7 Mon Sep 17 00:00:00 2001
From: Rogger Vasquez <rvasque3@gmail.com>
Date: Mon, 11 Jul 2022 16:27:04 -0500
Subject: [PATCH 081/201] rpk: standardize command and flag output

Now all commands and flags descriptions start
with a capital letter and shouldn't end with a
punctuation mark.
---
 src/go/rpk/pkg/cli/cmd/acl.go                 |  4 +--
 src/go/rpk/pkg/cli/cmd/acl/create.go          | 22 +++++++--------
 src/go/rpk/pkg/cli/cmd/acl/delete.go          | 28 +++++++++----------
 src/go/rpk/pkg/cli/cmd/acl/list.go            | 24 ++++++++--------
 src/go/rpk/pkg/cli/cmd/acl/user.go            | 18 ++++++------
 src/go/rpk/pkg/cli/cmd/cluster.go             |  2 +-
 .../rpk/pkg/cli/cmd/cluster/config/config.go  |  4 +--
 src/go/rpk/pkg/cli/cmd/cluster/config/edit.go |  2 +-
 .../rpk/pkg/cli/cmd/cluster/config/export.go  |  2 +-
 .../rpk/pkg/cli/cmd/cluster/config/import.go  |  2 +-
 src/go/rpk/pkg/cli/cmd/cluster/config/lint.go |  2 +-
 .../rpk/pkg/cli/cmd/cluster/config/reset.go   |  2 +-
 src/go/rpk/pkg/cli/cmd/cluster/health.go      |  6 ++--
 .../cli/cmd/cluster/maintenance/disable.go    |  2 +-
 .../pkg/cli/cmd/cluster/maintenance/enable.go |  2 +-
 .../cmd/cluster/maintenance/maintenance.go    |  2 +-
 .../pkg/cli/cmd/cluster/maintenance/status.go |  2 +-
 src/go/rpk/pkg/cli/cmd/cluster/metadata.go    | 12 ++++----
 src/go/rpk/pkg/cli/cmd/common/common.go       | 24 ++++++++--------
 src/go/rpk/pkg/cli/cmd/container.go           |  2 +-
 src/go/rpk/pkg/cli/cmd/container/purge.go     |  2 +-
 src/go/rpk/pkg/cli/cmd/container/start.go     |  2 +-
 src/go/rpk/pkg/cli/cmd/container/stop.go      |  2 +-
 src/go/rpk/pkg/cli/cmd/debug/bundle.go        |  6 ++--
 src/go/rpk/pkg/cli/cmd/debug/debug.go         |  2 +-
 src/go/rpk/pkg/cli/cmd/debug/info.go          |  6 ++--
 src/go/rpk/pkg/cli/cmd/generate.go            |  2 +-
 .../rpk/pkg/cli/cmd/generate/autocomplete.go  |  2 +-
 src/go/rpk/pkg/cli/cmd/generate/grafana.go    |  2 +-
 src/go/rpk/pkg/cli/cmd/generate/prometheus.go |  2 +-
 src/go/rpk/pkg/cli/cmd/group/describe.go      |  2 +-
 src/go/rpk/pkg/cli/cmd/group/group.go         |  6 ++--
 src/go/rpk/pkg/cli/cmd/group/seek.go          |  2 +-
 src/go/rpk/pkg/cli/cmd/iotune.go              |  4 +--
 src/go/rpk/pkg/cli/cmd/plugin/plugin.go       |  8 +++---
 .../rpk/pkg/cli/cmd/redpanda/admin/admin.go   |  4 +--
 .../cli/cmd/redpanda/admin/brokers/brokers.go |  8 +++---
 .../cli/cmd/redpanda/admin/config/config.go   |  8 +++---
 .../redpanda/admin/partitions/partitions.go   |  4 +--
 src/go/rpk/pkg/cli/cmd/redpanda/check.go      |  2 +-
 src/go/rpk/pkg/cli/cmd/redpanda/config.go     |  2 +-
 src/go/rpk/pkg/cli/cmd/redpanda/mode.go       |  2 +-
 src/go/rpk/pkg/cli/cmd/redpanda/start.go      |  2 +-
 src/go/rpk/pkg/cli/cmd/redpanda/stop.go       |  2 +-
 src/go/rpk/pkg/cli/cmd/redpanda/tune/help.go  |  2 +-
 src/go/rpk/pkg/cli/cmd/root.go                | 19 +++++++++++--
 src/go/rpk/pkg/cli/cmd/topic.go               |  2 +-
 .../rpk/pkg/cli/cmd/topic/add_partitions.go   |  4 +--
 src/go/rpk/pkg/cli/cmd/topic/config.go        |  4 +--
 src/go/rpk/pkg/cli/cmd/topic/consume.go       |  6 ++--
 src/go/rpk/pkg/cli/cmd/topic/create.go        |  8 +++---
 src/go/rpk/pkg/cli/cmd/topic/delete.go        |  4 +--
 src/go/rpk/pkg/cli/cmd/topic/describe.go      | 10 +++----
 src/go/rpk/pkg/cli/cmd/topic/list.go          |  8 +++---
 src/go/rpk/pkg/cli/cmd/topic/produce.go       | 16 +++++------
 src/go/rpk/pkg/cli/cmd/version.go             |  2 +-
 src/go/rpk/pkg/cli/cmd/wasm.go                |  2 +-
 src/go/rpk/pkg/cli/cmd/wasm/deploy.go         |  6 ++--
 src/go/rpk/pkg/cli/cmd/wasm/generate.go       |  4 +--
 src/go/rpk/pkg/cli/cmd/wasm/remove.go         |  2 +-
 60 files changed, 181 insertions(+), 166 deletions(-)

diff --git a/src/go/rpk/pkg/cli/cmd/acl.go b/src/go/rpk/pkg/cli/cmd/acl.go
index aec3aa0148265..702ed32d3658b 100644
--- a/src/go/rpk/pkg/cli/cmd/acl.go
+++ b/src/go/rpk/pkg/cli/cmd/acl.go
@@ -38,7 +38,7 @@ func NewACLCommand(fs afero.Fs) *cobra.Command {
 	)
 	command := &cobra.Command{
 		Use:   "acl",
-		Short: "Manage ACLs and SASL users.",
+		Short: "Manage ACLs and SASL users",
 		Long:  helpACLs,
 		Args:  cobra.ExactArgs(0),
 		Run: func(cmd *cobra.Command, _ []string) {
@@ -50,7 +50,7 @@ func NewACLCommand(fs afero.Fs) *cobra.Command {
 		},
 	}
 
-	command.Flags().BoolVar(&helpOperations, "help-operations", false, "Print more help about ACL operations.")
+	command.Flags().BoolVar(&helpOperations, "help-operations", false, "Print more help about ACL operations")
 
 	common.AddKafkaFlags(
 		command,
diff --git a/src/go/rpk/pkg/cli/cmd/acl/create.go b/src/go/rpk/pkg/cli/cmd/acl/create.go
index 5cc31654635d3..5ca682b998a4a 100644
--- a/src/go/rpk/pkg/cli/cmd/acl/create.go
+++ b/src/go/rpk/pkg/cli/cmd/acl/create.go
@@ -25,7 +25,7 @@ func NewCreateCommand(fs afero.Fs) *cobra.Command {
 	var a acls
 	cmd := &cobra.Command{
 		Use:   "create",
-		Short: "Create ACLs.",
+		Short: "Create ACLs",
 		Long: `Create ACLs.
 
 See the 'rpk acl' help text for a full write up on ACLs. Following the
@@ -88,17 +88,17 @@ Allow write permissions to user buzz to transactional id "txn":
 func (a *acls) addCreateFlags(cmd *cobra.Command) {
 	a.addDeprecatedFlags(cmd)
 
-	cmd.Flags().StringSliceVar(&a.topics, topicFlag, nil, "topic to grant ACLs for (repeatable)")
-	cmd.Flags().StringSliceVar(&a.groups, groupFlag, nil, "group to grant ACLs for (repeatable)")
-	cmd.Flags().BoolVar(&a.cluster, clusterFlag, false, "whether to grant ACLs to the cluster")
-	cmd.Flags().StringSliceVar(&a.txnIDs, txnIDFlag, nil, "transactional IDs to grant ACLs for (repeatable)")
+	cmd.Flags().StringSliceVar(&a.topics, topicFlag, nil, "Topic to grant ACLs for (repeatable)")
+	cmd.Flags().StringSliceVar(&a.groups, groupFlag, nil, "Group to grant ACLs for (repeatable)")
+	cmd.Flags().BoolVar(&a.cluster, clusterFlag, false, "Whether to grant ACLs to the cluster")
+	cmd.Flags().StringSliceVar(&a.txnIDs, txnIDFlag, nil, "Transactional IDs to grant ACLs for (repeatable)")
 
-	cmd.Flags().StringVar(&a.resourcePatternType, patternFlag, "literal", "pattern to use when matching resource names (literal or prefixed)")
+	cmd.Flags().StringVar(&a.resourcePatternType, patternFlag, "literal", "Pattern to use when matching resource names (literal or prefixed)")
 
-	cmd.Flags().StringSliceVar(&a.operations, operationFlag, nil, "operation to grant (repeatable)")
+	cmd.Flags().StringSliceVar(&a.operations, operationFlag, nil, "Operation to grant (repeatable)")
 
-	cmd.Flags().StringSliceVar(&a.allowPrincipals, allowPrincipalFlag, nil, "principals for which these permissions will be granted (repeatable)")
-	cmd.Flags().StringSliceVar(&a.allowHosts, allowHostFlag, nil, "hosts from which access will be granted (repeatable)")
-	cmd.Flags().StringSliceVar(&a.denyPrincipals, denyPrincipalFlag, nil, "principal for which these permissions will be denied (repeatable)")
-	cmd.Flags().StringSliceVar(&a.denyHosts, denyHostFlag, nil, "hosts from from access will be denied (repeatable)")
+	cmd.Flags().StringSliceVar(&a.allowPrincipals, allowPrincipalFlag, nil, "Principals for which these permissions will be granted (repeatable)")
+	cmd.Flags().StringSliceVar(&a.allowHosts, allowHostFlag, nil, "Hosts from which access will be granted (repeatable)")
+	cmd.Flags().StringSliceVar(&a.denyPrincipals, denyPrincipalFlag, nil, "Principal for which these permissions will be denied (repeatable)")
+	cmd.Flags().StringSliceVar(&a.denyHosts, denyHostFlag, nil, "Hosts from from access will be denied (repeatable)")
 }
diff --git a/src/go/rpk/pkg/cli/cmd/acl/delete.go b/src/go/rpk/pkg/cli/cmd/acl/delete.go
index 30f27f989b855..93576b57a6dca 100644
--- a/src/go/rpk/pkg/cli/cmd/acl/delete.go
+++ b/src/go/rpk/pkg/cli/cmd/acl/delete.go
@@ -31,7 +31,7 @@ func NewDeleteCommand(fs afero.Fs) *cobra.Command {
 	)
 	cmd := &cobra.Command{
 		Use:   "delete",
-		Short: "Delete ACLs.",
+		Short: "Delete ACLs",
 		Long: `Delete ACLs.
 
 See the 'rpk acl' help text for a full write up on ACLs. Delete flags work in a
@@ -94,28 +94,28 @@ resource names:
 		},
 	}
 	a.addDeleteFlags(cmd)
-	cmd.Flags().BoolVarP(&printAllFilters, "print-filters", "f", false, "print the filters that were requested (failed filters are always printed)")
-	cmd.Flags().BoolVarP(&dry, "dry", "d", false, "dry run: validate what would be deleted")
-	cmd.Flags().BoolVar(&noConfirm, "no-confirm", false, "disable confirmation prompt")
+	cmd.Flags().BoolVarP(&printAllFilters, "print-filters", "f", false, "Print the filters that were requested (failed filters are always printed)")
+	cmd.Flags().BoolVarP(&dry, "dry", "d", false, "Dry run: validate what would be deleted")
+	cmd.Flags().BoolVar(&noConfirm, "no-confirm", false, "Disable confirmation prompt")
 	return cmd
 }
 
 func (a *acls) addDeleteFlags(cmd *cobra.Command) {
 	a.addDeprecatedFlags(cmd)
 
-	cmd.Flags().StringSliceVar(&a.topics, topicFlag, nil, "topic to remove ACLs for (repeatable)")
-	cmd.Flags().StringSliceVar(&a.groups, groupFlag, nil, "group to remove ACLs for (repeatable)")
-	cmd.Flags().BoolVar(&a.cluster, clusterFlag, false, "whether to remove ACLs to the cluster")
-	cmd.Flags().StringSliceVar(&a.txnIDs, txnIDFlag, nil, "transactional IDs to remove ACLs for (repeatable)")
+	cmd.Flags().StringSliceVar(&a.topics, topicFlag, nil, "Topic to remove ACLs for (repeatable)")
+	cmd.Flags().StringSliceVar(&a.groups, groupFlag, nil, "Group to remove ACLs for (repeatable)")
+	cmd.Flags().BoolVar(&a.cluster, clusterFlag, false, "Whether to remove ACLs to the cluster")
+	cmd.Flags().StringSliceVar(&a.txnIDs, txnIDFlag, nil, "Transactional IDs to remove ACLs for (repeatable)")
 
-	cmd.Flags().StringVar(&a.resourcePatternType, patternFlag, "any", "pattern to use when matching resource names (any, match, literal, or prefixed)")
+	cmd.Flags().StringVar(&a.resourcePatternType, patternFlag, "any", "Pattern to use when matching resource names (any, match, literal, or prefixed)")
 
-	cmd.Flags().StringSliceVar(&a.operations, operationFlag, nil, "operation to remove (repeatable)")
+	cmd.Flags().StringSliceVar(&a.operations, operationFlag, nil, "Operation to remove (repeatable)")
 
-	cmd.Flags().StringSliceVar(&a.allowPrincipals, allowPrincipalFlag, nil, "allowed principal ACLs to remove (repeatable)")
-	cmd.Flags().StringSliceVar(&a.allowHosts, allowHostFlag, nil, "allowed host ACLs to remove (repeatable)")
-	cmd.Flags().StringSliceVar(&a.denyPrincipals, denyPrincipalFlag, nil, "denied principal ACLs to remove (repeatable)")
-	cmd.Flags().StringSliceVar(&a.denyHosts, denyHostFlag, nil, "denied host ACLs to remove (repeatable)")
+	cmd.Flags().StringSliceVar(&a.allowPrincipals, allowPrincipalFlag, nil, "Allowed principal ACLs to remove (repeatable)")
+	cmd.Flags().StringSliceVar(&a.allowHosts, allowHostFlag, nil, "Allowed host ACLs to remove (repeatable)")
+	cmd.Flags().StringSliceVar(&a.denyPrincipals, denyPrincipalFlag, nil, "Denied principal ACLs to remove (repeatable)")
+	cmd.Flags().StringSliceVar(&a.denyHosts, denyHostFlag, nil, "Denied host ACLs to remove (repeatable)")
 }
 
 func deleteReqResp(
diff --git a/src/go/rpk/pkg/cli/cmd/acl/list.go b/src/go/rpk/pkg/cli/cmd/acl/list.go
index 26461412cace1..6ecdb61df3163 100644
--- a/src/go/rpk/pkg/cli/cmd/acl/list.go
+++ b/src/go/rpk/pkg/cli/cmd/acl/list.go
@@ -28,7 +28,7 @@ func NewListCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:     "list",
 		Aliases: []string{"ls", "describe"},
-		Short:   "List ACLs.",
+		Short:   "List ACLs",
 		Long: `List ACLs.
 
 See the 'rpk acl' help text for a full write up on ACLs. List flags work in a
@@ -64,7 +64,7 @@ resource names:
 		},
 	}
 	a.addListFlags(cmd)
-	cmd.Flags().BoolVarP(&printAllFilters, "print-filters", "f", false, "print the filters that were requested (failed filters are always printed)")
+	cmd.Flags().BoolVarP(&printAllFilters, "print-filters", "f", false, "Print the filters that were requested (failed filters are always printed)")
 	return cmd
 }
 
@@ -79,19 +79,19 @@ func (a *acls) addListFlags(cmd *cobra.Command) {
 	cmd.Flags().MarkDeprecated("principal", "use --{allow,deny}-{host,principal}")
 	cmd.Flags().MarkDeprecated("host", "use --{allow,deny}-{host,principal}")
 
-	cmd.Flags().StringSliceVar(&a.topics, topicFlag, nil, "topic to match ACLs for (repeatable)")
-	cmd.Flags().StringSliceVar(&a.groups, groupFlag, nil, "group to match ACLs for (repeatable)")
-	cmd.Flags().BoolVar(&a.cluster, clusterFlag, false, "whether to match ACLs to the cluster")
-	cmd.Flags().StringSliceVar(&a.txnIDs, txnIDFlag, nil, "transactional IDs to match ACLs for (repeatable)")
+	cmd.Flags().StringSliceVar(&a.topics, topicFlag, nil, "Topic to match ACLs for (repeatable)")
+	cmd.Flags().StringSliceVar(&a.groups, groupFlag, nil, "Group to match ACLs for (repeatable)")
+	cmd.Flags().BoolVar(&a.cluster, clusterFlag, false, "Whether to match ACLs to the cluster")
+	cmd.Flags().StringSliceVar(&a.txnIDs, txnIDFlag, nil, "Transactional IDs to match ACLs for (repeatable)")
 
-	cmd.Flags().StringVar(&a.resourcePatternType, patternFlag, "any", "pattern to use when matching resource names (any, match, literal, or prefixed)")
+	cmd.Flags().StringVar(&a.resourcePatternType, patternFlag, "any", "Pattern to use when matching resource names (any, match, literal, or prefixed)")
 
-	cmd.Flags().StringSliceVar(&a.operations, operationFlag, nil, "operation to match (repeatable)")
+	cmd.Flags().StringSliceVar(&a.operations, operationFlag, nil, "Operation to match (repeatable)")
 
-	cmd.Flags().StringSliceVar(&a.allowPrincipals, allowPrincipalFlag, nil, "allowed principal ACLs to match (repeatable)")
-	cmd.Flags().StringSliceVar(&a.allowHosts, allowHostFlag, nil, "allowed host ACLs to match (repeatable)")
-	cmd.Flags().StringSliceVar(&a.denyPrincipals, denyPrincipalFlag, nil, "denied principal ACLs to match (repeatable)")
-	cmd.Flags().StringSliceVar(&a.denyHosts, denyHostFlag, nil, "denied host ACLs to match (repeatable)")
+	cmd.Flags().StringSliceVar(&a.allowPrincipals, allowPrincipalFlag, nil, "Allowed principal ACLs to match (repeatable)")
+	cmd.Flags().StringSliceVar(&a.allowHosts, allowHostFlag, nil, "Allowed host ACLs to match (repeatable)")
+	cmd.Flags().StringSliceVar(&a.denyPrincipals, denyPrincipalFlag, nil, "Denied principal ACLs to match (repeatable)")
+	cmd.Flags().StringSliceVar(&a.denyHosts, denyHostFlag, nil, "Denied host ACLs to match (repeatable)")
 }
 
 func describeReqResp(
diff --git a/src/go/rpk/pkg/cli/cmd/acl/user.go b/src/go/rpk/pkg/cli/cmd/acl/user.go
index dd26aad511a21..847808b951a66 100644
--- a/src/go/rpk/pkg/cli/cmd/acl/user.go
+++ b/src/go/rpk/pkg/cli/cmd/acl/user.go
@@ -24,7 +24,7 @@ func NewUserCommand(fs afero.Fs) *cobra.Command {
 	var apiUrls []string
 	cmd := &cobra.Command{
 		Use:   "user",
-		Short: "Manage SASL users.",
+		Short: "Manage SASL users",
 		Long: `Manage SASL users.
 
 If SASL is enabled, a SASL user is what you use to talk to Redpanda, and ACLs
@@ -39,7 +39,7 @@ redpanda section of your redpanda.yaml.
 		config.FlagAdminHosts2,
 		[]string{},
 		"The comma-separated list of Admin API addresses (<IP>:<port>)."+
-			" You must specify one for each node.",
+			" You must specify one for each node",
 	)
 
 	cmd.AddCommand(NewCreateUserCommand(fs))
@@ -59,7 +59,7 @@ func NewCreateUserCommand(fs afero.Fs) *cobra.Command {
 	var userOld, pass, passOld, mechanism string
 	cmd := &cobra.Command{
 		Use:   "create [USER] -p [PASS]",
-		Short: "Create a SASL user.",
+		Short: "Create a SASL user",
 		Long: `Create a SASL user.
 
 This command creates a single SASL user with the given password, optionally
@@ -120,11 +120,11 @@ acl help text for more info.
 	}
 
 	cmd.Flags().StringVar(&userOld, "new-username", "", "")
-	cmd.Flags().MarkDeprecated("new-username", "the username now does not require a flag") // Oct 2021
+	cmd.Flags().MarkDeprecated("new-username", "The username now does not require a flag") // Oct 2021
 
-	cmd.Flags().StringVarP(&pass, "password", "p", "", "new user's password")
+	cmd.Flags().StringVarP(&pass, "password", "p", "", "New user's password")
 	cmd.Flags().StringVar(&passOld, "new-password", "", "")
-	cmd.Flags().MarkDeprecated("new-password", "renamed to --password") // Oct 2021
+	cmd.Flags().MarkDeprecated("new-password", "Renamed to --password") // Oct 2021
 
 	cmd.Flags().StringVar(
 		&mechanism,
@@ -140,7 +140,7 @@ func NewDeleteUserCommand(fs afero.Fs) *cobra.Command {
 	var oldUser string
 	cmd := &cobra.Command{
 		Use:   "delete [USER]",
-		Short: "Delete a SASL user.",
+		Short: "Delete a SASL user",
 		Long: `Delete a SASL user.
 
 This command deletes the specified SASL account from Redpanda. This does not
@@ -174,7 +174,7 @@ delete any ACLs that may exist for this user.
 	}
 
 	cmd.Flags().StringVar(&oldUser, "delete-username", "", "The user to be deleted")
-	cmd.Flags().MarkDeprecated("delete-username", "the username now does not require a flag")
+	cmd.Flags().MarkDeprecated("delete-username", "The username now does not require a flag")
 
 	return cmd
 }
@@ -183,7 +183,7 @@ func NewListUsersCommand(fs afero.Fs) *cobra.Command {
 	return &cobra.Command{
 		Use:     "list",
 		Aliases: []string{"ls"},
-		Short:   "List SASL users.",
+		Short:   "List SASL users",
 		Run: func(cmd *cobra.Command, _ []string) {
 			p := config.ParamsFromCommand(cmd)
 			cfg, err := p.Load(fs)
diff --git a/src/go/rpk/pkg/cli/cmd/cluster.go b/src/go/rpk/pkg/cli/cmd/cluster.go
index c20e6dbdc51c1..a3be6165a55b8 100644
--- a/src/go/rpk/pkg/cli/cmd/cluster.go
+++ b/src/go/rpk/pkg/cli/cmd/cluster.go
@@ -33,7 +33,7 @@ func NewClusterCommand(fs afero.Fs) *cobra.Command {
 	)
 	command := &cobra.Command{
 		Use:   "cluster",
-		Short: "Interact with a Redpanda cluster.",
+		Short: "Interact with a Redpanda cluster",
 	}
 	// backcompat: until we switch to -X, we need these flags.
 	common.AddKafkaFlags(
diff --git a/src/go/rpk/pkg/cli/cmd/cluster/config/config.go b/src/go/rpk/pkg/cli/cmd/cluster/config/config.go
index 938932c17223a..517d15e9f23a5 100644
--- a/src/go/rpk/pkg/cli/cmd/cluster/config/config.go
+++ b/src/go/rpk/pkg/cli/cmd/cluster/config/config.go
@@ -29,7 +29,7 @@ func NewConfigCommand(fs afero.Fs) *cobra.Command {
 	command := &cobra.Command{
 		Use:   "config",
 		Args:  cobra.ExactArgs(0),
-		Short: "Interact with cluster configuration properties.",
+		Short: "Interact with cluster configuration properties",
 		Long: `Interact with cluster configuration properties.
 
 Cluster properties are redpanda settings which apply to all nodes in
@@ -68,7 +68,7 @@ different redpanda version that does not recognize certain properties.`,
 		&all,
 		"all",
 		false,
-		"Include all properties, including tunables.",
+		"Include all properties, including tunables",
 	)
 
 	command.AddCommand(
diff --git a/src/go/rpk/pkg/cli/cmd/cluster/config/edit.go b/src/go/rpk/pkg/cli/cmd/cluster/config/edit.go
index 3a99001f122a2..f5c55004a0b76 100644
--- a/src/go/rpk/pkg/cli/cmd/cluster/config/edit.go
+++ b/src/go/rpk/pkg/cli/cmd/cluster/config/edit.go
@@ -26,7 +26,7 @@ import (
 func newEditCommand(fs afero.Fs, all *bool) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:   "edit",
-		Short: "Edit cluster configuration properties.",
+		Short: "Edit cluster configuration properties",
 		Long: `Edit cluster-wide configuration properties.
 
 This command opens a text editor to modify the cluster's configuration.
diff --git a/src/go/rpk/pkg/cli/cmd/cluster/config/export.go b/src/go/rpk/pkg/cli/cmd/cluster/config/export.go
index 6d11b999517f4..8653c05d7259e 100644
--- a/src/go/rpk/pkg/cli/cmd/cluster/config/export.go
+++ b/src/go/rpk/pkg/cli/cmd/cluster/config/export.go
@@ -135,7 +135,7 @@ func newExportCommand(fs afero.Fs, all *bool) *cobra.Command {
 
 	cmd := &cobra.Command{
 		Use:   "export",
-		Short: "Export cluster configuration.",
+		Short: "Export cluster configuration",
 		Long: `Export cluster configuration.
 
 Writes out a YAML representation of the cluster configuration to a file,
diff --git a/src/go/rpk/pkg/cli/cmd/cluster/config/import.go b/src/go/rpk/pkg/cli/cmd/cluster/config/import.go
index 78fcc62ed266a..eff288565068c 100644
--- a/src/go/rpk/pkg/cli/cmd/cluster/config/import.go
+++ b/src/go/rpk/pkg/cli/cmd/cluster/config/import.go
@@ -216,7 +216,7 @@ func newImportCommand(fs afero.Fs, all *bool) *cobra.Command {
 	var filename string
 	cmd := &cobra.Command{
 		Use:   "import",
-		Short: "Import cluster configuration from a file.",
+		Short: "Import cluster configuration from a file",
 		Long: `Import cluster configuration from a file.
 
 Import configuration from a YAML file, usually generated with
diff --git a/src/go/rpk/pkg/cli/cmd/cluster/config/lint.go b/src/go/rpk/pkg/cli/cmd/cluster/config/lint.go
index 6f1d59ebd1442..c3b9e6bb5d862 100644
--- a/src/go/rpk/pkg/cli/cmd/cluster/config/lint.go
+++ b/src/go/rpk/pkg/cli/cmd/cluster/config/lint.go
@@ -24,7 +24,7 @@ import (
 func newLintCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:   "lint",
-		Short: "Remove any deprecated content from redpanda.yaml.",
+		Short: "Remove any deprecated content from redpanda.yaml",
 		Long: `Remove any deprecated content from redpanda.yaml.
 
 Deprecated content includes properties which were set via redpanda.yaml
diff --git a/src/go/rpk/pkg/cli/cmd/cluster/config/reset.go b/src/go/rpk/pkg/cli/cmd/cluster/config/reset.go
index 53d5b2e833196..18cd1e5054a99 100644
--- a/src/go/rpk/pkg/cli/cmd/cluster/config/reset.go
+++ b/src/go/rpk/pkg/cli/cmd/cluster/config/reset.go
@@ -23,7 +23,7 @@ func newForceResetCommand(fs afero.Fs) *cobra.Command {
 	var configCacheFile string
 	cmd := &cobra.Command{
 		Use:   "force-reset [PROPERTY...]",
-		Short: "Forcibly clear a cluster configuration property on this node.",
+		Short: "Forcibly clear a cluster configuration property on this node",
 		Long: `Forcibly clear a cluster configuration property on this node.
 
 This command is not for general changes to cluster configuration: use this only
diff --git a/src/go/rpk/pkg/cli/cmd/cluster/health.go b/src/go/rpk/pkg/cli/cmd/cluster/health.go
index 03a5d0a13766e..2092151ed88b0 100644
--- a/src/go/rpk/pkg/cli/cmd/cluster/health.go
+++ b/src/go/rpk/pkg/cli/cmd/cluster/health.go
@@ -35,7 +35,7 @@ func NewHealthOverviewCommand(fs afero.Fs) *cobra.Command {
 	)
 	cmd := &cobra.Command{
 		Use:   "health",
-		Short: "Queries cluster for health overview.",
+		Short: "Queries cluster for health overview",
 		Long: `Queries health overview.
 
 Health overview is created based on the health reports collected periodically
@@ -84,8 +84,8 @@ following conditions are met:
 		&adminCAFile,
 	)
 
-	cmd.Flags().BoolVarP(&watch, "watch", "w", false, "blocks and writes out all cluster health changes")
-	cmd.Flags().BoolVarP(&exit, "exit-when-healthy", "e", false, "when used with watch, exits after cluster is back in healthy state")
+	cmd.Flags().BoolVarP(&watch, "watch", "w", false, "Blocks and writes out all cluster health changes")
+	cmd.Flags().BoolVarP(&exit, "exit-when-healthy", "e", false, "When used with watch, exits after cluster is back in healthy state")
 	return cmd
 }
 
diff --git a/src/go/rpk/pkg/cli/cmd/cluster/maintenance/disable.go b/src/go/rpk/pkg/cli/cmd/cluster/maintenance/disable.go
index 390d102e844b6..5f1ea43a4ff35 100644
--- a/src/go/rpk/pkg/cli/cmd/cluster/maintenance/disable.go
+++ b/src/go/rpk/pkg/cli/cmd/cluster/maintenance/disable.go
@@ -24,7 +24,7 @@ import (
 func newDisableCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:   "disable <broker-id>",
-		Short: "Disable maintenance mode for a node.",
+		Short: "Disable maintenance mode for a node",
 		Long:  `Disable maintenance mode for a node.`,
 		Args:  cobra.ExactArgs(1),
 		Run: func(cmd *cobra.Command, args []string) {
diff --git a/src/go/rpk/pkg/cli/cmd/cluster/maintenance/enable.go b/src/go/rpk/pkg/cli/cmd/cluster/maintenance/enable.go
index df59c5dc94a50..f94fcaa71e4b9 100644
--- a/src/go/rpk/pkg/cli/cmd/cluster/maintenance/enable.go
+++ b/src/go/rpk/pkg/cli/cmd/cluster/maintenance/enable.go
@@ -26,7 +26,7 @@ func newEnableCommand(fs afero.Fs) *cobra.Command {
 	var wait bool
 	cmd := &cobra.Command{
 		Use:   "enable <node-id>",
-		Short: "Enable maintenance mode for a node.",
+		Short: "Enable maintenance mode for a node",
 		Long: `Enable maintenance mode for a node.
 
 This command enables maintenance mode for the node with the specified ID. If a
diff --git a/src/go/rpk/pkg/cli/cmd/cluster/maintenance/maintenance.go b/src/go/rpk/pkg/cli/cmd/cluster/maintenance/maintenance.go
index 9f883e3da2d29..cd265d223d150 100644
--- a/src/go/rpk/pkg/cli/cmd/cluster/maintenance/maintenance.go
+++ b/src/go/rpk/pkg/cli/cmd/cluster/maintenance/maintenance.go
@@ -27,7 +27,7 @@ func NewMaintenanceCommand(fs afero.Fs) *cobra.Command {
 
 	cmd := &cobra.Command{
 		Use:   "maintenance",
-		Short: "Toggle a node's maintenance mode.",
+		Short: "Toggle a node's maintenance mode",
 		Long: `Interact with cluster maintenance mode.
 
 Maintenance mode is a state that a node may be placed into in which the node
diff --git a/src/go/rpk/pkg/cli/cmd/cluster/maintenance/status.go b/src/go/rpk/pkg/cli/cmd/cluster/maintenance/status.go
index 1135802517a48..5b562e3a0258b 100644
--- a/src/go/rpk/pkg/cli/cmd/cluster/maintenance/status.go
+++ b/src/go/rpk/pkg/cli/cmd/cluster/maintenance/status.go
@@ -40,7 +40,7 @@ func addBrokerMaintenanceReport(table *out.TabWriter, b admin.Broker) {
 func newStatusCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:   "status",
-		Short: "Report maintenance status.",
+		Short: "Report maintenance status",
 		Long: `Report maintenance status.
 
 This command reports maintenance status for each node in the cluster. The output
diff --git a/src/go/rpk/pkg/cli/cmd/cluster/metadata.go b/src/go/rpk/pkg/cli/cmd/cluster/metadata.go
index 2b8486150c8b9..a77384b2b3a51 100644
--- a/src/go/rpk/pkg/cli/cmd/cluster/metadata.go
+++ b/src/go/rpk/pkg/cli/cmd/cluster/metadata.go
@@ -36,7 +36,7 @@ func NewMetadataCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:     "metadata",
 		Aliases: []string{"status", "info"},
-		Short:   "Request broker metadata.",
+		Short:   "Request broker metadata",
 		Long: `Request broker metadata.
 
 The Kafka protocol's metadata contains information about brokers, topics, and
@@ -120,11 +120,11 @@ In the broker section, the controller node is suffixed with *.
 		},
 	}
 
-	cmd.Flags().BoolVarP(&cluster, "print-cluster", "c", false, "print cluster section")
-	cmd.Flags().BoolVarP(&brokers, "print-brokers", "b", false, "print brokers section")
-	cmd.Flags().BoolVarP(&topics, "print-topics", "t", false, "print topics section (implied if any topics are specified)")
-	cmd.Flags().BoolVarP(&internal, "print-internal-topics", "i", false, "print internal topics (if all topics requested, implies -t)")
-	cmd.Flags().BoolVarP(&detailed, "print-detailed-topics", "d", false, "print per-partition information for topics (implies -t)")
+	cmd.Flags().BoolVarP(&cluster, "print-cluster", "c", false, "Print cluster section")
+	cmd.Flags().BoolVarP(&brokers, "print-brokers", "b", false, "Print brokers section")
+	cmd.Flags().BoolVarP(&topics, "print-topics", "t", false, "Print topics section (implied if any topics are specified)")
+	cmd.Flags().BoolVarP(&internal, "print-internal-topics", "i", false, "Print internal topics (if all topics requested, implies -t)")
+	cmd.Flags().BoolVarP(&detailed, "print-detailed-topics", "d", false, "Print per-partition information for topics (implies -t)")
 	return cmd
 }
 
diff --git a/src/go/rpk/pkg/cli/cmd/common/common.go b/src/go/rpk/pkg/cli/cmd/common/common.go
index 1d60cf7c78c88..a44701a1e1cee 100644
--- a/src/go/rpk/pkg/cli/cmd/common/common.go
+++ b/src/go/rpk/pkg/cli/cmd/common/common.go
@@ -39,7 +39,7 @@ func AddKafkaFlags(
 		"Comma-separated list of broker ip:port pairs (e.g."+
 			" --brokers '192.168.78.34:9092,192.168.78.35:9092,192.179.23.54:9092' )."+
 			" Alternatively, you may set the REDPANDA_BROKERS environment"+
-			" variable with the comma-separated list of broker addresses.",
+			" variable with the comma-separated list of broker addresses",
 	)
 	command.PersistentFlags().StringVar(
 		configFile,
@@ -52,19 +52,19 @@ func AddKafkaFlags(
 		user,
 		"user",
 		"",
-		"SASL user to be used for authentication.",
+		"SASL user to be used for authentication",
 	)
 	command.PersistentFlags().StringVar(
 		password,
 		"password",
 		"",
-		"SASL password to be used for authentication.",
+		"SASL password to be used for authentication",
 	)
 	command.PersistentFlags().StringVar(
 		saslMechanism,
 		config.FlagSASLMechanism,
 		"",
-		"The authentication mechanism to use. Supported values: SCRAM-SHA-256, SCRAM-SHA-512.",
+		"The authentication mechanism to use. Supported values: SCRAM-SHA-256, SCRAM-SHA-512",
 	)
 
 	AddTLSFlags(command, enableTLS, certFile, keyFile, truststoreFile)
@@ -81,25 +81,25 @@ func AddTLSFlags(
 		enableTLS,
 		config.FlagEnableTLS,
 		false,
-		"Enable TLS for the Kafka API (not necessary if specifying custom certs).",
+		"Enable TLS for the Kafka API (not necessary if specifying custom certs)",
 	)
 	command.PersistentFlags().StringVar(
 		certFile,
 		config.FlagTLSCert,
 		"",
-		"The certificate to be used for TLS authentication with the broker.",
+		"The certificate to be used for TLS authentication with the broker",
 	)
 	command.PersistentFlags().StringVar(
 		keyFile,
 		config.FlagTLSKey,
 		"",
-		"The certificate key to be used for TLS authentication with the broker.",
+		"The certificate key to be used for TLS authentication with the broker",
 	)
 	command.PersistentFlags().StringVar(
 		truststoreFile,
 		config.FlagTLSCA,
 		"",
-		"The truststore to be used for TLS communication with the broker.",
+		"The truststore to be used for TLS communication with the broker",
 	)
 
 	return command
@@ -114,25 +114,25 @@ func AddAdminAPITLSFlags(
 		enableTLS,
 		config.FlagEnableAdminTLS,
 		false,
-		"Enable TLS for the Admin API (not necessary if specifying custom certs).",
+		"Enable TLS for the Admin API (not necessary if specifying custom certs)",
 	)
 	command.PersistentFlags().StringVar(
 		certFile,
 		config.FlagAdminTLSCert,
 		"",
-		"The certificate to be used for TLS authentication with the Admin API.",
+		"The certificate to be used for TLS authentication with the Admin API",
 	)
 	command.PersistentFlags().StringVar(
 		keyFile,
 		config.FlagAdminTLSKey,
 		"",
-		"The certificate key to be used for TLS authentication with the Admin API.",
+		"The certificate key to be used for TLS authentication with the Admin API",
 	)
 	command.PersistentFlags().StringVar(
 		truststoreFile,
 		config.FlagAdminTLSCA,
 		"",
-		"The truststore to be used for TLS communication with the Admin API.",
+		"The truststore to be used for TLS communication with the Admin API",
 	)
 
 	return command
diff --git a/src/go/rpk/pkg/cli/cmd/container.go b/src/go/rpk/pkg/cli/cmd/container.go
index 6229d38d0ad0d..2301d8a59338f 100644
--- a/src/go/rpk/pkg/cli/cmd/container.go
+++ b/src/go/rpk/pkg/cli/cmd/container.go
@@ -17,7 +17,7 @@ import (
 func NewContainerCommand() *cobra.Command {
 	command := &cobra.Command{
 		Use:   "container",
-		Short: "Manage a local container cluster.",
+		Short: "Manage a local container cluster",
 	}
 
 	command.AddCommand(container.Start())
diff --git a/src/go/rpk/pkg/cli/cmd/container/purge.go b/src/go/rpk/pkg/cli/cmd/container/purge.go
index 8447e8f7574d2..dfe4252a5633d 100644
--- a/src/go/rpk/pkg/cli/cmd/container/purge.go
+++ b/src/go/rpk/pkg/cli/cmd/container/purge.go
@@ -22,7 +22,7 @@ import (
 func Purge() *cobra.Command {
 	command := &cobra.Command{
 		Use:   "purge",
-		Short: "Stop and remove an existing local container cluster's data.",
+		Short: "Stop and remove an existing local container cluster's data",
 		RunE: func(_ *cobra.Command, _ []string) error {
 			c, err := common.NewDockerClient()
 			if err != nil {
diff --git a/src/go/rpk/pkg/cli/cmd/container/start.go b/src/go/rpk/pkg/cli/cmd/container/start.go
index 5fab06e45473f..f69ec1a5d9f58 100644
--- a/src/go/rpk/pkg/cli/cmd/container/start.go
+++ b/src/go/rpk/pkg/cli/cmd/container/start.go
@@ -58,7 +58,7 @@ func Start() *cobra.Command {
 	)
 	command := &cobra.Command{
 		Use:   "start",
-		Short: "Start a local container cluster.",
+		Short: "Start a local container cluster",
 		FParseErrWhitelist: cobra.FParseErrWhitelist{
 			// Allow unknown flags so that arbitrary flags can be passed
 			// through to the containers without the need to pass '--'
diff --git a/src/go/rpk/pkg/cli/cmd/container/stop.go b/src/go/rpk/pkg/cli/cmd/container/stop.go
index 260f386a2fffb..4f250b4ec6791 100644
--- a/src/go/rpk/pkg/cli/cmd/container/stop.go
+++ b/src/go/rpk/pkg/cli/cmd/container/stop.go
@@ -22,7 +22,7 @@ import (
 func Stop() *cobra.Command {
 	command := &cobra.Command{
 		Use:   "stop",
-		Short: "Stop an existing local container cluster.",
+		Short: "Stop an existing local container cluster",
 		RunE: func(_ *cobra.Command, _ []string) error {
 			c, err := common.NewDockerClient()
 			if err != nil {
diff --git a/src/go/rpk/pkg/cli/cmd/debug/bundle.go b/src/go/rpk/pkg/cli/cmd/debug/bundle.go
index 40ea7f333f690..3515dc95d1dc2 100644
--- a/src/go/rpk/pkg/cli/cmd/debug/bundle.go
+++ b/src/go/rpk/pkg/cli/cmd/debug/bundle.go
@@ -53,7 +53,7 @@ func newBundleCommand(fs afero.Fs) *cobra.Command {
 	)
 	command := &cobra.Command{
 		Use:   "bundle",
-		Short: "Collect environment data and create a bundle file for the Redpanda Data support team to inspect.",
+		Short: "Collect environment data and create a bundle file for the Redpanda Data support team to inspect",
 		Long:  bundleHelpText,
 		Run: func(cmd *cobra.Command, args []string) {
 			p := config.ParamsFromCommand(cmd)
@@ -78,7 +78,7 @@ func newBundleCommand(fs afero.Fs) *cobra.Command {
 		&adminURL,
 		"admin-url",
 		"",
-		"The address to the broker's admin API. Defaults to the one in the config file.",
+		"The address to the broker's admin API. Defaults to the one in the config file",
 	)
 	command.Flags().DurationVar(
 		&timeout,
@@ -102,7 +102,7 @@ func newBundleCommand(fs afero.Fs) *cobra.Command {
 		&logsSizeLimit,
 		"logs-size-limit",
 		"100MiB",
-		"Read the logs until the given size is reached. Multipliers are also supported, e.g. 3MB, 1GiB.",
+		"Read the logs until the given size is reached. Multipliers are also supported, e.g. 3MB, 1GiB",
 	)
 
 	common.AddKafkaFlags(
diff --git a/src/go/rpk/pkg/cli/cmd/debug/debug.go b/src/go/rpk/pkg/cli/cmd/debug/debug.go
index 067f2c8a672a8..b8e965f53f556 100644
--- a/src/go/rpk/pkg/cli/cmd/debug/debug.go
+++ b/src/go/rpk/pkg/cli/cmd/debug/debug.go
@@ -17,7 +17,7 @@ import (
 func NewCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:   "debug",
-		Short: "Debug the local Redpanda process.",
+		Short: "Debug the local Redpanda process",
 	}
 
 	cmd.AddCommand(
diff --git a/src/go/rpk/pkg/cli/cmd/debug/info.go b/src/go/rpk/pkg/cli/cmd/debug/info.go
index f6ac657dd19f7..96c8fa080f541 100644
--- a/src/go/rpk/pkg/cli/cmd/debug/info.go
+++ b/src/go/rpk/pkg/cli/cmd/debug/info.go
@@ -34,7 +34,7 @@ func NewInfoCommand(fs afero.Fs) *cobra.Command {
 	)
 	cmd := &cobra.Command{
 		Use:     "info",
-		Short:   "Send usage stats to Redpanda Data.",
+		Short:   "Send usage stats to Redpanda Data",
 		Hidden:  true,
 		Aliases: []string{"status"},
 		Args:    cobra.ExactArgs(0),
@@ -95,7 +95,7 @@ func NewInfoCommand(fs afero.Fs) *cobra.Command {
 		},
 	}
 	cmd.Flags().StringVar(&configFile, "config", "", "Redpanda config file, if not set the file will be searched for in the default locations")
-	cmd.Flags().BoolVar(&send, "send", false, "If true, send resource usage data to Vectorzed.")
-	cmd.Flags().DurationVar(&timeout, "timeout", 2*time.Second, "How long to wait to calculate the Redpanda CPU % utilization.")
+	cmd.Flags().BoolVar(&send, "send", false, "If true, send resource usage data to Redpanda")
+	cmd.Flags().DurationVar(&timeout, "timeout", 2*time.Second, "How long to wait to calculate the Redpanda CPU % utilization")
 	return cmd
 }
diff --git a/src/go/rpk/pkg/cli/cmd/generate.go b/src/go/rpk/pkg/cli/cmd/generate.go
index dde60c4a9ce66..e52ac000372d4 100644
--- a/src/go/rpk/pkg/cli/cmd/generate.go
+++ b/src/go/rpk/pkg/cli/cmd/generate.go
@@ -18,7 +18,7 @@ import (
 func NewGenerateCommand(fs afero.Fs) *cobra.Command {
 	command := &cobra.Command{
 		Use:   "generate [template]",
-		Short: "Generate a configuration template for related services.",
+		Short: "Generate a configuration template for related services",
 	}
 	command.AddCommand(generate.NewGrafanaDashboardCmd())
 	command.AddCommand(generate.NewPrometheusConfigCmd(fs))
diff --git a/src/go/rpk/pkg/cli/cmd/generate/autocomplete.go b/src/go/rpk/pkg/cli/cmd/generate/autocomplete.go
index d664e0482918a..b0f7139475dd7 100644
--- a/src/go/rpk/pkg/cli/cmd/generate/autocomplete.go
+++ b/src/go/rpk/pkg/cli/cmd/generate/autocomplete.go
@@ -19,7 +19,7 @@ import (
 func NewShellCompletionCommand() *cobra.Command {
 	return &cobra.Command{
 		Use:   "shell-completion",
-		Short: "Generate shell completion commands.",
+		Short: "Generate shell completion commands",
 		Long: `
 Shell completion can help autocomplete rpk commands when you press tab.
 
diff --git a/src/go/rpk/pkg/cli/cmd/generate/grafana.go b/src/go/rpk/pkg/cli/cmd/generate/grafana.go
index 99f3af6987a97..8bfc2daebb3ec 100644
--- a/src/go/rpk/pkg/cli/cmd/generate/grafana.go
+++ b/src/go/rpk/pkg/cli/cmd/generate/grafana.go
@@ -65,7 +65,7 @@ func NewGrafanaDashboardCmd() *cobra.Command {
 	var metricsEndpoint string
 	command := &cobra.Command{
 		Use:   "grafana-dashboard",
-		Short: "Generate a Grafana dashboard for redpanda metrics.",
+		Short: "Generate a Grafana dashboard for redpanda metrics",
 		RunE: func(ccmd *cobra.Command, args []string) error {
 			if !(strings.HasPrefix(metricsEndpoint, "http://") ||
 				strings.HasPrefix(metricsEndpoint, "https://")) {
diff --git a/src/go/rpk/pkg/cli/cmd/generate/prometheus.go b/src/go/rpk/pkg/cli/cmd/generate/prometheus.go
index 8d78a55234039..045deb5a08db4 100644
--- a/src/go/rpk/pkg/cli/cmd/generate/prometheus.go
+++ b/src/go/rpk/pkg/cli/cmd/generate/prometheus.go
@@ -46,7 +46,7 @@ func NewPrometheusConfigCmd(fs afero.Fs) *cobra.Command {
 	)
 	command := &cobra.Command{
 		Use:   "prometheus-config",
-		Short: "Generate the Prometheus configuration to scrape redpanda nodes.",
+		Short: "Generate the Prometheus configuration to scrape redpanda nodes",
 		Long: `
 Generate the Prometheus configuration to scrape redpanda nodes. This command's
 output should be added to the 'scrape_configs' array in your Prometheus
diff --git a/src/go/rpk/pkg/cli/cmd/group/describe.go b/src/go/rpk/pkg/cli/cmd/group/describe.go
index 7b295f63262cf..bbca2fa14bcf5 100644
--- a/src/go/rpk/pkg/cli/cmd/group/describe.go
+++ b/src/go/rpk/pkg/cli/cmd/group/describe.go
@@ -28,7 +28,7 @@ func NewDescribeCommand(fs afero.Fs) *cobra.Command {
 
 	cmd := &cobra.Command{
 		Use:   "describe [GROUPS...]",
-		Short: "Describe group offset status & lag.",
+		Short: "Describe group offset status & lag",
 		Long: `Describe group offset status & lag.
 
 This command describes group members, calculates their lag, and prints detailed
diff --git a/src/go/rpk/pkg/cli/cmd/group/group.go b/src/go/rpk/pkg/cli/cmd/group/group.go
index 4f51ab9e2bdb1..c411375222f84 100644
--- a/src/go/rpk/pkg/cli/cmd/group/group.go
+++ b/src/go/rpk/pkg/cli/cmd/group/group.go
@@ -25,7 +25,7 @@ func NewCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:     "group",
 		Aliases: []string{"g"},
-		Short:   `Describe, list, and delete consumer groups and manage their offsets.`,
+		Short:   `Describe, list, and delete consumer groups and manage their offsets`,
 		Long: `Describe, list, and delete consumer groups and manage their offsets.
 
 Consumer groups allow you to horizontally scale consuming from topics. A
@@ -108,7 +108,7 @@ func newListCommand(fs afero.Fs) *cobra.Command {
 	return &cobra.Command{
 		Use:     "list",
 		Aliases: []string{"ls"},
-		Short:   "List all groups.",
+		Short:   "List all groups",
 		Long: `List all groups.
 
 This command lists all groups currently known to Redpanda, including empty
@@ -144,7 +144,7 @@ groups, or to list groups that need to be cleaned up.
 func newDeleteCommand(fs afero.Fs) *cobra.Command {
 	return &cobra.Command{
 		Use:   "delete [GROUPS...]",
-		Short: "Delete groups from brokers.",
+		Short: "Delete groups from brokers",
 		Long: `Delete groups from brokers.
 
 Older versions of the Kafka protocol included a retention_millis field in
diff --git a/src/go/rpk/pkg/cli/cmd/group/seek.go b/src/go/rpk/pkg/cli/cmd/group/seek.go
index 8118b994a158e..4caa965013bb4 100644
--- a/src/go/rpk/pkg/cli/cmd/group/seek.go
+++ b/src/go/rpk/pkg/cli/cmd/group/seek.go
@@ -35,7 +35,7 @@ func newSeekCommand(fs afero.Fs) *cobra.Command {
 
 	cmd := &cobra.Command{
 		Use:   "seek [GROUP] --to (start|end|timestamp) --to-group ... --topics ...",
-		Short: "Modify a group's current offsets.",
+		Short: "Modify a group's current offsets",
 		Long: `Modify a group's current offsets.
 
 This command allows you to modify a group's offsets. Sometimes, you may need to
diff --git a/src/go/rpk/pkg/cli/cmd/iotune.go b/src/go/rpk/pkg/cli/cmd/iotune.go
index edc33f7712028..c6405e05e1012 100644
--- a/src/go/rpk/pkg/cli/cmd/iotune.go
+++ b/src/go/rpk/pkg/cli/cmd/iotune.go
@@ -33,7 +33,7 @@ func NewIoTuneCmd(fs afero.Fs) *cobra.Command {
 	)
 	command := &cobra.Command{
 		Use:   "iotune",
-		Short: "Measure filesystem performance and create IO configuration file.",
+		Short: "Measure filesystem performance and create IO configuration file",
 		Run: func(cmd *cobra.Command, args []string) {
 			timeout += duration
 			p := config.ParamsFromCommand(cmd)
@@ -57,7 +57,7 @@ func NewIoTuneCmd(fs afero.Fs) *cobra.Command {
 		"config",
 		"",
 		"Redpanda config file, if not set the file will be searched for"+
-			" in the default locations.",
+			" in the default locations",
 	)
 	command.Flags().StringVar(
 		&outputFile,
diff --git a/src/go/rpk/pkg/cli/cmd/plugin/plugin.go b/src/go/rpk/pkg/cli/cmd/plugin/plugin.go
index 634f5175e4a5d..daac9d785ec03 100644
--- a/src/go/rpk/pkg/cli/cmd/plugin/plugin.go
+++ b/src/go/rpk/pkg/cli/cmd/plugin/plugin.go
@@ -17,7 +17,7 @@ const urlBase = "https://vectorized-public.s3.us-west-2.amazonaws.com/rpk-plugin
 func NewCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:   "plugin",
-		Short: "List, download, update, and remove rpk plugins.",
+		Short: "List, download, update, and remove rpk plugins",
 		Long: `List, download, update, and remove rpk plugins.
 	
 Plugins augment rpk with new commands.
@@ -76,7 +76,7 @@ func newListCommand(fs afero.Fs) *cobra.Command {
 
 	cmd := &cobra.Command{
 		Use:   "list",
-		Short: "List all available plugins.",
+		Short: "List all available plugins",
 		Long: `List all available plugins.
 
 By default, this command fetches the remote manifest and prints plugins
@@ -156,7 +156,7 @@ func newInstallCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:     "install [PLUGIN]",
 		Aliases: []string{"download"},
-		Short:   "Install an rpk plugin.",
+		Short:   "Install an rpk plugin",
 		Long: `Install an rpk plugin.
 
 An rpk plugin must be saved in a directory that is in your $PATH. By default,
@@ -246,7 +246,7 @@ func newUninstallCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:     "uninstall [NAME]",
 		Aliases: []string{"rm"},
-		Short:   "Uninstall / remove an existing local plugin.",
+		Short:   "Uninstall / remove an existing local plugin",
 		Long: `Uninstall / remove an existing local plugin.
 
 This command lists locally installed plugins and removes the first plugin that
diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/admin/admin.go b/src/go/rpk/pkg/cli/cmd/redpanda/admin/admin.go
index da326b218c4b5..de87c05e1e8b3 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/admin/admin.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/admin/admin.go
@@ -24,7 +24,7 @@ import (
 func NewCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:   "admin",
-		Short: "Talk to the Redpanda admin listener.",
+		Short: "Talk to the Redpanda admin listener",
 		Args:  cobra.ExactArgs(0),
 	}
 
@@ -50,7 +50,7 @@ func NewCommand(fs afero.Fs) *cobra.Command {
 		config.FlagAdminHosts1,
 		[]string{},
 		"A comma-separated list of Admin API addresses (<IP>:<port>)."+
-			" You must specify one for each node.",
+			" You must specify one for each node",
 	)
 
 	common.AddAdminAPITLSFlags(
diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/admin/brokers/brokers.go b/src/go/rpk/pkg/cli/cmd/redpanda/admin/brokers/brokers.go
index bd08a9d4e17c6..fa09963ee7fae 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/admin/brokers/brokers.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/admin/brokers/brokers.go
@@ -26,7 +26,7 @@ import (
 func NewCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:   "brokers",
-		Short: "View and configure Redpanda brokers through the admin listener.",
+		Short: "View and configure Redpanda brokers through the admin listener",
 		Args:  cobra.ExactArgs(0),
 	}
 	cmd.AddCommand(
@@ -41,7 +41,7 @@ func newListCommand(fs afero.Fs) *cobra.Command {
 	return &cobra.Command{
 		Use:     "list",
 		Aliases: []string{"ls"},
-		Short:   "List the brokers in your cluster.",
+		Short:   "List the brokers in your cluster",
 		Args:    cobra.ExactArgs(0),
 		Run: func(cmd *cobra.Command, _ []string) {
 			p := config.ParamsFromCommand(cmd)
@@ -82,7 +82,7 @@ func newListCommand(fs afero.Fs) *cobra.Command {
 func newDecommissionBroker(fs afero.Fs) *cobra.Command {
 	return &cobra.Command{
 		Use:   "decommission [BROKER ID]",
-		Short: "Decommission the given broker.",
+		Short: "Decommission the given broker",
 		Long: `Decommission the given broker.
 
 Decommissioning a broker removes it from the cluster.
@@ -116,7 +116,7 @@ leader handles the request.
 func newRecommissionBroker(fs afero.Fs) *cobra.Command {
 	return &cobra.Command{
 		Use:   "recommission [BROKER ID]",
-		Short: "Recommission the given broker if it is still decommissioning.",
+		Short: "Recommission the given broker if it is still decommissioning",
 		Long: `Recommission the given broker if is is still decommissioning.
 
 Recommissioning can stop an active decommission.
diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/admin/config/config.go b/src/go/rpk/pkg/cli/cmd/redpanda/admin/config/config.go
index fdcd40f81b55e..a73cf6839480b 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/admin/config/config.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/admin/config/config.go
@@ -20,7 +20,7 @@ import (
 func NewCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:   "config",
-		Short: "View or modify Redpanda configuration through the admin listener.",
+		Short: "View or modify Redpanda configuration through the admin listener",
 		Args:  cobra.ExactArgs(0),
 	}
 	cmd.AddCommand(
@@ -35,7 +35,7 @@ func newPrintCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:     "print",
 		Aliases: []string{"dump", "list", "ls", "display"},
-		Short:   "Display the current Redpanda configuration.",
+		Short:   "Display the current Redpanda configuration",
 		Args:    cobra.ExactArgs(0),
 		Run: func(cmd *cobra.Command, _ []string) {
 			p := config.ParamsFromCommand(cmd)
@@ -65,7 +65,7 @@ func newPrintCommand(fs afero.Fs) *cobra.Command {
 func newLogLevelCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:   "log-level",
-		Short: "Manage a broker's log level.",
+		Short: "Manage a broker's log level",
 		Args:  cobra.ExactArgs(0),
 	}
 	cmd.AddCommand(
@@ -81,7 +81,7 @@ func newLogLevelSetCommand(fs afero.Fs) *cobra.Command {
 
 	cmd := &cobra.Command{
 		Use:   "set [LOGGERS...]",
-		Short: "Set broker logger's log level.",
+		Short: "Set broker logger's log level",
 		Long: `Set broker logger's log level.
 
 This command temporarily changes a broker logger's log level. Each Redpanda
diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/admin/partitions/partitions.go b/src/go/rpk/pkg/cli/cmd/redpanda/admin/partitions/partitions.go
index 6254e621f90a5..6bee4efa0f7cb 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/admin/partitions/partitions.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/admin/partitions/partitions.go
@@ -27,7 +27,7 @@ import (
 func NewCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:   "partitions",
-		Short: "View and configure Redpanda partitions through the admin listener.",
+		Short: "View and configure Redpanda partitions through the admin listener",
 		Args:  cobra.ExactArgs(0),
 	}
 	cmd.AddCommand(
@@ -41,7 +41,7 @@ func newListCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:     "list [BROKER ID]",
 		Aliases: []string{"ls"},
-		Short:   "List the partitions in a broker in the cluster.",
+		Short:   "List the partitions in a broker in the cluster",
 		Args:    cobra.ExactArgs(1),
 		Run: func(cmd *cobra.Command, args []string) {
 			brokerID, err := strconv.Atoi(args[0])
diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/check.go b/src/go/rpk/pkg/cli/cmd/redpanda/check.go
index 1b69922123130..36b7466d20e7d 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/check.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/check.go
@@ -34,7 +34,7 @@ func NewCheckCommand(fs afero.Fs) *cobra.Command {
 	)
 	command := &cobra.Command{
 		Use:   "check",
-		Short: "Check if system meets redpanda requirements.",
+		Short: "Check if system meets redpanda requirements",
 		Run: func(cmd *cobra.Command, args []string) {
 			p := config.ParamsFromCommand(cmd)
 			cfg, err := p.Load(fs)
diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/config.go b/src/go/rpk/pkg/cli/cmd/redpanda/config.go
index 8412479faae90..3daa5575bee94 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/config.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/config.go
@@ -33,7 +33,7 @@ const (
 func NewConfigCommand(fs afero.Fs) *cobra.Command {
 	root := &cobra.Command{
 		Use:   "config <command>",
-		Short: "Edit configuration.",
+		Short: "Edit configuration",
 	}
 	root.AddCommand(set(fs))
 	root.AddCommand(bootstrap(fs))
diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/mode.go b/src/go/rpk/pkg/cli/cmd/redpanda/mode.go
index 8019b1fcc6095..4433f0948390d 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/mode.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/mode.go
@@ -26,7 +26,7 @@ func NewModeCommand(fs afero.Fs) *cobra.Command {
 	var configFile string
 	command := &cobra.Command{
 		Use:   "mode <mode>",
-		Short: "Enable a default configuration mode.",
+		Short: "Enable a default configuration mode",
 		Long:  "",
 		Args: func(_ *cobra.Command, args []string) error {
 			if len(args) < 1 {
diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/start.go b/src/go/rpk/pkg/cli/cmd/redpanda/start.go
index fe9f5e779dcd6..565eacb4e21b8 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/start.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/start.go
@@ -134,7 +134,7 @@ func NewStartCommand(fs afero.Fs, launcher rp.Launcher) *cobra.Command {
 
 	command := &cobra.Command{
 		Use:   "start",
-		Short: "Start redpanda.",
+		Short: "Start redpanda",
 		FParseErrWhitelist: cobra.FParseErrWhitelist{
 			// Allow unknown flags so that arbitrary flags can be passed
 			// through to redpanda/seastar without the need to pass '--'
diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/stop.go b/src/go/rpk/pkg/cli/cmd/redpanda/stop.go
index 837b3f6177e28..f82aabdb61d80 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/stop.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/stop.go
@@ -34,7 +34,7 @@ func NewStopCommand(fs afero.Fs) *cobra.Command {
 	)
 	command := &cobra.Command{
 		Use:   "stop",
-		Short: "Stop redpanda.",
+		Short: "Stop redpanda",
 		Long: `Stop a local redpanda process. 'rpk stop'
 first sends SIGINT, and waits for the specified timeout. Then, if redpanda
 hasn't stopped, it sends SIGTERM. Lastly, it sends SIGKILL if it's still
diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/tune/help.go b/src/go/rpk/pkg/cli/cmd/redpanda/tune/help.go
index 658e27389e31d..e22a0065edf11 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/tune/help.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/tune/help.go
@@ -34,7 +34,7 @@ func newHelpCommand() *cobra.Command {
 
 	return &cobra.Command{
 		Use:   "help <tuner>",
-		Short: "Display detailed information about the tuner.",
+		Short: "Display detailed information about the tuner",
 		Args: func(_ *cobra.Command, args []string) error {
 			if len(args) != 1 {
 				return errors.New("requires the tuner name")
diff --git a/src/go/rpk/pkg/cli/cmd/root.go b/src/go/rpk/pkg/cli/cmd/root.go
index 947ed95fe0598..f630bab2a9885 100644
--- a/src/go/rpk/pkg/cli/cmd/root.go
+++ b/src/go/rpk/pkg/cli/cmd/root.go
@@ -56,11 +56,11 @@ func Execute() {
 
 	root := &cobra.Command{
 		Use:   "rpk",
-		Short: "rpk is the Redpanda CLI & toolbox.",
+		Short: "rpk is the Redpanda CLI & toolbox",
 		Long:  "",
 	}
 	root.PersistentFlags().BoolVarP(&verbose, config.FlagVerbose,
-		"v", false, "Enable verbose logging (default: false).")
+		"v", false, "Enable verbose logging (default: false)")
 
 	root.AddCommand(
 		NewGenerateCommand(fs),
@@ -114,6 +114,13 @@ func Execute() {
 		}
 	}
 
+	// Cobra generates the help flag usage as "help for <command>". To override
+	// that message (capitalizing the first letter), we define the help flag
+	// ourselves on every command. See: spf13/cobra#480
+	walk(root, func(c *cobra.Command) {
+		c.Flags().BoolP("help", "h", false, "Help for "+c.Name())
+	})
+
 	err := root.Execute()
 	if len(os.Args) > 1 {
 		switch os.Args[1] {
@@ -423,3 +430,11 @@ func (*osPluginHandler) exec(path string, args []string) error {
 	}
 	return syscall.Exec(path, args, env)
 }
+
+// walk applies f to cmd-tree root c, then recursively to each subcommand.
+func walk(c *cobra.Command, f func(*cobra.Command)) {
+	f(c)
+	for _, child := range c.Commands() {
+		walk(child, f)
+	}
+}
diff --git a/src/go/rpk/pkg/cli/cmd/topic.go b/src/go/rpk/pkg/cli/cmd/topic.go
index c523988c26abf..0948f382934b0 100644
--- a/src/go/rpk/pkg/cli/cmd/topic.go
+++ b/src/go/rpk/pkg/cli/cmd/topic.go
@@ -30,7 +30,7 @@ func NewTopicCommand(fs afero.Fs) *cobra.Command {
 	)
 	command := &cobra.Command{
 		Use:   "topic",
-		Short: "Create, delete, produce to and consume from Redpanda topics.",
+		Short: "Create, delete, produce to and consume from Redpanda topics",
 	}
 
 	common.AddKafkaFlags(command, &configFile, &user, &password, &mechanism, &enableTLS, &certFile, &keyFile, &truststoreFile, &brokers)
diff --git a/src/go/rpk/pkg/cli/cmd/topic/add_partitions.go b/src/go/rpk/pkg/cli/cmd/topic/add_partitions.go
index d6b208d78aa08..d7b0c20da8ac2 100644
--- a/src/go/rpk/pkg/cli/cmd/topic/add_partitions.go
+++ b/src/go/rpk/pkg/cli/cmd/topic/add_partitions.go
@@ -27,7 +27,7 @@ func NewAddPartitionsCommand(fs afero.Fs) *cobra.Command {
 	var num int
 	cmd := &cobra.Command{
 		Use:   "add-partitions [TOPICS...] --num [#]",
-		Short: "Add partitions to existing topics.",
+		Short: "Add partitions to existing topics",
 		Args:  cobra.MinimumNArgs(1),
 		Long:  `Add partitions to existing topics.`,
 		Run: func(cmd *cobra.Command, topics []string) {
@@ -70,6 +70,6 @@ func NewAddPartitionsCommand(fs afero.Fs) *cobra.Command {
 			}
 		},
 	}
-	cmd.Flags().IntVarP(&num, "num", "n", 0, "numer of partitions to add to each topic")
+	cmd.Flags().IntVarP(&num, "num", "n", 0, "Number of partitions to add to each topic")
 	return cmd
 }
diff --git a/src/go/rpk/pkg/cli/cmd/topic/config.go b/src/go/rpk/pkg/cli/cmd/topic/config.go
index 7f1fdb4c1aa29..51eaa56b1c27d 100644
--- a/src/go/rpk/pkg/cli/cmd/topic/config.go
+++ b/src/go/rpk/pkg/cli/cmd/topic/config.go
@@ -33,7 +33,7 @@ func NewAlterConfigCommand(fs afero.Fs) *cobra.Command {
 
 	cmd := &cobra.Command{
 		Use:   "alter-config [TOPICS...] --set key=value --delete key2,key3",
-		Short: `Set, delete, add, and remove key/value configs for a topic.`,
+		Short: `Set, delete, add, and remove key/value configs for a topic`,
 		Long: `Set, delete, add, and remove key/value configs for a topic.
 
 This command allows you to incrementally alter the configuration for multiple
@@ -132,7 +132,7 @@ valid, but does not apply it.
 	cmd.Flags().StringArrayVar(&appends, "append", nil, "key=value; Value to append to a list-of-values key (repeatable)")
 	cmd.Flags().StringArrayVar(&subtracts, "subtract", nil, "key=value; Value to remove from list-of-values key (repeatable)")
 
-	cmd.Flags().BoolVar(&dry, "dry", false, "dry run: validate the alter request, but do not apply")
+	cmd.Flags().BoolVar(&dry, "dry", false, "Dry run: validate the alter request, but do not apply")
 
 	return cmd
 }
diff --git a/src/go/rpk/pkg/cli/cmd/topic/consume.go b/src/go/rpk/pkg/cli/cmd/topic/consume.go
index 13423edcc3c67..71c39ca179c65 100644
--- a/src/go/rpk/pkg/cli/cmd/topic/consume.go
+++ b/src/go/rpk/pkg/cli/cmd/topic/consume.go
@@ -66,7 +66,7 @@ func NewConsumeCommand(fs afero.Fs) *cobra.Command {
 
 	cmd := &cobra.Command{
 		Use:   "consume TOPICS...",
-		Short: "Consume records from topics.",
+		Short: "Consume records from topics",
 		Long:  helpConsume,
 		Args:  cobra.MinimumNArgs(1),
 		Run: func(cmd *cobra.Command, topics []string) {
@@ -126,7 +126,7 @@ func NewConsumeCommand(fs afero.Fs) *cobra.Command {
 	cmd.Flags().Int32SliceVarP(&c.partitions, "partitions", "p", nil, "Comma delimited list of specific partitions to consume")
 	cmd.Flags().BoolVarP(&c.regex, "regex", "r", false, "Parse topics as regex; consume any topic that matches any expression")
 
-	cmd.Flags().StringVarP(&c.group, "group", "g", "", "group to use for consuming (incompatible with -p)")
+	cmd.Flags().StringVarP(&c.group, "group", "g", "", "Group to use for consuming (incompatible with -p)")
 	cmd.Flags().StringVarP(&c.balancer, "balancer", "b", "cooperative-sticky", "Group balancer to use if group consuming (range, roundrobin, sticky, cooperative-sticky)")
 
 	cmd.Flags().Int32Var(&c.fetchMaxBytes, "fetch-max-bytes", 1<<20, "Maximum amount of bytes per fetch request per broker")
@@ -140,7 +140,7 @@ func NewConsumeCommand(fs afero.Fs) *cobra.Command {
 
 	// Deprecated.
 	cmd.Flags().BoolVar(new(bool), "commit", false, "")
-	cmd.Flags().MarkDeprecated("commit", "group consuming always commits")
+	cmd.Flags().MarkDeprecated("commit", "Group consuming always commits")
 
 	return cmd
 }
diff --git a/src/go/rpk/pkg/cli/cmd/topic/create.go b/src/go/rpk/pkg/cli/cmd/topic/create.go
index 2ff8f4cb652c6..e21e4a09189db 100644
--- a/src/go/rpk/pkg/cli/cmd/topic/create.go
+++ b/src/go/rpk/pkg/cli/cmd/topic/create.go
@@ -34,7 +34,7 @@ func NewCreateCommand(fs afero.Fs) *cobra.Command {
 	)
 	cmd := &cobra.Command{
 		Use:   "create [TOPICS...]",
-		Short: "Create topics.",
+		Short: "Create topics",
 		Args:  cobra.MinimumNArgs(1),
 		Long: `Create topics.
 
@@ -117,11 +117,11 @@ the cleanup.policy=compact config option set.
 	cmd.Flags().StringArrayVarP(&configKVs, "topic-config", "c", nil, "key=value; Config parameters (repeatable; e.g. -c cleanup.policy=compact)")
 	cmd.Flags().Int32VarP(&partitions, "partitions", "p", -1, "Number of partitions to create per topic; -1 defaults to the cluster's default_topic_partitions")
 	cmd.Flags().Int16VarP(&replicas, "replicas", "r", -1, "Replication factor (must be odd); -1 defaults to the cluster's default_topic_replications")
-	cmd.Flags().BoolVarP(&dry, "dry", "d", false, "dry run: validate the topic creation request; do not create topics")
+	cmd.Flags().BoolVarP(&dry, "dry", "d", false, "Dry run: validate the topic creation request; do not create topics")
 
 	// Sept 2021
-	cmd.Flags().BoolVar(&compact, "compact", false, "alias for -c cleanup.policy=compact")
-	cmd.Flags().MarkDeprecated("compact", "use -c cleanup.policy=compact")
+	cmd.Flags().BoolVar(&compact, "compact", false, "Alias for -c cleanup.policy=compact")
+	cmd.Flags().MarkDeprecated("compact", "Use -c cleanup.policy=compact")
 
 	return cmd
 }
diff --git a/src/go/rpk/pkg/cli/cmd/topic/delete.go b/src/go/rpk/pkg/cli/cmd/topic/delete.go
index 56290062694fd..08d6d547e4aed 100644
--- a/src/go/rpk/pkg/cli/cmd/topic/delete.go
+++ b/src/go/rpk/pkg/cli/cmd/topic/delete.go
@@ -23,7 +23,7 @@ func NewDeleteCommand(fs afero.Fs) *cobra.Command {
 	var re bool
 	cmd := &cobra.Command{
 		Use:   "delete [TOPICS...]",
-		Short: "Delete topics.",
+		Short: "Delete topics",
 		Long: `Delete topics.
 
 This command deletes all requested topics, printing the success or fail status
@@ -75,6 +75,6 @@ For example,
 			}
 		},
 	}
-	cmd.Flags().BoolVarP(&re, "regex", "r", false, "parse topics as regex; delete any topic that matches any input topic expression")
+	cmd.Flags().BoolVarP(&re, "regex", "r", false, "Parse topics as regex; delete any topic that matches any input topic expression")
 	return cmd
 }
diff --git a/src/go/rpk/pkg/cli/cmd/topic/describe.go b/src/go/rpk/pkg/cli/cmd/topic/describe.go
index 77bc9062d2c7d..ba2308eea8afb 100644
--- a/src/go/rpk/pkg/cli/cmd/topic/describe.go
+++ b/src/go/rpk/pkg/cli/cmd/topic/describe.go
@@ -37,7 +37,7 @@ func NewDescribeCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:     "describe [TOPIC]",
 		Aliases: []string{"info"},
-		Short:   "Describe a topic.",
+		Short:   "Describe a topic",
 		Long: `Describe a topic.
 
 This command prints detailed information about a topic. There are three
@@ -179,10 +179,10 @@ partitions section. By default, the summary and configs sections are printed.
 	cmd.Flags().MarkDeprecated("watermarks", "deprecated - watermarks are always printed if the partition section is requested")
 	cmd.Flags().MarkDeprecated("detailed", "deprecated - info has been merged into describe, use -p to print detailed information")
 
-	cmd.Flags().BoolVarP(&summary, "print-summary", "s", false, "print the summary section")
-	cmd.Flags().BoolVarP(&configs, "print-configs", "c", false, "print the config section")
-	cmd.Flags().BoolVarP(&partitions, "print-partitions", "p", false, "print the detailed partitions section")
-	cmd.Flags().BoolVarP(&all, "print-all", "a", false, "print all sections")
+	cmd.Flags().BoolVarP(&summary, "print-summary", "s", false, "Print the summary section")
+	cmd.Flags().BoolVarP(&configs, "print-configs", "c", false, "Print the config section")
+	cmd.Flags().BoolVarP(&partitions, "print-partitions", "p", false, "Print the detailed partitions section")
+	cmd.Flags().BoolVarP(&all, "print-all", "a", false, "Print all sections")
 
 	return cmd
 }
diff --git a/src/go/rpk/pkg/cli/cmd/topic/list.go b/src/go/rpk/pkg/cli/cmd/topic/list.go
index 77e1c86727630..07383965160ad 100644
--- a/src/go/rpk/pkg/cli/cmd/topic/list.go
+++ b/src/go/rpk/pkg/cli/cmd/topic/list.go
@@ -29,7 +29,7 @@ func NewListCommand(fs afero.Fs) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:     "list",
 		Aliases: []string{"ls"},
-		Short:   "List topics, optionally listing specific topics.",
+		Short:   "List topics, optionally listing specific topics",
 		Long: `List topics, optionally listing specific topics.
 
 This command lists all topics that you have access to by default. If specifying
@@ -78,8 +78,8 @@ information.
 		},
 	}
 
-	cmd.Flags().BoolVarP(&detailed, "detailed", "d", false, "print per-partition information for topics")
-	cmd.Flags().BoolVarP(&internal, "internal", "i", false, "print internal topics")
-	cmd.Flags().BoolVarP(&re, "regex", "r", false, "parse topics as regex; list any topic that matches any input topic expression")
+	cmd.Flags().BoolVarP(&detailed, "detailed", "d", false, "Print per-partition information for topics")
+	cmd.Flags().BoolVarP(&internal, "internal", "i", false, "Print internal topics")
+	cmd.Flags().BoolVarP(&re, "regex", "r", false, "Parse topics as regex; list any topic that matches any input topic expression")
 	return cmd
 }
diff --git a/src/go/rpk/pkg/cli/cmd/topic/produce.go b/src/go/rpk/pkg/cli/cmd/topic/produce.go
index 4f95227a9eaa1..6a648b7fa38de 100644
--- a/src/go/rpk/pkg/cli/cmd/topic/produce.go
+++ b/src/go/rpk/pkg/cli/cmd/topic/produce.go
@@ -43,7 +43,7 @@ func NewProduceCommand(fs afero.Fs) *cobra.Command {
 
 	cmd := &cobra.Command{
 		Use:   "produce [TOPIC]",
-		Short: "Produce records to a topic.",
+		Short: "Produce records to a topic",
 		Long:  helpProduce,
 		Args:  cobra.MaximumNArgs(1),
 		Run: func(cmd *cobra.Command, args []string) {
@@ -160,10 +160,10 @@ func NewProduceCommand(fs afero.Fs) *cobra.Command {
 	// The following flags require parsing before we initialize our client.
 	cmd.Flags().StringVarP(&compression, "compression", "z", "snappy", "Compression to use for producing batches (none, gzip, snapy, lz4, zstd)")
 	cmd.Flags().IntVar(&acks, "acks", -1, "Number of acks required for producing (-1=all, 0=none, 1=leader)")
-	cmd.Flags().DurationVar(&timeout, "delivery-timeout", 0, "per-record delivery timeout, if non-zero, min 1s")
-	cmd.Flags().Int32VarP(&partition, "partition", "p", -1, "partition to directly produce to, if non-negative (also allows %p parsing to set partitions)")
+	cmd.Flags().DurationVar(&timeout, "delivery-timeout", 0, "Per-record delivery timeout, if non-zero, min 1s")
+	cmd.Flags().Int32VarP(&partition, "partition", "p", -1, "Partition to directly produce to, if non-negative (also allows %p parsing to set partitions)")
 
-	cmd.Flags().StringVarP(&inFormat, "format", "f", "%v\n", "input record format")
+	cmd.Flags().StringVarP(&inFormat, "format", "f", "%v\n", "Input record format")
 	cmd.Flags().StringVarP(
 		&outFormat,
 		"output-format",
@@ -173,15 +173,15 @@ func NewProduceCommand(fs afero.Fs) *cobra.Command {
 	)
 	cmd.Flags().StringArrayVarP(&recHeaders, "header", "H", nil, "Headers in format key:value to add to each record (repeatable)")
 	cmd.Flags().StringVarP(&key, "key", "k", "", "A fixed key to use for each record (parsed input keys take precedence)")
-	cmd.Flags().BoolVarP(&tombstone, "tombstone", "Z", false, "produce empty values as tombstones")
+	cmd.Flags().BoolVarP(&tombstone, "tombstone", "Z", false, "Produce empty values as tombstones")
 
 	// Deprecated
 	cmd.Flags().IntVarP(new(int), "num", "n", 1, "")
-	cmd.Flags().MarkDeprecated("num", "invoke rpk multiple times if you wish to repeat records")
+	cmd.Flags().MarkDeprecated("num", "Invoke rpk multiple times if you wish to repeat records")
 	cmd.Flags().BoolVarP(new(bool), "jvm-partitioner", "j", false, "")
-	cmd.Flags().MarkDeprecated("jvm-partitioner", "the default is now the jvm-partitioner")
+	cmd.Flags().MarkDeprecated("jvm-partitioner", "The default is now the jvm-partitioner")
 	cmd.Flags().StringVarP(new(string), "timestamp", "t", "", "")
-	cmd.Flags().MarkDeprecated("timestamp", "record timestamps are set when producing")
+	cmd.Flags().MarkDeprecated("timestamp", "Record timestamps are set when producing")
 
 	return cmd
 }
diff --git a/src/go/rpk/pkg/cli/cmd/version.go b/src/go/rpk/pkg/cli/cmd/version.go
index de67ec4cc4e61..cd8c251bc3fdf 100644
--- a/src/go/rpk/pkg/cli/cmd/version.go
+++ b/src/go/rpk/pkg/cli/cmd/version.go
@@ -19,7 +19,7 @@ import (
 func NewVersionCommand() *cobra.Command {
 	command := &cobra.Command{
 		Use:   "version",
-		Short: "Check the current version.",
+		Short: "Check the current version",
 		Long:  "",
 		Run: func(_ *cobra.Command, _ []string) {
 			log.SetFormatter(cli.NewNoopFormatter())
diff --git a/src/go/rpk/pkg/cli/cmd/wasm.go b/src/go/rpk/pkg/cli/cmd/wasm.go
index 59199c2f03875..1492125907c82 100644
--- a/src/go/rpk/pkg/cli/cmd/wasm.go
+++ b/src/go/rpk/pkg/cli/cmd/wasm.go
@@ -31,7 +31,7 @@ func NewWasmCommand(fs afero.Fs) *cobra.Command {
 
 	command := &cobra.Command{
 		Use:   "wasm",
-		Short: "Deploy and remove inline WASM engine scripts.",
+		Short: "Deploy and remove inline WASM engine scripts",
 	}
 	common.AddKafkaFlags(
 		command,
diff --git a/src/go/rpk/pkg/cli/cmd/wasm/deploy.go b/src/go/rpk/pkg/cli/cmd/wasm/deploy.go
index e7ce367753c34..a88eaf48db01d 100644
--- a/src/go/rpk/pkg/cli/cmd/wasm/deploy.go
+++ b/src/go/rpk/pkg/cli/cmd/wasm/deploy.go
@@ -19,7 +19,7 @@ func NewDeployCommand(fs afero.Fs) *cobra.Command {
 	)
 	cmd := &cobra.Command{
 		Use:   "deploy [PATH]",
-		Short: "Deploy inline WASM function.",
+		Short: "Deploy inline WASM function",
 		Args:  cobra.ExactArgs(1),
 		Run: func(cmd *cobra.Command, args []string) {
 			p := config.ParamsFromCommand(cmd)
@@ -49,9 +49,9 @@ func NewDeployCommand(fs afero.Fs) *cobra.Command {
 		},
 	}
 
-	cmd.Flags().StringVar(&description, "description", "", "optional description about what the wasm function does")
+	cmd.Flags().StringVar(&description, "description", "", "Optional description about what the wasm function does")
 	cmd.Flags().StringVar(&coprocType, "type", "async", "WASM engine type (async, data-policy)")
-	cmd.Flags().StringVar(&name, "name", "", "unique deploy identifier attached to the instance of this script")
+	cmd.Flags().StringVar(&name, "name", "", "Unique deploy identifier attached to the instance of this script")
 	cmd.MarkFlagRequired("name")
 	return cmd
 }
diff --git a/src/go/rpk/pkg/cli/cmd/wasm/generate.go b/src/go/rpk/pkg/cli/cmd/wasm/generate.go
index f9f8532a4ac95..c345aa3649927 100644
--- a/src/go/rpk/pkg/cli/cmd/wasm/generate.go
+++ b/src/go/rpk/pkg/cli/cmd/wasm/generate.go
@@ -30,7 +30,7 @@ func NewGenerateCommand(fs afero.Fs) *cobra.Command {
 	var skipVersion bool
 	cmd := &cobra.Command{
 		Use:   "generate [PROJECT DIRECTORY]",
-		Short: "Create a npm template project for inline WASM engine.",
+		Short: "Create a npm template project for inline WASM engine",
 		Args:  cobra.ExactArgs(1),
 		Run: func(_ *cobra.Command, args []string) {
 			path, err := filepath.Abs(args[0])
@@ -39,7 +39,7 @@ func NewGenerateCommand(fs afero.Fs) *cobra.Command {
 			out.MaybeDie(err, "unable to generate all manifest files: %v", err)
 		},
 	}
-	cmd.Flags().BoolVar(&skipVersion, "skip-version", false, "omit wasm-api version check from npm, use default instead")
+	cmd.Flags().BoolVar(&skipVersion, "skip-version", false, "Omit wasm-api version check from npm, use default instead")
 	return cmd
 }
 
diff --git a/src/go/rpk/pkg/cli/cmd/wasm/remove.go b/src/go/rpk/pkg/cli/cmd/wasm/remove.go
index 01c522568c523..c35fd1057b0eb 100644
--- a/src/go/rpk/pkg/cli/cmd/wasm/remove.go
+++ b/src/go/rpk/pkg/cli/cmd/wasm/remove.go
@@ -15,7 +15,7 @@ func NewRemoveCommand(fs afero.Fs) *cobra.Command {
 
 	cmd := &cobra.Command{
 		Use:   "remove [NAME]",
-		Short: "Remove inline WASM function.",
+		Short: "Remove inline WASM function",
 		Args:  cobra.ExactArgs(1),
 		Run: func(cmd *cobra.Command, args []string) {
 			p := config.ParamsFromCommand(cmd)

From 7aa1607eeaab4a59737e865f6c40731d84cf52f9 Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Wed, 13 Jul 2022 13:24:47 +0100
Subject: [PATCH 082/201] pandaproxy/handler: Convert handler_adaptor to a
 coroutine

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/pandaproxy/server.cc | 40 +++++++++++++-------------------------
 1 file changed, 13 insertions(+), 27 deletions(-)

diff --git a/src/v/pandaproxy/server.cc b/src/v/pandaproxy/server.cc
index 0dd564a67ccf1..23a945f6ad3bb 100644
--- a/src/v/pandaproxy/server.cc
+++ b/src/v/pandaproxy/server.cc
@@ -80,33 +80,19 @@ struct handler_adaptor : ss::httpd::handler_base {
       const ss::sstring&,
       std::unique_ptr<ss::request> req,
       std::unique_ptr<ss::reply> rep) final {
-        return ss::try_with_gate(
-          _pending_requests,
-          [this,
-           req{std::move(req)},
-           rep{std::move(rep)},
-           m = _probe.hist().auto_measure()]() mutable {
-              server::request_t rq{std::move(req), this->_ctx};
-              server::reply_t rp{std::move(rep)};
-              auto req_size = get_request_size(*rq.req);
-
-              return ss::with_semaphore(
-                       _ctx.mem_sem,
-                       req_size,
-                       [this, rq{std::move(rq)}, rp{std::move(rp)}]() mutable {
-                           if (_ctx.as.abort_requested()) {
-                               set_reply_unavailable(*rp.rep);
-                               return ss::make_ready_future<
-                                 std::unique_ptr<ss::reply>>(std::move(rp.rep));
-                           }
-                           return _handler(std::move(rq), std::move(rp))
-                             .then([](server::reply_t rp) {
-                                 set_mime_type(*rp.rep, rp.mime_type);
-                                 return std::move(rp.rep);
-                             });
-                       })
-                .finally([m{std::move(m)}]() {});
-          });
+        auto measure = _probe.hist().auto_measure();
+        auto guard = gate_guard(_pending_requests);
+        server::request_t rq{std::move(req), this->_ctx};
+        server::reply_t rp{std::move(rep)};
+        auto req_size = get_request_size(*rq.req);
+        auto sem_units = co_await ss::get_units(_ctx.mem_sem, req_size);
+        if (_ctx.as.abort_requested()) {
+            set_reply_unavailable(*rp.rep);
+            co_return std::move(rp.rep);
+        }
+        rp = co_await _handler(std::move(rq), std::move(rp));
+        set_mime_type(*rp.rep, rp.mime_type);
+        co_return std::move(rp.rep);
     }
 
     ss::gate& _pending_requests;

From b710a1e59b8d1df8f2963b3845806d937b1234e6 Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Wed, 13 Jul 2022 13:28:32 +0100
Subject: [PATCH 083/201] pandaproxy/probe: Introduce http_status_metric

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/pandaproxy/probe.h | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/src/v/pandaproxy/probe.h b/src/v/pandaproxy/probe.h
index 22c2436bc1065..c26dec76e538c 100644
--- a/src/v/pandaproxy/probe.h
+++ b/src/v/pandaproxy/probe.h
@@ -15,9 +15,48 @@
 
 #include <seastar/core/metrics_registration.hh>
 #include <seastar/http/json_path.hh>
+#include <seastar/http/reply.hh>
 
 namespace pandaproxy {
 
+/// If the request is good, measure latency, otherwise record the error.
+class http_status_metric {
+public:
+    class measurement {
+    public:
+        measurement(
+          http_status_metric* p, std::unique_ptr<hdr_hist::measurement> m)
+          : _p(p)
+          , _m(std::move(m)) {}
+
+        void set_status(ss::httpd::reply::status_type s) {
+            using status_type = ss::httpd::reply::status_type;
+            if (s < status_type{300}) {
+                return;
+            }
+            if (s < status_type{400}) {
+                ++_p->_3xx_count;
+            } else if (s < status_type{500}) {
+                ++_p->_4xx_count;
+            } else {
+                ++_p->_5xx_count;
+            }
+            _m->set_trace(false);
+        }
+
+    private:
+        http_status_metric* _p;
+        std::unique_ptr<hdr_hist::measurement> _m;
+    };
+    hdr_hist& hist() { return _hist; }
+    auto auto_measure() { return measurement{this, _hist.auto_measure()}; }
+
+    hdr_hist _hist;
+    int64_t _5xx_count;
+    int64_t _4xx_count;
+    int64_t _3xx_count;
+};
+
 class probe {
 public:
     probe(

From 7fc3e614b9457efaa20b383f97b741a240698318 Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Wed, 13 Jul 2022 13:36:45 +0100
Subject: [PATCH 084/201] pandaproxy/probe: Switch to http_status_metric

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/pandaproxy/probe.cc | 9 ++++++---
 src/v/pandaproxy/probe.h  | 4 ++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/v/pandaproxy/probe.cc b/src/v/pandaproxy/probe.cc
index f1c1f976b1430..a98fb68410d92 100644
--- a/src/v/pandaproxy/probe.cc
+++ b/src/v/pandaproxy/probe.cc
@@ -21,7 +21,7 @@ namespace pandaproxy {
 
 probe::probe(
   ss::httpd::path_description& path_desc, const ss::sstring& group_name)
-  : _request_hist()
+  : _request_metrics()
   , _metrics()
   , _public_metrics(ssx::metrics::public_metrics_handle) {
     namespace sm = ss::metrics;
@@ -45,7 +45,9 @@ probe::probe(
              "request_latency",
              sm::description("Request latency"),
              labels,
-             [this] { return _request_hist.seastar_histogram_logform(); })
+             [this] {
+                 return _request_metrics.hist().seastar_histogram_logform();
+             })
              .aggregate(internal_aggregate_labels)});
     }
 
@@ -58,7 +60,8 @@ probe::probe(
                ssx::sformat("Internal latency of request for {}", group_name)),
              labels,
              [this] {
-                 return ssx::metrics::report_default_histogram(_request_hist);
+                 return ssx::metrics::report_default_histogram(
+                   _request_metrics.hist());
              })
              .aggregate(aggregate_labels)});
     }
diff --git a/src/v/pandaproxy/probe.h b/src/v/pandaproxy/probe.h
index c26dec76e538c..e265827be56aa 100644
--- a/src/v/pandaproxy/probe.h
+++ b/src/v/pandaproxy/probe.h
@@ -61,10 +61,10 @@ class probe {
 public:
     probe(
       ss::httpd::path_description& path_desc, const ss::sstring& group_name);
-    hdr_hist& hist() { return _request_hist; }
+    hdr_hist& hist() { return _request_metrics.hist(); }
 
 private:
-    hdr_hist _request_hist;
+    http_status_metric _request_metrics;
     ss::metrics::metric_groups _metrics;
     ss::metrics::metric_groups _public_metrics;
 };

From 4255e405cc5616b10cb3a44cde96dac1213e1935 Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Wed, 13 Jul 2022 14:44:57 +0100
Subject: [PATCH 085/201] pandaproxy/probe: Measure status

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/pandaproxy/probe.cc | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/src/v/pandaproxy/probe.cc b/src/v/pandaproxy/probe.cc
index a98fb68410d92..51b432b54f21b 100644
--- a/src/v/pandaproxy/probe.cc
+++ b/src/v/pandaproxy/probe.cc
@@ -52,6 +52,7 @@ probe::probe(
     }
 
     if (!config::shard_local_cfg().disable_public_metrics()) {
+        auto status_label = sm::label("status");
         _public_metrics.add_group(
           group_name,
           {sm::make_histogram(
@@ -63,6 +64,33 @@ probe::probe(
                  return ssx::metrics::report_default_histogram(
                    _request_metrics.hist());
              })
+             .aggregate(aggregate_labels),
+
+           sm::make_counter(
+             "request_errors_total",
+             [this] { return _request_metrics._5xx_count; },
+             sm::description(
+               ssx::sformat("Total number of {} server errors", group_name)),
+             {operation_label(path_desc.operations.nickname),
+              status_label("5xx")})
+             .aggregate(aggregate_labels),
+
+           sm::make_counter(
+             "request_errors_total",
+             [this] { return _request_metrics._4xx_count; },
+             sm::description(
+               ssx::sformat("Total number of {} client errors", group_name)),
+             {operation_label(path_desc.operations.nickname),
+              status_label("4xx")})
+             .aggregate(aggregate_labels),
+
+           sm::make_counter(
+             "request_errors_total",
+             [this] { return _request_metrics._3xx_count; },
+             sm::description(ssx::sformat(
+               "Total number of {} redirection errors", group_name)),
+             {operation_label(path_desc.operations.nickname),
+              status_label("3xx")})
              .aggregate(aggregate_labels)});
     }
 }

From 6b06142eb38c0d666438e8387ac254562fc151f9 Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Wed, 13 Jul 2022 13:47:43 +0100
Subject: [PATCH 086/201] pandaproxy/probe: Use auto_measure facade

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/pandaproxy/probe.h   | 2 +-
 src/v/pandaproxy/server.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/v/pandaproxy/probe.h b/src/v/pandaproxy/probe.h
index e265827be56aa..2fd5ffa2f27a6 100644
--- a/src/v/pandaproxy/probe.h
+++ b/src/v/pandaproxy/probe.h
@@ -61,7 +61,7 @@ class probe {
 public:
     probe(
       ss::httpd::path_description& path_desc, const ss::sstring& group_name);
-    hdr_hist& hist() { return _request_metrics.hist(); }
+    auto auto_measure() { return _request_metrics.auto_measure(); }
 
 private:
     http_status_metric _request_metrics;
diff --git a/src/v/pandaproxy/server.cc b/src/v/pandaproxy/server.cc
index 23a945f6ad3bb..18742635bab10 100644
--- a/src/v/pandaproxy/server.cc
+++ b/src/v/pandaproxy/server.cc
@@ -80,7 +80,7 @@ struct handler_adaptor : ss::httpd::handler_base {
       const ss::sstring&,
       std::unique_ptr<ss::request> req,
       std::unique_ptr<ss::reply> rep) final {
-        auto measure = _probe.hist().auto_measure();
+        auto measure = _probe.auto_measure();
         auto guard = gate_guard(_pending_requests);
         server::request_t rq{std::move(req), this->_ctx};
         server::reply_t rp{std::move(rep)};

From 4f020d95ae153f5b37fc228b005dc224cf1fe70c Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Wed, 13 Jul 2022 13:49:39 +0100
Subject: [PATCH 087/201] pandaproxy/handler: Measure status

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/pandaproxy/server.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/v/pandaproxy/server.cc b/src/v/pandaproxy/server.cc
index 18742635bab10..18ba53e5efedc 100644
--- a/src/v/pandaproxy/server.cc
+++ b/src/v/pandaproxy/server.cc
@@ -88,10 +88,12 @@ struct handler_adaptor : ss::httpd::handler_base {
         auto sem_units = co_await ss::get_units(_ctx.mem_sem, req_size);
         if (_ctx.as.abort_requested()) {
             set_reply_unavailable(*rp.rep);
+            measure.set_status(rp.rep->_status);
             co_return std::move(rp.rep);
         }
         rp = co_await _handler(std::move(rq), std::move(rp));
         set_mime_type(*rp.rep, rp.mime_type);
+        measure.set_status(rp.rep->_status);
         co_return std::move(rp.rep);
     }
 

From aec6873cd442cd5cf49a6bd496d447d84c91483e Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Wed, 13 Jul 2022 14:14:51 +0100
Subject: [PATCH 088/201] pandaproxy/handler: Measure status on exception

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/pandaproxy/server.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/v/pandaproxy/server.cc b/src/v/pandaproxy/server.cc
index 18ba53e5efedc..51ca673e4533e 100644
--- a/src/v/pandaproxy/server.cc
+++ b/src/v/pandaproxy/server.cc
@@ -91,7 +91,14 @@ struct handler_adaptor : ss::httpd::handler_base {
             measure.set_status(rp.rep->_status);
             co_return std::move(rp.rep);
         }
-        rp = co_await _handler(std::move(rq), std::move(rp));
+        try {
+            rp = co_await _handler(std::move(rq), std::move(rp));
+        } catch (const std::exception& e) {
+            auto eptr = std::current_exception();
+            auto rep = exception_reply(eptr);
+            measure.set_status(rep->_status);
+            std::rethrow_exception(eptr);
+        }
         set_mime_type(*rp.rep, rp.mime_type);
         measure.set_status(rp.rep->_status);
         co_return std::move(rp.rep);

From 29dddbde7956efc0d80f8e9af791ed66a789dd74 Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Thu, 14 Jul 2022 14:37:42 +0100
Subject: [PATCH 089/201] pandaproxy/server: Move error mime_type to
 constructor

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/pandaproxy/rest/proxy.cc              |  6 +++---
 src/v/pandaproxy/schema_registry/service.cc |  6 +++---
 src/v/pandaproxy/server.cc                  | 11 ++++++-----
 src/v/pandaproxy/server.h                   |  7 ++++---
 4 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/v/pandaproxy/rest/proxy.cc b/src/v/pandaproxy/rest/proxy.cc
index acdd0cb946b96..2a3b9a7fe55e1 100644
--- a/src/v/pandaproxy/rest/proxy.cc
+++ b/src/v/pandaproxy/rest/proxy.cc
@@ -74,15 +74,15 @@ proxy::proxy(
       ss::api_registry_builder20(_config.api_doc_dir(), "/v1"),
       "header",
       "/definitions",
-      _ctx) {}
+      _ctx,
+      json::serialization_format::application_json) {}
 
 ss::future<> proxy::start() {
     _server.routes(get_proxy_routes());
     return _server.start(
       _config.pandaproxy_api(),
       _config.pandaproxy_api_tls(),
-      _config.advertised_pandaproxy_api(),
-      json::serialization_format::application_json);
+      _config.advertised_pandaproxy_api());
 }
 
 ss::future<> proxy::stop() { return _server.stop(); }
diff --git a/src/v/pandaproxy/schema_registry/service.cc b/src/v/pandaproxy/schema_registry/service.cc
index e7d8e9b4c74a8..35571893bb6f2 100644
--- a/src/v/pandaproxy/schema_registry/service.cc
+++ b/src/v/pandaproxy/schema_registry/service.cc
@@ -229,7 +229,8 @@ service::service(
       ss::api_registry_builder20(_config.api_doc_dir(), "/v1"),
       "schema_registry_header",
       "/schema_registry_definitions",
-      _ctx)
+      _ctx,
+      json::serialization_format::schema_registry_v1_json)
   , _store(store)
   , _writer(sequencer)
   , _ensure_started{[this]() { return do_start(); }} {}
@@ -240,8 +241,7 @@ ss::future<> service::start() {
     return _server.start(
       _config.schema_registry_api(),
       _config.schema_registry_api_tls(),
-      not_advertised,
-      json::serialization_format::schema_registry_v1_json);
+      not_advertised);
 }
 
 ss::future<> service::stop() {
diff --git a/src/v/pandaproxy/server.cc b/src/v/pandaproxy/server.cc
index 51ca673e4533e..d705f17274aeb 100644
--- a/src/v/pandaproxy/server.cc
+++ b/src/v/pandaproxy/server.cc
@@ -116,13 +116,15 @@ server::server(
   ss::api_registry_builder20&& api20,
   const ss::sstring& header,
   const ss::sstring& definitions,
-  context_t& ctx)
+  context_t& ctx,
+  json::serialization_format exceptional_mime_type)
   : _server(server_name)
   , _public_metrics_group_name(public_metrics_group_name)
   , _pending_reqs()
   , _api20(std::move(api20))
   , _has_routes(false)
-  , _ctx(ctx) {
+  , _ctx(ctx)
+  , _exceptional_mime_type(exceptional_mime_type) {
     _api20.set_api_doc(_server._routes);
     _api20.register_api_file(_server._routes, header);
     _api20.add_definitions_file(_server._routes, definitions);
@@ -162,10 +164,9 @@ void server::routes(server::routes_t&& rts) {
 ss::future<> server::start(
   const std::vector<model::broker_endpoint>& endpoints,
   const std::vector<config::endpoint_tls_config>& endpoints_tls,
-  const std::vector<model::broker_endpoint>& advertised,
-  json::serialization_format exceptional_mime_type) {
+  const std::vector<model::broker_endpoint>& advertised) {
     _server._routes.register_exeption_handler(
-      exception_replier{ss::sstring{name(exceptional_mime_type)}});
+      exception_replier{ss::sstring{name(_exceptional_mime_type)}});
     _ctx.advertised_listeners.reserve(endpoints.size());
     for (auto& server_endpoint : endpoints) {
         auto addr = co_await net::resolve_dns(server_endpoint.address);
diff --git a/src/v/pandaproxy/server.h b/src/v/pandaproxy/server.h
index 3790c081a78a1..92640a0728484 100644
--- a/src/v/pandaproxy/server.h
+++ b/src/v/pandaproxy/server.h
@@ -84,7 +84,8 @@ class server {
       ss::api_registry_builder20&& api20,
       const ss::sstring& header,
       const ss::sstring& definitions,
-      context_t& ctx);
+      context_t& ctx,
+      json::serialization_format exceptional_mime_type);
 
     void route(route_t route);
     void routes(routes_t&& routes);
@@ -92,8 +93,7 @@ class server {
     ss::future<> start(
       const std::vector<model::broker_endpoint>& endpoints,
       const std::vector<config::endpoint_tls_config>& endpoints_tls,
-      const std::vector<model::broker_endpoint>& advertised,
-      json::serialization_format exceptional_mime_type);
+      const std::vector<model::broker_endpoint>& advertised);
     ss::future<> stop();
 
 private:
@@ -103,6 +103,7 @@ class server {
     ss::api_registry_builder20 _api20;
     bool _has_routes;
     context_t& _ctx;
+    json::serialization_format _exceptional_mime_type;
 };
 
 template<typename service_t>

From ab2c19ac1a395ac0185791065288548c12ae07d2 Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Thu, 14 Jul 2022 15:25:29 +0100
Subject: [PATCH 090/201] pandaproxy/handler: Always return an error_body on
 exception

If an exception escapes the handling here, then seastar will
invent a body of the wrong format.

The error_body requires the exception_mime type.

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/pandaproxy/server.cc | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/src/v/pandaproxy/server.cc b/src/v/pandaproxy/server.cc
index d705f17274aeb..617dc820db082 100644
--- a/src/v/pandaproxy/server.cc
+++ b/src/v/pandaproxy/server.cc
@@ -70,11 +70,13 @@ struct handler_adaptor : ss::httpd::handler_base {
       server::context_t& ctx,
       server::function_handler&& handler,
       ss::httpd::path_description& path_desc,
-      const ss::sstring& metrics_group_name)
+      const ss::sstring& metrics_group_name,
+      json::serialization_format exceptional_mime_type)
       : _pending_requests(pending_requests)
       , _ctx(ctx)
       , _handler(std::move(handler))
-      , _probe(path_desc, metrics_group_name) {}
+      , _probe(path_desc, metrics_group_name)
+      , _exceptional_mime_type(exceptional_mime_type) {}
 
     ss::future<std::unique_ptr<ss::reply>> handle(
       const ss::sstring&,
@@ -88,16 +90,15 @@ struct handler_adaptor : ss::httpd::handler_base {
         auto sem_units = co_await ss::get_units(_ctx.mem_sem, req_size);
         if (_ctx.as.abort_requested()) {
             set_reply_unavailable(*rp.rep);
-            measure.set_status(rp.rep->_status);
-            co_return std::move(rp.rep);
-        }
-        try {
-            rp = co_await _handler(std::move(rq), std::move(rp));
-        } catch (const std::exception& e) {
-            auto eptr = std::current_exception();
-            auto rep = exception_reply(eptr);
-            measure.set_status(rep->_status);
-            std::rethrow_exception(eptr);
+            rp.mime_type = _exceptional_mime_type;
+        } else {
+            try {
+                rp = co_await _handler(std::move(rq), std::move(rp));
+            } catch (...) {
+                rp = server::reply_t{
+                  exception_reply(std::current_exception()),
+                  _exceptional_mime_type};
+            }
         }
         set_mime_type(*rp.rep, rp.mime_type);
         measure.set_status(rp.rep->_status);
@@ -108,6 +109,7 @@ struct handler_adaptor : ss::httpd::handler_base {
     server::context_t& _ctx;
     server::function_handler _handler;
     probe _probe;
+    json::serialization_format _exceptional_mime_type;
 };
 
 server::server(
@@ -141,7 +143,8 @@ void server::route(server::route_t r) {
       _ctx,
       std::move(r.handler),
       r.path_desc,
-      _public_metrics_group_name);
+      _public_metrics_group_name,
+      _exceptional_mime_type);
     r.path_desc.set(_server._routes, handler);
 }
 

From 98d2c16f8e2c406e7515b2da86c5cfc054044a4f Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Thu, 14 Jul 2022 15:26:14 +0100
Subject: [PATCH 091/201] pandaproxy/reply: Always return an error_body on
 exception

If an exception escapes the handling here, then seastar will
invent a body of the wrong format.

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/pandaproxy/reply.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/v/pandaproxy/reply.h b/src/v/pandaproxy/reply.h
index 441245a75c161..5d6481c57dd20 100644
--- a/src/v/pandaproxy/reply.h
+++ b/src/v/pandaproxy/reply.h
@@ -104,12 +104,13 @@ inline std::unique_ptr<ss::httpd::reply> exception_reply(std::exception_ptr e) {
     } catch (const schema_registry::exception_base& e) {
         return errored_body(e.code(), e.message());
     } catch (const seastar::httpd::base_exception& e) {
-        return errored_body(
-          reply_error_code::kafka_bad_request,
-          e.what()); // TODO BP: Yarr!!
+        return errored_body(reply_error_code::kafka_bad_request, e.what());
     } catch (...) {
-        vlog(plog.error, "{}", std::current_exception());
-        throw;
+        vlog(plog.error, "exception_reply: {}", std::current_exception());
+        auto ise = make_error_condition(
+          reply_error_code::internal_server_error);
+        return errored_body(
+          reply_error_code::internal_server_error, ise.message());
     }
 }
 

From 62c25a904845662ad4e5a20872bcfee80ce2d5c6 Mon Sep 17 00:00:00 2001
From: Rogger Vasquez <rvasque3@gmail.com>
Date: Mon, 11 Jul 2022 17:11:05 -0500
Subject: [PATCH 092/201] rpk: accept io readers in admin API calls

now sendAndReceive will check if the passed body
implements io.Reader, which is useful when you want to
pass an io.Reader as the body instead of a JSON value
---
 src/go/rpk/pkg/api/admin/admin.go | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/go/rpk/pkg/api/admin/admin.go b/src/go/rpk/pkg/api/admin/admin.go
index e255975e2c283..c86b0cac8fb47 100644
--- a/src/go/rpk/pkg/api/admin/admin.go
+++ b/src/go/rpk/pkg/api/admin/admin.go
@@ -499,16 +499,23 @@ func maybeUnmarshalRespInto(
 
 // sendAndReceive sends a request and returns the response. If body is
 // non-nil, this json encodes the body and sends it with the request.
+// If the body is already an io.Reader, the reader is used directly
+// without marshaling.
 func (a *AdminAPI) sendAndReceive(
 	ctx context.Context, method, url string, body interface{}, retryable bool,
 ) (*http.Response, error) {
 	var r io.Reader
 	if body != nil {
-		bs, err := json.Marshal(body)
-		if err != nil {
-			return nil, fmt.Errorf("unable to encode request body for %s %s: %w", method, url, err) // should not happen
+		// We might be passing io reader already as body, e.g: license file.
+		if v, ok := body.(io.Reader); ok {
+			r = v
+		} else {
+			bs, err := json.Marshal(body)
+			if err != nil {
+				return nil, fmt.Errorf("unable to encode request body for %s %s: %w", method, url, err) // should not happen
+			}
+			r = bytes.NewBuffer(bs)
 		}
-		r = bytes.NewBuffer(bs)
 	}
 
 	req, err := http.NewRequestWithContext(ctx, method, url, r)
@@ -549,7 +556,7 @@ func (a *AdminAPI) sendAndReceive(
 		if err != nil {
 			return nil, fmt.Errorf("request %s %s failed: %s, unable to read body: %w", method, url, status, err)
 		}
-		return nil, &HTTPResponseError{Response: res, Body: resBody}
+		return nil, &HTTPResponseError{Response: res, Body: resBody, Method: method, URL: url}
 	}
 
 	return res, nil
@@ -562,6 +569,6 @@ func (he HTTPResponseError) DecodeGenericErrorBody() (GenericErrorBody, error) {
 }
 
 func (he HTTPResponseError) Error() string {
-	return fmt.Sprintf("request %s %s failed: %s, body: %q",
+	return fmt.Sprintf("request %s %s failed: %s, body: %q\n",
 		he.Method, he.URL, http.StatusText(he.Response.StatusCode), he.Body)
 }

From 255938f0327f241f2173669fbcceaf34bbc77260 Mon Sep 17 00:00:00 2001
From: Rogger Vasquez <rvasque3@gmail.com>
Date: Tue, 12 Jul 2022 14:19:10 -0500
Subject: [PATCH 093/201] rpk: use default data_directory if not set

old viper code used to do this without being
explicit; removing this behavior caused a failure
when running rpk redpanda start if you had a
config file without data_directory
---
 src/go/rpk/pkg/cli/cmd/redpanda/start.go      |  5 +++++
 src/go/rpk/pkg/cli/cmd/redpanda/start_test.go | 21 +++++++++++++++++++
 src/go/rpk/pkg/config/params_test.go          |  1 -
 src/go/rpk/pkg/config/schema.go               |  2 +-
 4 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/start.go b/src/go/rpk/pkg/cli/cmd/redpanda/start.go
index 565eacb4e21b8..9e5c2b59e9403 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/start.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/start.go
@@ -335,6 +335,11 @@ func NewStartCommand(fs afero.Fs, launcher rp.Launcher) *cobra.Command {
 				sendEnv(fs, env, cfg, !prestartCfg.checkEnabled, err)
 				return err
 			}
+
+			if cfg.Redpanda.Directory == "" {
+				cfg.Redpanda.Directory = config.Default().Redpanda.Directory
+			}
+
 			checkPayloads, tunerPayloads, err := prestart(
 				fs,
 				rpArgs,
diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/start_test.go b/src/go/rpk/pkg/cli/cmd/redpanda/start_test.go
index ac7a3b536760b..aa843be833d45 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/start_test.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/start_test.go
@@ -441,6 +441,27 @@ func TestStartCommand(t *testing.T) {
 			// Check that the generated config is as expected.
 			require.Exactly(st, config.Default().Redpanda.ID, conf.Redpanda.ID)
 		},
+	}, {
+		name: "it should write default data_directory if loaded config doesn't have one",
+		args: []string{
+			"--config", config.Default().ConfigFile,
+			"--install-dir", "/var/lib/redpanda",
+		},
+		before: func(fs afero.Fs) error {
+			conf := config.Default()
+			conf.Redpanda.Directory = ""
+			return conf.Write(fs)
+		},
+		postCheck: func(
+			fs afero.Fs,
+			_ *redpanda.RedpandaArgs,
+			st *testing.T,
+		) {
+			conf, err := new(config.Params).Load(fs)
+			require.NoError(st, err)
+			// Check that the generated config is as expected.
+			require.Exactly(st, config.Default().Redpanda.Directory, conf.Redpanda.Directory)
+		},
 	}, {
 		name: "it should leave redpanda.node_id untouched if --node-id wasn't passed",
 		args: []string{
diff --git a/src/go/rpk/pkg/config/params_test.go b/src/go/rpk/pkg/config/params_test.go
index 111a1ebaf1898..e3de9f09430fc 100644
--- a/src/go/rpk/pkg/config/params_test.go
+++ b/src/go/rpk/pkg/config/params_test.go
@@ -56,7 +56,6 @@ redpanda:
 			},
 			exp: `config_file: /etc/redpanda/redpanda.yaml
 redpanda:
-    data_directory: ""
     node_id: 6
     rack: my_rack
 `,
diff --git a/src/go/rpk/pkg/config/schema.go b/src/go/rpk/pkg/config/schema.go
index c3440460a4f1c..f3e7ebfdd5be4 100644
--- a/src/go/rpk/pkg/config/schema.go
+++ b/src/go/rpk/pkg/config/schema.go
@@ -44,7 +44,7 @@ func (c *Config) File() *Config {
 }
 
 type RedpandaConfig struct {
-	Directory                  string                 `yaml:"data_directory" json:"data_directory"`
+	Directory                  string                 `yaml:"data_directory,omitempty" json:"data_directory"`
 	ID                         int                    `yaml:"node_id" json:"node_id"`
 	Rack                       string                 `yaml:"rack,omitempty" json:"rack"`
 	SeedServers                []SeedServer           `yaml:"seed_servers" json:"seed_servers"`

From 35eac78119252651baac5d0fc475db47b4ba9a6a Mon Sep 17 00:00:00 2001
From: Rogger Vasquez <rvasque3@gmail.com>
Date: Tue, 12 Jul 2022 15:28:05 -0500
Subject: [PATCH 094/201] rpk: make set command to only write desired value

there was a bug that made rpk redpanda config set
write unset defaults to the config file even
if the user didn't request it
---
 src/go/rpk/pkg/cli/cmd/redpanda/config.go     |   1 +
 .../rpk/pkg/cli/cmd/redpanda/config_test.go   | 148 ++++++++++++++++++
 src/go/rpk/pkg/config/config.go               |  15 ++
 3 files changed, 164 insertions(+)

diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/config.go b/src/go/rpk/pkg/cli/cmd/redpanda/config.go
index 3daa5575bee94..e15f553b27ae5 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/config.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/config.go
@@ -67,6 +67,7 @@ partial json/yaml config objects:
 			p := config.ParamsFromCommand(cmd)
 			cfg, err := p.Load(fs)
 			out.MaybeDie(err, "unable to load config: %v", err)
+			cfg = cfg.FileOrDefaults() // we set fields in the raw file without writing env / flag overrides
 
 			if format == "single" {
 				fmt.Println("'--format single' is deprecated, either remove it or use yaml/json")
diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/config_test.go b/src/go/rpk/pkg/cli/cmd/redpanda/config_test.go
index 7891c9b3dfed4..7f9d716684aea 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/config_test.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/config_test.go
@@ -184,3 +184,151 @@ func TestInitNode(t *testing.T) {
 		})
 	}
 }
+
+// This is a top level command test, individual cases for set are
+// tested in 'rpk/pkg/config/config_test.go'.
+func TestSetCommand(t *testing.T) {
+	for _, test := range []struct {
+		name    string
+		cfgFile string
+		exp     string
+		args    []string
+	}{
+		{
+			name: "set without config file on disk",
+			exp: `config_file: /etc/redpanda/redpanda.yaml
+redpanda:
+    data_directory: /var/lib/redpanda/data
+    node_id: 0
+    rack: redpanda-rack
+    seed_servers: []
+    rpc_server:
+        address: 0.0.0.0
+        port: 33145
+    kafka_api:
+        - address: 0.0.0.0
+          port: 9092
+    admin:
+        - address: 0.0.0.0
+          port: 9644
+    developer_mode: true
+rpk:
+    enable_usage_stats: false
+    tune_network: false
+    tune_disk_scheduler: false
+    tune_disk_nomerges: false
+    tune_disk_write_cache: false
+    tune_disk_irq: false
+    tune_fstrim: false
+    tune_cpu: false
+    tune_aio_events: false
+    tune_clocksource: false
+    tune_swappiness: false
+    tune_transparent_hugepages: false
+    enable_memory_locking: false
+    tune_coredump: false
+    coredump_dir: /var/lib/redpanda/coredump
+    tune_ballast_file: false
+    overprovisioned: false
+pandaproxy: {}
+schema_registry: {}
+`,
+			args: []string{"redpanda.rack", "redpanda-rack"},
+		},
+		{
+			name: "set with loaded config",
+			cfgFile: `config_file: /etc/redpanda/redpanda.yaml
+redpanda:
+    data_directory: ""
+    node_id: 0
+    rack: redpanda-rack
+    seed_servers: []
+    rpc_server:
+        address: 0.0.0.0
+        port: 33145
+    kafka_api:
+        - address: 0.0.0.0
+          port: 9092
+    admin:
+        - address: 0.0.0.0
+          port: 9644
+    developer_mode: true
+rpk:
+    enable_usage_stats: false
+    tune_network: false
+    tune_disk_scheduler: false
+    tune_disk_nomerges: false
+    tune_disk_write_cache: false
+    tune_disk_irq: false
+    tune_fstrim: false
+    tune_cpu: false
+    tune_aio_events: false
+    tune_clocksource: false
+    tune_swappiness: false
+    tune_transparent_hugepages: false
+    enable_memory_locking: false
+    tune_coredump: false
+    tune_ballast_file: false
+    overprovisioned: false
+`,
+			exp: `config_file: /etc/redpanda/redpanda.yaml
+redpanda:
+    node_id: 0
+    rack: redpanda-rack
+    seed_servers: []
+    rpc_server:
+        address: 0.0.0.0
+        port: 33145
+    kafka_api:
+        - address: 0.0.0.0
+          port: 9092
+    admin:
+        - address: 0.0.0.0
+          port: 9644
+    developer_mode: true
+rpk:
+    enable_usage_stats: true
+    tune_network: false
+    tune_disk_scheduler: false
+    tune_disk_nomerges: false
+    tune_disk_write_cache: false
+    tune_disk_irq: false
+    tune_fstrim: false
+    tune_cpu: false
+    tune_aio_events: false
+    tune_clocksource: false
+    tune_swappiness: false
+    tune_transparent_hugepages: false
+    enable_memory_locking: false
+    tune_coredump: false
+    tune_ballast_file: false
+    overprovisioned: false
+`,
+			args: []string{"rpk.enable_usage_stats", "true"},
+		},
+	} {
+		fs := afero.NewMemMapFs()
+
+		// We create a config file in default redpanda location
+		if test.cfgFile != "" {
+			err := afero.WriteFile(fs, "/etc/redpanda/redpanda.yaml", []byte(test.cfgFile), 0o644)
+			if err != nil {
+				t.Errorf("unexpected failure writing passed config file: %v", err)
+			}
+		}
+
+		c := set(fs)
+		c.SetArgs(test.args)
+		err := c.Execute()
+		if err != nil {
+			t.Errorf("error during command execution: %v", err)
+		}
+
+		// Read back from that default location and compare.
+		file, err := afero.ReadFile(fs, "/etc/redpanda/redpanda.yaml")
+		if err != nil {
+			t.Errorf("unexpected failure reading config file: %v", err)
+		}
+		require.Equal(t, test.exp, string(file))
+	}
+}
diff --git a/src/go/rpk/pkg/config/config.go b/src/go/rpk/pkg/config/config.go
index 0881da098d619..06d439df26caa 100644
--- a/src/go/rpk/pkg/config/config.go
+++ b/src/go/rpk/pkg/config/config.go
@@ -143,6 +143,21 @@ func AvailableModes() []string {
 	}
 }
 
+// FileOrDefaults return the configuration as read from the file or
+// the default configuration if there is no file loaded.
+func (c *Config) FileOrDefaults() *Config {
+	if c.File() != nil {
+		cfg := c.File()
+		cfg.loadedPath = c.loadedPath
+		cfg.ConfigFile = c.ConfigFile // preserve loaded ConfigFile property.
+		return cfg
+	} else {
+		cfg := Default()
+		cfg.ConfigFile = c.ConfigFile
+		return cfg // no file, write the defaults
+	}
+}
+
 // Check checks if the redpanda and rpk configuration is valid before running
 // the tuners. See: redpanda_checkers.
 func (c *Config) Check() (bool, []error) {

From df4697e92a2b7c7bbbe6373697fe0f8ea5a30c59 Mon Sep 17 00:00:00 2001
From: Rogger Vasquez <rvasque3@gmail.com>
Date: Tue, 12 Jul 2022 16:19:37 -0500
Subject: [PATCH 095/201] rpk: bootstrap and init to use file or default

We want every write path of rpk to use only what
is in the file, or the defaults if there is no file
on disk.
---
 src/go/rpk/pkg/cli/cmd/redpanda/config.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/config.go b/src/go/rpk/pkg/cli/cmd/redpanda/config.go
index e15f553b27ae5..c5be52368df2b 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/config.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/config.go
@@ -105,6 +105,7 @@ func bootstrap(fs afero.Fs) *cobra.Command {
 			p := config.ParamsFromCommand(cmd)
 			cfg, err := p.Load(fs)
 			out.MaybeDie(err, "unable to load config: %v", err)
+			cfg = cfg.FileOrDefaults() // we modify fields in the raw file without writing env / flag overrides
 
 			seeds, err := parseSeedIPs(ips)
 			out.MaybeDieErr(err)
@@ -168,6 +169,7 @@ func initNode(fs afero.Fs) *cobra.Command {
 			p := config.ParamsFromCommand(cmd)
 			cfg, err := p.Load(fs)
 			out.MaybeDie(err, "unable to load config: %v", err)
+			cfg = cfg.FileOrDefaults() // we modify fields in the raw file without writing env / flag overrides
 
 			// Don't reset the node's UUID if it has already been set.
 			if cfg.NodeUUID == "" {

From a08b73de2573594d45ea8a2c97e61a10d2a692bf Mon Sep 17 00:00:00 2001
From: Noah Watkins <noahwatkins@gmail.com>
Date: Wed, 13 Jul 2022 22:42:52 -0700
Subject: [PATCH 096/201] ssx: add missing header

Signed-off-by: Noah Watkins <noahwatkins@gmail.com>
---
 src/v/ssx/async-clear.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/v/ssx/async-clear.h b/src/v/ssx/async-clear.h
index 0ea00807aa1e3..d8ac2d48dd0ff 100644
--- a/src/v/ssx/async-clear.h
+++ b/src/v/ssx/async-clear.h
@@ -11,6 +11,8 @@
 
 #pragma once
 
+#include "seastarx.h"
+
 #include <seastar/core/future.hh>
 #include <seastar/coroutine/maybe_yield.hh>
 

From ed6386bfe4d307772dcb75bb8531fb42bb8c326b Mon Sep 17 00:00:00 2001
From: Noah Watkins <noahwatkins@gmail.com>
Date: Wed, 13 Jul 2022 22:43:11 -0700
Subject: [PATCH 097/201] ssx: mark single arg ctor as explicit

Signed-off-by: Noah Watkins <noahwatkins@gmail.com>
---
 src/v/ssx/async-clear.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/v/ssx/async-clear.h b/src/v/ssx/async-clear.h
index d8ac2d48dd0ff..e6d6391c01960 100644
--- a/src/v/ssx/async-clear.h
+++ b/src/v/ssx/async-clear.h
@@ -30,7 +30,7 @@ namespace ssx {
 template<typename K, typename V>
 class async_clear {
 public:
-    async_clear(absl::flat_hash_map<K, V>& c)
+    explicit async_clear(absl::flat_hash_map<K, V>& c)
       : _container(c) {}
 
     /**

From 6e1c90bf1b6bba77faf0295e71c0e962a7070d72 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noahwatkins@gmail.com>
Date: Wed, 13 Jul 2022 17:11:00 -0700
Subject: [PATCH 098/201] serde: better async handling for move-only types

1. reading moves the result into the returned ready future.
2. writing takes a value rather than a const-ref

For 2 the motivation is that serialization may require non-const actions
like reading from a record batch reader (e.g. serializing an append
entries request). In this case the reference cannot be const. However,
if we make it non-const then we can't pass temporaries and otherwise the
call sites are more confusing.

Signed-off-by: Noah Watkins <noahwatkins@gmail.com>
---
 src/v/serde/serde.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/v/serde/serde.h b/src/v/serde/serde.h
index 4174ef5a53471..e00e2177c3a0f 100644
--- a/src/v/serde/serde.h
+++ b/src/v/serde/serde.h
@@ -679,7 +679,7 @@ template<typename T>
 ss::future<std::decay_t<T>> read_async(iobuf_parser& in) {
     return read_async_nested<T>(in, 0).then([&](std::decay_t<T>&& t) {
         if (likely(in.bytes_left() == 0)) {
-            return ss::make_ready_future<std::decay_t<T>>(t);
+            return ss::make_ready_future<std::decay_t<T>>(std::move(t));
         } else {
             return ss::make_exception_future<std::decay_t<T>>(
               serde_exception{fmt_with_ctx(
@@ -693,7 +693,7 @@ ss::future<std::decay_t<T>> read_async(iobuf_parser& in) {
 }
 
 template<typename T>
-ss::future<> write_async(iobuf& out, T const& t) {
+ss::future<> write_async(iobuf& out, T t) {
     using Type = std::decay_t<T>;
     if constexpr (is_envelope_v<Type> && has_serde_async_write<Type>) {
         write(out, Type::redpanda_serde_version);
@@ -702,6 +702,7 @@ ss::future<> write_async(iobuf& out, T const& t) {
         auto size_placeholder = out.reserve(sizeof(serde_size_t));
         auto const size_before = out.size_bytes();
 
+        return ss::do_with(std::move(t), [&out, size_before, size_placeholder = std::move(size_placeholder)](T& t) mutable {
         return t.serde_async_write(out).then(
           [&out,
            size_before,
@@ -718,8 +719,9 @@ ss::future<> write_async(iobuf& out, T const& t) {
 
               return ss::make_ready_future<>();
           });
+        });
     } else {
-        write(out, t);
+        write(out, std::move(t));
         return ss::make_ready_future<>();
     }
 }

From 75cb94ce659b9ae559e9bf0eaf893b99746c37b8 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noahwatkins@gmail.com>
Date: Wed, 13 Jul 2022 21:34:25 -0700
Subject: [PATCH 099/201] serde: apply clang-format

Signed-off-by: Noah Watkins <noahwatkins@gmail.com>
---
 src/v/serde/serde.h | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/src/v/serde/serde.h b/src/v/serde/serde.h
index e00e2177c3a0f..83af6abaaa7a4 100644
--- a/src/v/serde/serde.h
+++ b/src/v/serde/serde.h
@@ -702,24 +702,29 @@ ss::future<> write_async(iobuf& out, T t) {
         auto size_placeholder = out.reserve(sizeof(serde_size_t));
         auto const size_before = out.size_bytes();
 
-        return ss::do_with(std::move(t), [&out, size_before, size_placeholder = std::move(size_placeholder)](T& t) mutable {
-        return t.serde_async_write(out).then(
-          [&out,
-           size_before,
-           size_placeholder = std::move(size_placeholder)]() mutable {
-              auto const written_size = out.size_bytes() - size_before;
-              if (unlikely(
-                    written_size > std::numeric_limits<serde_size_t>::max())) {
-                  throw serde_exception{"envelope too big"};
-              }
-              auto const size = ss::cpu_to_le(
-                static_cast<serde_size_t>(written_size));
-              size_placeholder.write(
-                reinterpret_cast<char const*>(&size), sizeof(serde_size_t));
-
-              return ss::make_ready_future<>();
+        return ss::do_with(
+          std::move(t),
+          [&out, size_before, size_placeholder = std::move(size_placeholder)](
+            T& t) mutable {
+              return t.serde_async_write(out).then(
+                [&out,
+                 size_before,
+                 size_placeholder = std::move(size_placeholder)]() mutable {
+                    auto const written_size = out.size_bytes() - size_before;
+                    if (unlikely(
+                          written_size
+                          > std::numeric_limits<serde_size_t>::max())) {
+                        throw serde_exception{"envelope too big"};
+                    }
+                    auto const size = ss::cpu_to_le(
+                      static_cast<serde_size_t>(written_size));
+                    size_placeholder.write(
+                      reinterpret_cast<char const*>(&size),
+                      sizeof(serde_size_t));
+
+                    return ss::make_ready_future<>();
+                });
           });
-        });
     } else {
         write(out, std::move(t));
         return ss::make_ready_future<>();

From 89b3e019989711575da15a52b42c0d6673f711fd Mon Sep 17 00:00:00 2001
From: Noah Watkins <noahwatkins@gmail.com>
Date: Wed, 13 Jul 2022 11:39:50 -0700
Subject: [PATCH 100/201] raft: remove unused code

Signed-off-by: Noah Watkins <noahwatkins@gmail.com>
---
 src/v/raft/types.cc | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/src/v/raft/types.cc b/src/v/raft/types.cc
index 3a2229a7a469d..87e857c843446 100644
--- a/src/v/raft/types.cc
+++ b/src/v/raft/types.cc
@@ -161,27 +161,6 @@ std::ostream& operator<<(std::ostream& o, const install_snapshot_reply& r) {
 
 namespace reflection {
 
-struct rpc_model_reader_consumer {
-    explicit rpc_model_reader_consumer(iobuf& oref)
-      : ref(oref) {}
-    ss::future<ss::stop_iteration> operator()(model::record_batch batch) {
-        reflection::serialize(ref, batch.header());
-        if (!batch.compressed()) {
-            reflection::serialize<int8_t>(ref, 0);
-            batch.for_each_record([this](model::record r) {
-                reflection::serialize(ref, std::move(r));
-            });
-        } else {
-            reflection::serialize<int8_t>(ref, 1);
-            reflection::serialize(ref, std::move(batch).release_data());
-        }
-        return ss::make_ready_future<ss::stop_iteration>(
-          ss::stop_iteration::no);
-    }
-    void end_of_stream(){};
-    iobuf& ref;
-};
-
 ss::future<> async_adl<raft::append_entries_request>::to(
   iobuf& out, raft::append_entries_request&& request) {
     return model::consume_reader_to_memory(

From 7ce22e1963084ab0cf62ef61d0f56454bc88c689 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noahwatkins@gmail.com>
Date: Wed, 13 Jul 2022 14:44:36 -0700
Subject: [PATCH 101/201] raft: support default ctor for append_entries_request

Serde requires that types are default constructible. Since
append_entries_request contains a record_batch_reader it is
not default constructible. Until serde is improved in this
area and provides interfaces/customization points for dealing
with such types we need to give this request a default ctor.

To do this we wrap the reader in std::optional and provide
an accessor to the underlying value. The only case then
in which the optional is not set is when the default
constructor is invoked by serde.

Signed-off-by: Noah Watkins <noahwatkins@gmail.com>
---
 src/v/raft/consensus.cc                      |  6 ++--
 src/v/raft/replicate_entries_stm.cc          |  6 ++--
 src/v/raft/tests/type_serialization_tests.cc |  2 +-
 src/v/raft/types.cc                          |  2 +-
 src/v/raft/types.h                           | 31 ++++++++++++++++----
 5 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/src/v/raft/consensus.cc b/src/v/raft/consensus.cc
index b30bff1c0977b..d0e5e246916ae 100644
--- a/src/v/raft/consensus.cc
+++ b/src/v/raft/consensus.cc
@@ -1628,7 +1628,7 @@ consensus::do_append_entries(append_entries_request&& r) {
     // section 1
     // For an entry to fit into our log, it must not leave a gap.
     if (r.meta.prev_log_index > last_log_offset) {
-        if (!r.batches.is_end_of_stream()) {
+        if (!r.batches().is_end_of_stream()) {
             vlog(
               _ctxlog.debug,
               "Rejecting append entries. Would leave gap in log, last log "
@@ -1668,7 +1668,7 @@ consensus::do_append_entries(append_entries_request&& r) {
     // special case heartbeat case
     // we need to handle it early (before executing truncation)
     // as timeouts are asynchronous to append calls and can have stall data
-    if (r.batches.is_end_of_stream()) {
+    if (r.batches().is_end_of_stream()) {
         if (r.meta.prev_log_index < last_log_offset) {
             // do not tuncate on heartbeat just response with false
             reply.result = append_entries_reply::status::failure;
@@ -1761,7 +1761,7 @@ consensus::do_append_entries(append_entries_request&& r) {
 
     // success. copy entries for each subsystem
     using offsets_ret = storage::append_result;
-    return disk_append(std::move(r.batches), update_last_quorum_index::no)
+    return disk_append(std::move(r.batches()), update_last_quorum_index::no)
       .then([this, m = r.meta, target = r.node_id](offsets_ret ofs) {
           auto f = ss::make_ready_future<>();
           auto last_visible = std::min(ofs.last_offset, m.last_visible_index);
diff --git a/src/v/raft/replicate_entries_stm.cc b/src/v/raft/replicate_entries_stm.cc
index c63749712b5f1..58c92ab420dea 100644
--- a/src/v/raft/replicate_entries_stm.cc
+++ b/src/v/raft/replicate_entries_stm.cc
@@ -33,10 +33,10 @@ using namespace std::chrono_literals;
 ss::future<append_entries_request> replicate_entries_stm::share_request() {
     // one extra copy is needed for retries
     return with_semaphore(_share_sem, 1, [this] {
-        return details::foreign_share_n(std::move(_req->batches), 2)
+        return details::foreign_share_n(std::move(_req->batches()), 2)
           .then([this](std::vector<model::record_batch_reader> readers) {
               // keep a copy around until the end
-              _req->batches = std::move(readers.back());
+              _req->batches() = std::move(readers.back());
               readers.pop_back();
               return append_entries_request(
                 _req->node_id,
@@ -188,7 +188,7 @@ replicate_entries_stm::append_to_self() {
             = _req->flush ? consistency_level::quorum_ack
                           : consistency_level::leader_ack;
           return _ptr->disk_append(
-            std::move(req.batches),
+            std::move(req.batches()),
             _req->flush ? consensus::update_last_quorum_index::yes
                         : consensus::update_last_quorum_index::no);
       })
diff --git a/src/v/raft/tests/type_serialization_tests.cc b/src/v/raft/tests/type_serialization_tests.cc
index 83fa6ae49e4de..ec4c945c9e479 100644
--- a/src/v/raft/tests/type_serialization_tests.cc
+++ b/src/v/raft/tests/type_serialization_tests.cc
@@ -99,7 +99,7 @@ SEASTAR_THREAD_TEST_CASE(append_entries_requests) {
     auto batches_result = model::consume_reader_to_memory(
                             std::move(readers.back()), model::no_timeout)
                             .get0();
-    d.batches
+    d.batches()
       .consume(checking_consumer(std::move(batches_result)), model::no_timeout)
       .get0();
 }
diff --git a/src/v/raft/types.cc b/src/v/raft/types.cc
index 87e857c843446..e30e760a3739a 100644
--- a/src/v/raft/types.cc
+++ b/src/v/raft/types.cc
@@ -164,7 +164,7 @@ namespace reflection {
 ss::future<> async_adl<raft::append_entries_request>::to(
   iobuf& out, raft::append_entries_request&& request) {
     return model::consume_reader_to_memory(
-             std::move(request.batches), model::no_timeout)
+             std::move(request.batches()), model::no_timeout)
       .then([&out, request = std::move(request)](
               ss::circular_buffer<model::record_batch> batches) {
           reflection::adl<uint32_t>{}.to(out, batches.size());
diff --git a/src/v/raft/types.h b/src/v/raft/types.h
index a017935fbb3d6..61e169bca25a8 100644
--- a/src/v/raft/types.h
+++ b/src/v/raft/types.h
@@ -189,8 +189,8 @@ struct append_entries_request {
       flush_after_append f = flush_after_append::yes) noexcept
       : node_id(src)
       , meta(m)
-      , batches(std::move(r))
-      , flush(f){};
+      , flush(f)
+      , _batches(std::move(r)) {}
 
     append_entries_request(
       vnode src,
@@ -201,8 +201,8 @@ struct append_entries_request {
       : node_id(src)
       , target_node_id(target)
       , meta(m)
-      , batches(std::move(r))
-      , flush(f){};
+      , flush(f)
+      , _batches(std::move(r)) {}
     ~append_entries_request() noexcept = default;
     append_entries_request(const append_entries_request&) = delete;
     append_entries_request& operator=(const append_entries_request&) = delete;
@@ -215,16 +215,35 @@ struct append_entries_request {
     vnode node_id;
     vnode target_node_id;
     protocol_metadata meta;
-    model::record_batch_reader batches;
+    model::record_batch_reader& batches() {
+        /*
+         * note that some call sites do:
+         *
+         *   auto b = std::move(req.batches())
+         *
+         * which does not reset the std::optional value. so this assertion is
+         * merely here to protect against use of a default constructed request.
+         */
+        vassert(_batches.has_value(), "request contains no batches");
+        return _batches.value();
+    }
     flush_after_append flush;
     static append_entries_request make_foreign(append_entries_request&& req) {
         return append_entries_request(
           req.node_id,
           req.target_node_id,
           std::move(req.meta),
-          model::make_foreign_record_batch_reader(std::move(req.batches)),
+          model::make_foreign_record_batch_reader(std::move(req.batches())),
           req.flush);
     }
+
+private:
+    /*
+     * batches is optional to allow append_entries_request to have a default
+     * constructor and integrate with serde until serde provides a more powerful
+     * interface for dealing with this.
+     */
+    std::optional<model::record_batch_reader> _batches;
 };
 
 struct append_entries_reply {

From f5fe5d12f5a47b3902845fbcd5002807921e8622 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noahwatkins@gmail.com>
Date: Tue, 12 Jul 2022 21:36:59 -0700
Subject: [PATCH 102/201] raft: assert heartbeat request is non empty

The serialization assumes that the request is non-empty and
unconditionally dereferences heartbeat vector entries, leading to
undefined behavior if the request is empty.

Signed-off-by: Noah Watkins <noahwatkins@gmail.com>
---
 src/v/raft/types.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/v/raft/types.cc b/src/v/raft/types.cc
index e30e760a3739a..5921b89b30391 100644
--- a/src/v/raft/types.cc
+++ b/src/v/raft/types.cc
@@ -316,6 +316,8 @@ T read_one_varint_delta(iobuf_parser& in, const T& prev) {
 
 ss::future<> async_adl<raft::heartbeat_request>::to(
   iobuf& out, raft::heartbeat_request&& request) {
+    vassert(
+      !request.heartbeats.empty(), "cannot serialize empty heartbeats request");
     struct sorter_fn {
         constexpr bool operator()(
           const raft::heartbeat_metadata& lhs,

From 8c9615dc3af4ebad8cb282b227c7807d8b30036f Mon Sep 17 00:00:00 2001
From: Noah Watkins <noahwatkins@gmail.com>
Date: Tue, 12 Jul 2022 15:21:45 -0700
Subject: [PATCH 103/201] raft: factor out common serialization utilities

Will be used in both serde and adl.

Signed-off-by: Noah Watkins <noahwatkins@gmail.com>
---
 src/v/raft/types.cc | 181 ++++++++++++++++++++++----------------------
 1 file changed, 92 insertions(+), 89 deletions(-)

diff --git a/src/v/raft/types.cc b/src/v/raft/types.cc
index 5921b89b30391..b4369e3852012 100644
--- a/src/v/raft/types.cc
+++ b/src/v/raft/types.cc
@@ -24,6 +24,98 @@
 #include <chrono>
 #include <type_traits>
 
+namespace {
+template<typename T>
+T decode_signed(T value) {
+    return value < T(0) ? T{} : value;
+}
+
+template<typename T>
+T varlong_reader(iobuf_parser& in) {
+    auto [val, len] = in.read_varlong();
+    return T(val);
+}
+
+namespace internal {
+struct hbeat_soa {
+    explicit hbeat_soa(size_t n)
+      : groups(n)
+      , commit_indices(n)
+      , terms(n)
+      , prev_log_indices(n)
+      , prev_log_terms(n)
+      , last_visible_indices(n)
+      , revisions(n)
+      , target_revisions(n) {}
+
+    ~hbeat_soa() noexcept = default;
+    hbeat_soa(const hbeat_soa&) = delete;
+    hbeat_soa& operator=(const hbeat_soa&) = delete;
+    hbeat_soa(hbeat_soa&&) noexcept = default;
+    hbeat_soa& operator=(hbeat_soa&&) noexcept = default;
+
+    std::vector<raft::group_id> groups;
+    std::vector<model::offset> commit_indices;
+    std::vector<model::term_id> terms;
+    std::vector<model::offset> prev_log_indices;
+    std::vector<model::term_id> prev_log_terms;
+    std::vector<model::offset> last_visible_indices;
+    std::vector<model::revision_id> revisions;
+    std::vector<model::revision_id> target_revisions;
+};
+
+struct hbeat_response_array {
+    explicit hbeat_response_array(size_t n)
+      : groups(n)
+      , terms(n)
+      , last_flushed_log_index(n)
+      , last_dirty_log_index(n)
+      , last_term_base_offset(n)
+      , revisions(n)
+      , target_revisions(n) {}
+
+    std::vector<raft::group_id> groups;
+    std::vector<model::term_id> terms;
+    std::vector<model::offset> last_flushed_log_index;
+    std::vector<model::offset> last_dirty_log_index;
+    std::vector<model::offset> last_term_base_offset;
+    std::vector<model::revision_id> revisions;
+    std::vector<model::revision_id> target_revisions;
+};
+template<typename T>
+void encode_one_vint(iobuf& out, const T& t) {
+    auto b = vint::to_bytes(t);
+    // NOLINTNEXTLINE
+    out.append(reinterpret_cast<const char*>(b.data()), b.size());
+}
+
+template<typename T>
+void encode_varint_delta(iobuf& out, const T& prev, const T& current) {
+    // TODO: use delta-delta:
+    // https://github.com/facebookarchive/beringei/blob/92784ec6e2/beringei/lib/BitUtil.cpp
+    auto delta = current - prev;
+    encode_one_vint(out, delta);
+}
+
+template<typename T>
+void encode_one_delta_array(iobuf& o, const std::vector<T>& v) {
+    if (v.empty()) {
+        return;
+    }
+    const size_t max = v.size();
+    encode_one_vint(o, v[0]);
+    for (size_t i = 1; i < max; ++i) {
+        encode_varint_delta(o, v[i - 1], v[i]);
+    }
+}
+template<typename T>
+T read_one_varint_delta(iobuf_parser& in, const T& prev) {
+    auto dst = varlong_reader<T>(in);
+    return prev + dst;
+}
+} // namespace internal
+} // namespace
+
 namespace raft {
 
 replicate_stages::replicate_stages(
@@ -219,12 +311,6 @@ void adl<raft::protocol_metadata>::to(
       idx);
 }
 
-template<typename T>
-T varlong_reader(iobuf_parser& in) {
-    auto [val, len] = in.read_varlong();
-    return T(val);
-}
-
 raft::protocol_metadata adl<raft::protocol_metadata>::from(iobuf_parser& in) {
     raft::protocol_metadata ret;
     ret.group = varlong_reader<raft::group_id>(in);
@@ -235,84 +321,6 @@ raft::protocol_metadata adl<raft::protocol_metadata>::from(iobuf_parser& in) {
     ret.last_visible_index = varlong_reader<model::offset>(in);
     return ret;
 }
-namespace internal {
-struct hbeat_soa {
-    explicit hbeat_soa(size_t n)
-      : groups(n)
-      , commit_indices(n)
-      , terms(n)
-      , prev_log_indices(n)
-      , prev_log_terms(n)
-      , last_visible_indices(n)
-      , revisions(n)
-      , target_revisions(n) {}
-
-    ~hbeat_soa() noexcept = default;
-    hbeat_soa(const hbeat_soa&) = delete;
-    hbeat_soa& operator=(const hbeat_soa&) = delete;
-    hbeat_soa(hbeat_soa&&) noexcept = default;
-    hbeat_soa& operator=(hbeat_soa&&) noexcept = default;
-
-    std::vector<raft::group_id> groups;
-    std::vector<model::offset> commit_indices;
-    std::vector<model::term_id> terms;
-    std::vector<model::offset> prev_log_indices;
-    std::vector<model::term_id> prev_log_terms;
-    std::vector<model::offset> last_visible_indices;
-    std::vector<model::revision_id> revisions;
-    std::vector<model::revision_id> target_revisions;
-};
-
-struct hbeat_response_array {
-    explicit hbeat_response_array(size_t n)
-      : groups(n)
-      , terms(n)
-      , last_flushed_log_index(n)
-      , last_dirty_log_index(n)
-      , last_term_base_offset(n)
-      , revisions(n)
-      , target_revisions(n) {}
-
-    std::vector<raft::group_id> groups;
-    std::vector<model::term_id> terms;
-    std::vector<model::offset> last_flushed_log_index;
-    std::vector<model::offset> last_dirty_log_index;
-    std::vector<model::offset> last_term_base_offset;
-    std::vector<model::revision_id> revisions;
-    std::vector<model::revision_id> target_revisions;
-};
-template<typename T>
-void encode_one_vint(iobuf& out, const T& t) {
-    auto b = vint::to_bytes(t);
-    // NOLINTNEXTLINE
-    out.append(reinterpret_cast<const char*>(b.data()), b.size());
-}
-
-template<typename T>
-void encode_varint_delta(iobuf& out, const T& prev, const T& current) {
-    // TODO: use delta-delta:
-    // https://github.com/facebookarchive/beringei/blob/92784ec6e2/beringei/lib/BitUtil.cpp
-    auto delta = current - prev;
-    encode_one_vint(out, delta);
-}
-
-template<typename T>
-void encode_one_delta_array(iobuf& o, const std::vector<T>& v) {
-    if (v.empty()) {
-        return;
-    }
-    const size_t max = v.size();
-    encode_one_vint(o, v[0]);
-    for (size_t i = 1; i < max; ++i) {
-        encode_varint_delta(o, v[i - 1], v[i]);
-    }
-}
-template<typename T>
-T read_one_varint_delta(iobuf_parser& in, const T& prev) {
-    auto dst = varlong_reader<T>(in);
-    return prev + dst;
-}
-} // namespace internal
 
 ss::future<> async_adl<raft::heartbeat_request>::to(
   iobuf& out, raft::heartbeat_request&& request) {
@@ -386,11 +394,6 @@ ss::future<> async_adl<raft::heartbeat_request>::to(
       });
 }
 
-template<typename T>
-T decode_signed(T value) {
-    return value < T(0) ? T{} : value;
-}
-
 ss::future<raft::heartbeat_request>
 async_adl<raft::heartbeat_request>::from(iobuf_parser& in) {
     raft::heartbeat_request req;

From 3c05c18adf7d65a45b13646ffcee89cc5c3d14ff Mon Sep 17 00:00:00 2001
From: Noah Watkins <noahwatkins@gmail.com>
Date: Tue, 12 Jul 2022 15:49:06 -0700
Subject: [PATCH 104/201] raft: add serde support to heartbeat_request

Despite retaining the optimized, custom encoding protocol used in ADL
the code is duplicated for use with serde because some primitives are
using normal ADL encoding and those have been switched to serde.

Writing a heartbeat request is made async to deal with potential
reactor stalls.

Signed-off-by: Noah Watkins <noahwatkins@gmail.com>
---
 src/v/cluster/tests/serialization_rt_test.cc |  67 ++++++++
 src/v/raft/types.cc                          | 158 +++++++++++++++++++
 src/v/raft/types.h                           |  20 ++-
 3 files changed, 244 insertions(+), 1 deletion(-)

diff --git a/src/v/cluster/tests/serialization_rt_test.cc b/src/v/cluster/tests/serialization_rt_test.cc
index 99d34637f639e..fb7bb3e2b9c0a 100644
--- a/src/v/cluster/tests/serialization_rt_test.cc
+++ b/src/v/cluster/tests/serialization_rt_test.cc
@@ -20,6 +20,7 @@
 #include "raft/types.h"
 #include "random/generators.h"
 #include "reflection/adl.h"
+#include "reflection/async_adl.h"
 #include "storage/types.h"
 #include "test_utils/randoms.h"
 #include "test_utils/rpc.h"
@@ -2088,6 +2089,72 @@ SEASTAR_THREAD_TEST_CASE(serde_reflection_roundtrip) {
         };
         roundtrip_test(data);
     }
+    {
+        raft::heartbeat_request data;
+
+        // heartbeat request uses the first node/target_node for all of the
+        // heartbeat metadata entries. so here we arrange for that to be true in
+        // the input data so that equality works as expected.
+        const auto node_id = tests::random_named_int<model::node_id>();
+        const auto target_node_id = tests::random_named_int<model::node_id>();
+
+        for (auto i = 0, mi = random_generators::get_int(1, 20); i < mi; ++i) {
+            raft::protocol_metadata meta{
+              .group = tests::random_named_int<raft::group_id>(),
+              .commit_index = tests::random_named_int<model::offset>(),
+              .term = tests::random_named_int<model::term_id>(),
+              .prev_log_index = tests::random_named_int<model::offset>(),
+              .prev_log_term = tests::random_named_int<model::term_id>(),
+              .last_visible_index = tests::random_named_int<model::offset>(),
+            };
+            raft::heartbeat_metadata hm{
+              .meta = meta,
+              .node_id = raft::
+                vnode{node_id, tests::random_named_int<model::revision_id>()},
+              .target_node_id = raft::
+                vnode{target_node_id, tests::random_named_int<model::revision_id>()},
+            };
+            data.heartbeats.push_back(hm);
+        }
+
+        // encoder will sort automatically. so for equality to work as expected
+        // we use the same sorting for the input as the expected output.
+        struct sorter_fn {
+            constexpr bool operator()(
+              const raft::heartbeat_metadata& lhs,
+              const raft::heartbeat_metadata& rhs) const {
+                return lhs.meta.commit_index < rhs.meta.commit_index;
+            }
+        };
+
+        std::sort(data.heartbeats.begin(), data.heartbeats.end(), sorter_fn{});
+
+        // serde round trip test async version
+        {
+            auto serde_in = data;
+            iobuf serde_out;
+            serde::write_async(serde_out, std::move(serde_in)).get();
+            auto from_serde = serde::from_iobuf<raft::heartbeat_request>(
+              std::move(serde_out));
+            BOOST_REQUIRE(data == from_serde);
+        }
+
+        // the adl test needs to force async to avoid the automatic reflection
+        // version of the encoder.
+        {
+            auto adl_in = data;
+            iobuf adl_out;
+            reflection::async_adl<raft::heartbeat_request>{}
+              .to(adl_out, std::move(adl_in))
+              .get();
+            iobuf_parser in(std::move(adl_out));
+            auto from_adl = reflection::async_adl<raft::heartbeat_request>{}
+                              .from(in)
+                              .get0();
+
+            BOOST_REQUIRE(data == from_adl);
+        }
+    }
 }
 
 SEASTAR_THREAD_TEST_CASE(cluster_property_kv_exchangable_with_pair) {
diff --git a/src/v/raft/types.cc b/src/v/raft/types.cc
index b4369e3852012..c88b76da3acb4 100644
--- a/src/v/raft/types.cc
+++ b/src/v/raft/types.cc
@@ -19,6 +19,8 @@
 #include "vassert.h"
 #include "vlog.h"
 
+#include <seastar/coroutine/maybe_yield.hh>
+
 #include <fmt/ostream.h>
 
 #include <chrono>
@@ -249,6 +251,162 @@ std::ostream& operator<<(std::ostream& o, const install_snapshot_reply& r) {
     return o;
 }
 
+ss::future<> heartbeat_request::serde_async_write(iobuf& dst) {
+    vassert(!heartbeats.empty(), "cannot serialize empty heartbeats request");
+
+    struct sorter_fn {
+        constexpr bool operator()(
+          const raft::heartbeat_metadata& lhs,
+          const raft::heartbeat_metadata& rhs) const {
+            return lhs.meta.commit_index < rhs.meta.commit_index;
+        }
+    };
+
+    iobuf out;
+    auto& request = *this;
+
+    std::sort(
+      request.heartbeats.begin(), request.heartbeats.end(), sorter_fn{});
+
+    co_await ss::coroutine::maybe_yield();
+
+    internal::hbeat_soa encodee(request.heartbeats.size());
+    // target physical node id is always the same it differs only by
+    // revision
+
+    const size_t size = request.heartbeats.size();
+    for (size_t i = 0; i < size; ++i) {
+        const auto& m = request.heartbeats[i].meta;
+        const raft::vnode node = request.heartbeats[i].node_id;
+        const raft::vnode target_node = request.heartbeats[i].target_node_id;
+        vassert(m.group() >= 0, "Negative raft group detected. {}", m.group);
+        encodee.groups[i] = m.group;
+        encodee.commit_indices[i] = std::max(model::offset(-1), m.commit_index);
+        encodee.terms[i] = std::max(model::term_id(-1), m.term);
+        encodee.prev_log_indices[i] = std::max(
+          model::offset(-1), m.prev_log_index);
+        encodee.prev_log_terms[i] = std::max(
+          model::term_id(-1), m.prev_log_term);
+        encodee.last_visible_indices[i] = std::max(
+          model::offset(-1), m.last_visible_index);
+        encodee.revisions[i] = std::max(
+          model::revision_id(-1), node.revision());
+        encodee.target_revisions[i] = std::max(
+          model::revision_id(-1), target_node.revision());
+
+        co_await ss::coroutine::maybe_yield();
+    }
+    // heartbeats must stay alive past this point: the physical node ids are
+    // read from heartbeats.front() below.
+
+    using serde::write;
+
+    // physical node ids are the same for all requests
+    write(out, request.heartbeats.front().node_id.id());
+    write(out, request.heartbeats.front().target_node_id.id());
+    write(out, static_cast<uint32_t>(size));
+
+    internal::encode_one_delta_array<raft::group_id>(out, encodee.groups);
+    internal::encode_one_delta_array<model::offset>(
+      out, encodee.commit_indices);
+    internal::encode_one_delta_array<model::term_id>(out, encodee.terms);
+    internal::encode_one_delta_array<model::offset>(
+      out, encodee.prev_log_indices);
+    internal::encode_one_delta_array<model::term_id>(
+      out, encodee.prev_log_terms);
+    internal::encode_one_delta_array<model::offset>(
+      out, encodee.last_visible_indices);
+    internal::encode_one_delta_array<model::revision_id>(
+      out, encodee.revisions);
+    internal::encode_one_delta_array<model::revision_id>(
+      out, encodee.target_revisions);
+
+    write(dst, std::move(out));
+}
+
+void heartbeat_request::serde_read(
+  iobuf_parser& src, const serde::header& hdr) {
+    using serde::read_nested;
+    auto tmp = read_nested<iobuf>(src, hdr._bytes_left_limit);
+    iobuf_parser in(std::move(tmp));
+
+    auto& req = *this;
+    auto node_id = read_nested<model::node_id>(in, 0U);
+    auto target_node = read_nested<model::node_id>(in, 0U);
+    req.heartbeats = std::vector<raft::heartbeat_metadata>(
+      read_nested<uint32_t>(in, 0U));
+    if (req.heartbeats.empty()) {
+        return;
+    }
+    const size_t max = req.heartbeats.size();
+    req.heartbeats[0].meta.group = varlong_reader<raft::group_id>(in);
+    for (size_t i = 1; i < max; ++i) {
+        req.heartbeats[i].meta.group
+          = internal::read_one_varint_delta<raft::group_id>(
+            in, req.heartbeats[i - 1].meta.group);
+    }
+    req.heartbeats[0].meta.commit_index = varlong_reader<model::offset>(in);
+    for (size_t i = 1; i < max; ++i) {
+        req.heartbeats[i].meta.commit_index
+          = internal::read_one_varint_delta<model::offset>(
+            in, req.heartbeats[i - 1].meta.commit_index);
+    }
+    req.heartbeats[0].meta.term = varlong_reader<model::term_id>(in);
+    for (size_t i = 1; i < max; ++i) {
+        req.heartbeats[i].meta.term
+          = internal::read_one_varint_delta<model::term_id>(
+            in, req.heartbeats[i - 1].meta.term);
+    }
+    req.heartbeats[0].meta.prev_log_index = varlong_reader<model::offset>(in);
+    for (size_t i = 1; i < max; ++i) {
+        req.heartbeats[i].meta.prev_log_index
+          = internal::read_one_varint_delta<model::offset>(
+            in, req.heartbeats[i - 1].meta.prev_log_index);
+    }
+    req.heartbeats[0].meta.prev_log_term = varlong_reader<model::term_id>(in);
+    for (size_t i = 1; i < max; ++i) {
+        req.heartbeats[i].meta.prev_log_term
+          = internal::read_one_varint_delta<model::term_id>(
+            in, req.heartbeats[i - 1].meta.prev_log_term);
+    }
+    req.heartbeats[0].meta.last_visible_index = varlong_reader<model::offset>(
+      in);
+    for (size_t i = 1; i < max; ++i) {
+        req.heartbeats[i].meta.last_visible_index
+          = internal::read_one_varint_delta<model::offset>(
+            in, req.heartbeats[i - 1].meta.last_visible_index);
+    }
+
+    req.heartbeats[0].node_id = raft::vnode(
+      node_id, varlong_reader<model::revision_id>(in));
+    for (size_t i = 1; i < max; ++i) {
+        req.heartbeats[i].node_id = raft::vnode(
+          node_id,
+          internal::read_one_varint_delta<model::revision_id>(
+            in, req.heartbeats[i - 1].node_id.revision()));
+    }
+
+    req.heartbeats[0].target_node_id = raft::vnode(
+      target_node, varlong_reader<model::revision_id>(in));
+    for (size_t i = 1; i < max; ++i) {
+        req.heartbeats[i].target_node_id = raft::vnode(
+          target_node,
+          internal::read_one_varint_delta<model::revision_id>(
+            in, req.heartbeats[i - 1].target_node_id.revision()));
+    }
+
+    for (auto& hb : req.heartbeats) {
+        hb.meta.prev_log_index = decode_signed(hb.meta.prev_log_index);
+        hb.meta.commit_index = decode_signed(hb.meta.commit_index);
+        hb.meta.prev_log_term = decode_signed(hb.meta.prev_log_term);
+        hb.meta.last_visible_index = decode_signed(hb.meta.last_visible_index);
+        hb.node_id = raft::vnode(
+          hb.node_id.id(), decode_signed(hb.node_id.revision()));
+        hb.target_node_id = raft::vnode(
+          hb.target_node_id.id(), decode_signed(hb.target_node_id.revision()));
+    }
+}
+
 } // namespace raft
 
 namespace reflection {
diff --git a/src/v/raft/types.h b/src/v/raft/types.h
index 61e169bca25a8..aae451f968afd 100644
--- a/src/v/raft/types.h
+++ b/src/v/raft/types.h
@@ -54,6 +54,9 @@ struct protocol_metadata {
 
     friend std::ostream&
     operator<<(std::ostream& o, const protocol_metadata& m);
+
+    friend bool operator==(const protocol_metadata&, const protocol_metadata&)
+      = default;
 };
 
 // The sequence used to track the order of follower append entries request
@@ -279,6 +282,9 @@ struct heartbeat_metadata {
     protocol_metadata meta;
     vnode node_id;
     vnode target_node_id;
+
+    friend bool operator==(const heartbeat_metadata&, const heartbeat_metadata&)
+      = default;
 };
 
 /// \brief this is our _biggest_ modification to how raft works
@@ -287,10 +293,22 @@ struct heartbeat_metadata {
 /// at a time, as well as the receiving side will trigger the
 /// individual raft responses one at a time - for example to start replaying the
 /// log at some offset
-struct heartbeat_request {
+struct heartbeat_request
+  : serde::envelope<heartbeat_request, serde::version<0>> {
     std::vector<heartbeat_metadata> heartbeats;
+
+    heartbeat_request() noexcept = default;
+    explicit heartbeat_request(std::vector<heartbeat_metadata> heartbeats)
+      : heartbeats(std::move(heartbeats)) {}
+
     friend std::ostream&
     operator<<(std::ostream& o, const heartbeat_request& r);
+
+    friend bool operator==(const heartbeat_request&, const heartbeat_request&)
+      = default;
+
+    ss::future<> serde_async_write(iobuf& out);
+    void serde_read(iobuf_parser&, const serde::header&);
 };
 struct heartbeat_reply {
     std::vector<append_entries_reply> meta;

From 4d60f4307d2d40cfb7776d09b58b759a37270ad5 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noahwatkins@gmail.com>
Date: Wed, 13 Jul 2022 10:25:14 -0700
Subject: [PATCH 105/201] raft: add serde support to heartbeat_reply

Despite retaining the optimized, custom encoding protocol
used in ADL the code is duplicated for use with serde
because some primitives are using normal ADL encoding and
those have been switched to serde.

Signed-off-by: Noah Watkins <noahwatkins@gmail.com>
---
 src/v/cluster/tests/serialization_rt_test.cc |  55 +++++++
 src/v/raft/types.cc                          | 146 +++++++++++++++++++
 src/v/raft/types.h                           |  18 ++-
 3 files changed, 218 insertions(+), 1 deletion(-)

diff --git a/src/v/cluster/tests/serialization_rt_test.cc b/src/v/cluster/tests/serialization_rt_test.cc
index fb7bb3e2b9c0a..fc98070395f21 100644
--- a/src/v/cluster/tests/serialization_rt_test.cc
+++ b/src/v/cluster/tests/serialization_rt_test.cc
@@ -2155,6 +2155,61 @@ SEASTAR_THREAD_TEST_CASE(serde_reflection_roundtrip) {
             BOOST_REQUIRE(data == from_adl);
         }
     }
+    {
+        raft::heartbeat_reply data;
+
+        // heartbeat reply uses the first node/target_node for all of the
+        // reply metadata entries. so here we arrange for that to be true in
+        // the input data so that equality works as expected.
+        const auto node_id = tests::random_named_int<model::node_id>();
+        const auto target_node_id = tests::random_named_int<model::node_id>();
+
+        for (auto i = 0, mi = random_generators::get_int(1, 20); i < mi; ++i) {
+            raft::append_entries_reply reply{
+              .target_node_id = raft::
+                vnode{target_node_id, tests::random_named_int<model::revision_id>()},
+              .node_id = raft::
+                vnode{node_id, tests::random_named_int<model::revision_id>()},
+              .group = tests::random_named_int<raft::group_id>(),
+              .term = tests::random_named_int<model::term_id>(),
+              .last_flushed_log_index
+              = tests::random_named_int<model::offset>(),
+              .last_dirty_log_index = tests::random_named_int<model::offset>(),
+              .last_term_base_offset = tests::random_named_int<model::offset>(),
+              .result = raft::append_entries_reply::status::group_unavailable,
+            };
+            data.meta.push_back(reply);
+        }
+
+        // encoder will sort automatically. so for equality to work as expected
+        // we use the same sorting for the input as the expected output.
+        struct sorter_fn {
+            constexpr bool operator()(
+              const raft::append_entries_reply& lhs,
+              const raft::append_entries_reply& rhs) const {
+                return lhs.last_flushed_log_index < rhs.last_flushed_log_index;
+            }
+        };
+
+        std::sort(data.meta.begin(), data.meta.end(), sorter_fn{});
+
+        serde_roundtrip_test(data);
+
+        // the adl test needs to force async to avoid the automatic reflection
+        // version of the encoder.
+        {
+            auto adl_in = data;
+            iobuf adl_out;
+            reflection::async_adl<raft::heartbeat_reply>{}
+              .to(adl_out, std::move(adl_in))
+              .get();
+            iobuf_parser in(std::move(adl_out));
+            auto from_adl
+              = reflection::async_adl<raft::heartbeat_reply>{}.from(in).get0();
+
+            BOOST_REQUIRE(data == from_adl);
+        }
+    }
 }
 
 SEASTAR_THREAD_TEST_CASE(cluster_property_kv_exchangable_with_pair) {
diff --git a/src/v/raft/types.cc b/src/v/raft/types.cc
index c88b76da3acb4..4abdef3295a63 100644
--- a/src/v/raft/types.cc
+++ b/src/v/raft/types.cc
@@ -407,6 +407,152 @@ void heartbeat_request::serde_read(
     }
 }
 
+void heartbeat_reply::serde_write(iobuf& dst) {
+    using serde::write;
+    // the payload is staged in `out` and handed to `dst` as one nested iobuf
+    auto& reply = *this;
+    iobuf out;
+
+    struct sorter_fn {
+        constexpr bool operator()(
+          const raft::append_entries_reply& lhs,
+          const raft::append_entries_reply& rhs) const {
+            return lhs.last_flushed_log_index < rhs.last_flushed_log_index;
+        }
+    };
+
+    write(out, static_cast<uint32_t>(reply.meta.size()));
+    if (reply.meta.empty()) {
+        // no replies: serde_read still expects the count-only payload
+        write(dst, std::move(out));
+        return;
+    }
+
+    // all replies come from one physical node and target one physical node
+    write(out, reply.meta.front().node_id.id());
+    write(out, reply.meta.front().target_node_id.id());
+    std::sort(reply.meta.begin(), reply.meta.end(), sorter_fn{});
+    internal::hbeat_response_array encodee(reply.meta.size());
+
+    for (size_t i = 0; i < reply.meta.size(); ++i) {
+        encodee.groups[i] = reply.meta[i].group;
+        encodee.terms[i] = std::max(model::term_id(-1), reply.meta[i].term);
+
+        encodee.last_flushed_log_index[i] = std::max(
+          model::offset(-1), reply.meta[i].last_flushed_log_index);
+        encodee.last_dirty_log_index[i] = std::max(
+          model::offset(-1), reply.meta[i].last_dirty_log_index);
+        encodee.last_term_base_offset[i] = std::max(
+          model::offset(-1), reply.meta[i].last_term_base_offset);
+        encodee.revisions[i] = std::max(
+          model::revision_id(-1), reply.meta[i].node_id.revision());
+        encodee.target_revisions[i] = std::max(
+          model::revision_id(-1), reply.meta[i].target_node_id.revision());
+    }
+    internal::encode_one_delta_array<raft::group_id>(out, encodee.groups);
+    internal::encode_one_delta_array<model::term_id>(out, encodee.terms);
+
+    internal::encode_one_delta_array<model::offset>(
+      out, encodee.last_flushed_log_index);
+    internal::encode_one_delta_array<model::offset>(
+      out, encodee.last_dirty_log_index);
+    internal::encode_one_delta_array<model::offset>(
+      out, encodee.last_term_base_offset);
+    internal::encode_one_delta_array<model::revision_id>(
+      out, encodee.revisions);
+    internal::encode_one_delta_array<model::revision_id>(
+      out, encodee.target_revisions);
+    for (auto& m : reply.meta) {
+        write(out, m.result);
+    }
+
+    write(dst, std::move(out));
+}
+
+void heartbeat_reply::serde_read(iobuf_parser& src, const serde::header& hdr) {
+    using serde::read_nested;
+    auto tmp = read_nested<iobuf>(src, hdr._bytes_left_limit);
+    iobuf_parser in(std::move(tmp));
+
+    auto& reply = *this;
+    reply.meta = std::vector<raft::append_entries_reply>(
+      read_nested<uint32_t>(in, 0U));
+
+    // empty reply
+    if (reply.meta.empty()) {
+        return;
+    }
+
+    auto node_id = read_nested<model::node_id>(in, 0U);
+    auto target_node_id = read_nested<model::node_id>(in, 0U);
+
+    size_t size = reply.meta.size();
+    reply.meta[0].group = varlong_reader<raft::group_id>(in);
+    for (size_t i = 1; i < size; ++i) {
+        reply.meta[i].group = internal::read_one_varint_delta<raft::group_id>(
+          in, reply.meta[i - 1].group);
+    }
+    reply.meta[0].term = varlong_reader<model::term_id>(in);
+    for (size_t i = 1; i < size; ++i) {
+        reply.meta[i].term = internal::read_one_varint_delta<model::term_id>(
+          in, reply.meta[i - 1].term);
+    }
+
+    reply.meta[0].last_flushed_log_index = varlong_reader<model::offset>(in);
+    for (size_t i = 1; i < size; ++i) {
+        reply.meta[i].last_flushed_log_index
+          = internal::read_one_varint_delta<model::offset>(
+            in, reply.meta[i - 1].last_flushed_log_index);
+    }
+
+    reply.meta[0].last_dirty_log_index = varlong_reader<model::offset>(in);
+    for (size_t i = 1; i < size; ++i) {
+        reply.meta[i].last_dirty_log_index
+          = internal::read_one_varint_delta<model::offset>(
+            in, reply.meta[i - 1].last_dirty_log_index);
+    }
+
+    reply.meta[0].last_term_base_offset = varlong_reader<model::offset>(in);
+    for (size_t i = 1; i < size; ++i) {
+        reply.meta[i].last_term_base_offset
+          = internal::read_one_varint_delta<model::offset>(
+            in, reply.meta[i - 1].last_term_base_offset);
+    }
+
+    reply.meta[0].node_id = raft::vnode(
+      node_id, varlong_reader<model::revision_id>(in));
+    for (size_t i = 1; i < size; ++i) {
+        reply.meta[i].node_id = raft::vnode(
+          node_id,
+          internal::read_one_varint_delta<model::revision_id>(
+            in, reply.meta[i - 1].node_id.revision()));
+    }
+
+    reply.meta[0].target_node_id = raft::vnode(
+      target_node_id, varlong_reader<model::revision_id>(in));
+    for (size_t i = 1; i < size; ++i) {
+        reply.meta[i].target_node_id = raft::vnode(
+          target_node_id,
+          internal::read_one_varint_delta<model::revision_id>(
+            in, reply.meta[i - 1].target_node_id.revision()));
+    }
+
+    for (size_t i = 0; i < size; ++i) {
+        reply.meta[i].result = read_nested<raft::append_entries_reply::status>(
+          in, 0U);
+    }
+
+    for (auto& m : reply.meta) {
+        m.last_flushed_log_index = decode_signed(m.last_flushed_log_index);
+        m.last_dirty_log_index = decode_signed(m.last_dirty_log_index);
+        m.last_term_base_offset = decode_signed(m.last_term_base_offset);
+        m.node_id = raft::vnode(
+          m.node_id.id(), decode_signed(m.node_id.revision()));
+        m.target_node_id = raft::vnode(
+          m.target_node_id.id(), decode_signed(m.target_node_id.revision()));
+    }
+}
+
 } // namespace raft
 
 namespace reflection {
diff --git a/src/v/raft/types.h b/src/v/raft/types.h
index aae451f968afd..ca655fc3e1933 100644
--- a/src/v/raft/types.h
+++ b/src/v/raft/types.h
@@ -276,6 +276,10 @@ struct append_entries_reply {
 
     friend std::ostream&
     operator<<(std::ostream& o, const append_entries_reply& r);
+
+    friend bool
+    operator==(const append_entries_reply&, const append_entries_reply&)
+      = default;
 };
 
 struct heartbeat_metadata {
@@ -310,9 +314,21 @@ struct heartbeat_request
     ss::future<> serde_async_write(iobuf& out);
     void serde_read(iobuf_parser&, const serde::header&);
 };
-struct heartbeat_reply {
+
+struct heartbeat_reply : serde::envelope<heartbeat_reply, serde::version<0>> {
     std::vector<append_entries_reply> meta;
+
+    heartbeat_reply() noexcept = default;
+    explicit heartbeat_reply(std::vector<append_entries_reply> meta)
+      : meta(std::move(meta)) {}
+
     friend std::ostream& operator<<(std::ostream& o, const heartbeat_reply& r);
+
+    friend bool operator==(const heartbeat_reply&, const heartbeat_reply&)
+      = default;
+
+    void serde_write(iobuf& out);
+    void serde_read(iobuf_parser&, const serde::header&);
 };
 
 struct vote_request : serde::envelope<vote_request, serde::version<0>> {

From ccdee42471dc067f056e369c3e5e6ba0ecdf29b4 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noahwatkins@gmail.com>
Date: Wed, 13 Jul 2022 20:39:26 -0700
Subject: [PATCH 106/201] raft: add serde support for protocol_metadata

Signed-off-by: Noah Watkins <noahwatkins@gmail.com>
---
 src/v/cluster/tests/serialization_rt_test.cc | 11 +++++++++++
 src/v/raft/types.h                           | 14 +++++++++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/v/cluster/tests/serialization_rt_test.cc b/src/v/cluster/tests/serialization_rt_test.cc
index fc98070395f21..6dc69bdb78f29 100644
--- a/src/v/cluster/tests/serialization_rt_test.cc
+++ b/src/v/cluster/tests/serialization_rt_test.cc
@@ -2210,6 +2210,17 @@ SEASTAR_THREAD_TEST_CASE(serde_reflection_roundtrip) {
             BOOST_REQUIRE(data == from_adl);
         }
     }
+    {
+        raft::protocol_metadata data{
+          .group = tests::random_named_int<raft::group_id>(),
+          .commit_index = tests::random_named_int<model::offset>(),
+          .term = tests::random_named_int<model::term_id>(),
+          .prev_log_index = tests::random_named_int<model::offset>(),
+          .prev_log_term = tests::random_named_int<model::term_id>(),
+          .last_visible_index = tests::random_named_int<model::offset>(),
+        };
+        roundtrip_test(data);
+    }
 }
 
 SEASTAR_THREAD_TEST_CASE(cluster_property_kv_exchangable_with_pair) {
diff --git a/src/v/raft/types.h b/src/v/raft/types.h
index ca655fc3e1933..d921c8bff2db9 100644
--- a/src/v/raft/types.h
+++ b/src/v/raft/types.h
@@ -44,7 +44,9 @@ static constexpr clock_type::time_point no_timeout
   = clock_type::time_point::max();
 
 using group_id = named_type<int64_t, struct raft_group_id_type>;
-struct protocol_metadata {
+
+struct protocol_metadata
+  : serde::envelope<protocol_metadata, serde::version<0>> {
     group_id group;
     model::offset commit_index;
     model::term_id term;
@@ -57,6 +59,16 @@ struct protocol_metadata {
 
     friend bool operator==(const protocol_metadata&, const protocol_metadata&)
       = default;
+
+    auto serde_fields() {
+        return std::tie(
+          group,
+          commit_index,
+          term,
+          prev_log_index,
+          prev_log_term,
+          last_visible_index);
+    }
 };
 
 // The sequence used to track the order of follower append entries request

From 713e2d5ef86125add5b1aca5fc9bc9ac4b1df462 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noahwatkins@gmail.com>
Date: Wed, 13 Jul 2022 11:39:37 -0700
Subject: [PATCH 107/201] raft: add serde support to append_entries_request

Signed-off-by: Noah Watkins <noahwatkins@gmail.com>
---
 src/v/cluster/tests/serialization_rt_test.cc | 58 ++++++++++++++++++++
 src/v/raft/types.cc                          | 45 +++++++++++++++
 src/v/raft/types.h                           | 12 +++-
 3 files changed, 114 insertions(+), 1 deletion(-)

diff --git a/src/v/cluster/tests/serialization_rt_test.cc b/src/v/cluster/tests/serialization_rt_test.cc
index 6dc69bdb78f29..a33fd60cdd875 100644
--- a/src/v/cluster/tests/serialization_rt_test.cc
+++ b/src/v/cluster/tests/serialization_rt_test.cc
@@ -15,6 +15,7 @@
 #include "model/compression.h"
 #include "model/fundamental.h"
 #include "model/metadata.h"
+#include "model/tests/random_batch.h"
 #include "model/tests/randoms.h"
 #include "model/timestamp.h"
 #include "raft/types.h"
@@ -2221,6 +2222,63 @@ SEASTAR_THREAD_TEST_CASE(serde_reflection_roundtrip) {
         };
         roundtrip_test(data);
     }
+    {
+        const auto gold = model::test::make_random_batches(
+          model::offset(0), 20);
+
+        // make a copy of the source batches for later comparison because the
+        // copy moved into the request will get eaten.
+        ss::circular_buffer<model::record_batch> batches_in;
+        for (const auto& batch : gold) {
+            batches_in.push_back(batch.copy());
+        }
+
+        raft::protocol_metadata pmd{
+          .group = tests::random_named_int<raft::group_id>(),
+          .commit_index = tests::random_named_int<model::offset>(),
+          .term = tests::random_named_int<model::term_id>(),
+          .prev_log_index = tests::random_named_int<model::offset>(),
+          .prev_log_term = tests::random_named_int<model::term_id>(),
+          .last_visible_index = tests::random_named_int<model::offset>(),
+        };
+
+        raft::append_entries_request data{
+          raft::vnode{
+            tests::random_named_int<model::node_id>(),
+            tests::random_named_int<model::revision_id>()},
+          raft::vnode{
+            tests::random_named_int<model::node_id>(),
+            tests::random_named_int<model::revision_id>()},
+          pmd,
+          model::make_memory_record_batch_reader(std::move(batches_in)),
+          raft::append_entries_request::flush_after_append(
+            tests::random_bool()),
+        };
+
+        // append_entries_request -> iobuf
+        iobuf serde_out;
+        serde::write_async(serde_out, std::move(data)).get();
+
+        // iobuf -> append_entries_request
+        iobuf_parser serde_in(std::move(serde_out));
+        auto from_serde
+          = serde::read_async<raft::append_entries_request>(serde_in).get0();
+
+        BOOST_REQUIRE(from_serde.node_id == data.node_id);
+        BOOST_REQUIRE(from_serde.target_node_id == data.target_node_id);
+        BOOST_REQUIRE(from_serde.meta == data.meta);
+        BOOST_REQUIRE(from_serde.flush == data.flush);
+
+        auto batches_from_serde = model::consume_reader_to_memory(
+                                    std::move(from_serde.batches()),
+                                    model::no_timeout)
+                                    .get0();
+        BOOST_REQUIRE(gold.size() > 0);
+        BOOST_REQUIRE(batches_from_serde.size() == gold.size());
+        for (size_t i = 0; i < gold.size(); i++) {
+            BOOST_REQUIRE(batches_from_serde[i] == gold[i]);
+        }
+    }
 }
 
 SEASTAR_THREAD_TEST_CASE(cluster_property_kv_exchangable_with_pair) {
diff --git a/src/v/raft/types.cc b/src/v/raft/types.cc
index 4abdef3295a63..174daab975eaa 100644
--- a/src/v/raft/types.cc
+++ b/src/v/raft/types.cc
@@ -553,6 +553,51 @@ void heartbeat_reply::serde_read(iobuf_parser& src, const serde::header& hdr) {
     }
 }
 
+ss::future<> append_entries_request::serde_async_write(iobuf& dst) {
+    auto mem_batches = co_await model::consume_reader_to_memory(
+      std::move(batches()), model::no_timeout);
+
+    iobuf out;
+    using serde::write;
+
+    write(out, static_cast<uint32_t>(mem_batches.size()));
+    for (auto& batch : mem_batches) {
+        // intentionally using reflection here for batches which are not yet
+        // supported with serde, but also have largely solidified.
+        reflection::serialize(out, std::move(batch));
+        co_await ss::coroutine::maybe_yield();
+    }
+
+    write(out, node_id);
+    write(out, target_node_id);
+    write(out, meta);
+    write(out, flush);
+
+    write(dst, std::move(out));
+}
+
+ss::future<> append_entries_request::serde_async_read(
+  iobuf_parser& src, const serde::header& hdr) {
+    using serde::read_nested;
+    auto tmp = read_nested<iobuf>(src, hdr._bytes_left_limit);
+    iobuf_parser in(std::move(tmp));
+
+    auto batch_count = read_nested<uint32_t>(in, 0U);
+    auto batches = ss::circular_buffer<model::record_batch>{};
+    batches.reserve(batch_count);
+    for (uint32_t i = 0; i < batch_count; ++i) {
+        batches.push_back(reflection::adl<model::record_batch>{}.from(in));
+        co_await ss::coroutine::maybe_yield();
+    }
+
+    _batches = model::make_memory_record_batch_reader(std::move(batches));
+    node_id = read_nested<raft::vnode>(in, 0U);
+    target_node_id = read_nested<raft::vnode>(in, 0U);
+    meta = read_nested<raft::protocol_metadata>(in, 0U);
+    flush = read_nested<raft::append_entries_request::flush_after_append>(
+      in, 0U);
+}
+
 } // namespace raft
 
 namespace reflection {
diff --git a/src/v/raft/types.h b/src/v/raft/types.h
index d921c8bff2db9..7320477c38270 100644
--- a/src/v/raft/types.h
+++ b/src/v/raft/types.h
@@ -192,9 +192,16 @@ struct follower_metrics {
     bool under_replicated;
 };
 
-struct append_entries_request {
+struct append_entries_request
+  : serde::envelope<append_entries_request, serde::version<0>> {
     using flush_after_append = ss::bool_class<struct flush_after_append_tag>;
 
+    /*
+     * default initialize with no record batch reader. default construction
+     * should only be used by serialization frameworks.
+     */
+    append_entries_request() noexcept = default;
+
     // required for the cases where we will set the target node id before
     // sending request to the node
     append_entries_request(
@@ -252,6 +259,9 @@ struct append_entries_request {
           req.flush);
     }
 
+    ss::future<> serde_async_write(iobuf& out);
+    ss::future<> serde_async_read(iobuf_parser&, const serde::header&);
+
 private:
     /*
      * batches is optional to allow append_entries_request to have a default

From f387d75832393699d011ad95f35a63670151e2ae Mon Sep 17 00:00:00 2001
From: Noah Watkins <noahwatkins@gmail.com>
Date: Wed, 13 Jul 2022 11:53:41 -0700
Subject: [PATCH 108/201] raft: add adl specialization for append_entries_reply

Signed-off-by: Noah Watkins <noahwatkins@gmail.com>
---
 src/v/raft/types.h | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/src/v/raft/types.h b/src/v/raft/types.h
index 7320477c38270..56afee7a3eafa 100644
--- a/src/v/raft/types.h
+++ b/src/v/raft/types.h
@@ -886,4 +886,41 @@ struct adl<raft::vote_reply> {
         };
     }
 };
+
+template<>
+struct adl<raft::append_entries_reply> {
+    void to(iobuf& out, raft::append_entries_reply&& r) {
+        serialize(
+          out,
+          r.target_node_id,
+          r.node_id,
+          r.group,
+          r.term,
+          r.last_flushed_log_index,
+          r.last_dirty_log_index,
+          r.last_term_base_offset,
+          r.result);
+    }
+    raft::append_entries_reply from(iobuf_parser& in) {
+        auto target_node_id = adl<raft::vnode>{}.from(in);
+        auto node_id = adl<raft::vnode>{}.from(in);
+        auto group = adl<raft::group_id>{}.from(in);
+        auto term = adl<model::term_id>{}.from(in);
+        auto last_flushed_log_index = adl<model::offset>{}.from(in);
+        auto last_dirty_log_index = adl<model::offset>{}.from(in);
+        auto last_term_base_offset = adl<model::offset>{}.from(in);
+        auto result = adl<raft::append_entries_reply::status>{}.from(in);
+        return {
+          .target_node_id = target_node_id,
+          .node_id = node_id,
+          .group = group,
+          .term = term,
+          .last_flushed_log_index = last_flushed_log_index,
+          .last_dirty_log_index = last_dirty_log_index,
+          .last_term_base_offset = last_term_base_offset,
+          .result = result,
+        };
+    }
+};
+
 } // namespace reflection

From 7b45a66e6edf1c81d46e7ded9defc11d5ccc21e1 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noahwatkins@gmail.com>
Date: Wed, 13 Jul 2022 11:57:24 -0700
Subject: [PATCH 109/201] raft: add serde support for append_entries_reply

Signed-off-by: Noah Watkins <noahwatkins@gmail.com>
---
 src/v/cluster/tests/serialization_rt_test.cc | 15 ++++++++++++++
 src/v/raft/types.h                           | 21 +++++++++++++++++++-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/src/v/cluster/tests/serialization_rt_test.cc b/src/v/cluster/tests/serialization_rt_test.cc
index a33fd60cdd875..a5f51bbdc05f5 100644
--- a/src/v/cluster/tests/serialization_rt_test.cc
+++ b/src/v/cluster/tests/serialization_rt_test.cc
@@ -2279,6 +2279,21 @@ SEASTAR_THREAD_TEST_CASE(serde_reflection_roundtrip) {
             BOOST_REQUIRE(batches_from_serde[i] == gold[i]);
         }
     }
+    {
+        raft::append_entries_reply data{
+          .target_node_id = raft::
+            vnode{tests::random_named_int<model::node_id>(), tests::random_named_int<model::revision_id>()},
+          .node_id = raft::
+            vnode{tests::random_named_int<model::node_id>(), tests::random_named_int<model::revision_id>()},
+          .group = tests::random_named_int<raft::group_id>(),
+          .term = tests::random_named_int<model::term_id>(),
+          .last_flushed_log_index = tests::random_named_int<model::offset>(),
+          .last_dirty_log_index = tests::random_named_int<model::offset>(),
+          .last_term_base_offset = tests::random_named_int<model::offset>(),
+          .result = raft::append_entries_reply::status::group_unavailable,
+        };
+        roundtrip_test(data);
+    }
 }
 
 SEASTAR_THREAD_TEST_CASE(cluster_property_kv_exchangable_with_pair) {
diff --git a/src/v/raft/types.h b/src/v/raft/types.h
index 56afee7a3eafa..46d4d730637fd 100644
--- a/src/v/raft/types.h
+++ b/src/v/raft/types.h
@@ -271,7 +271,14 @@ struct append_entries_request
     std::optional<model::record_batch_reader> _batches;
 };
 
-struct append_entries_reply {
+/*
+ * append_entries_reply uses two different types of serialization: when
+ * encoding/decoding directly normal adl/serde per-field serialization is used.
+ * the second type is a custom encoding used by heartbeat_reply for more
+ * efficient encoding of a vectory of append_entries_reply.
+ */
+struct append_entries_reply
+  : serde::envelope<append_entries_reply, serde::version<0>> {
     enum class status : uint8_t {
         success,
         failure,
@@ -302,6 +309,18 @@ struct append_entries_reply {
     friend bool
     operator==(const append_entries_reply&, const append_entries_reply&)
       = default;
+
+    auto serde_fields() {
+        return std::tie(
+          target_node_id,
+          node_id,
+          group,
+          term,
+          last_flushed_log_index,
+          last_dirty_log_index,
+          last_term_base_offset,
+          result);
+    }
 };
 
 struct heartbeat_metadata {

From 289a7b7fa850bbaed64968ad5d565bacc1ac6230 Mon Sep 17 00:00:00 2001
From: Rogger Vasquez <rvasque3@gmail.com>
Date: Tue, 12 Jul 2022 17:57:35 -0500
Subject: [PATCH 110/201] rpk: make redpanda start use loaded file or default

In the viper removal PR (#5061) we introduced
a bug:

rpk redpanda start will load the file and update
some unset defaults such as rpk.kafka_api.

This breaks backward compatibility for some users
so we are correcting it in this commit.
---
 src/go/rpk/pkg/cli/cmd/redpanda/start.go      |  7 +++++++
 src/go/rpk/pkg/cli/cmd/redpanda/start_test.go |  6 ------
 tests/rptest/tests/rpk_config_test.py         | 15 ---------------
 3 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/start.go b/src/go/rpk/pkg/cli/cmd/redpanda/start.go
index 9e5c2b59e9403..29bd6805541ec 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/start.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/start.go
@@ -152,6 +152,13 @@ func NewStartCommand(fs afero.Fs, launcher rp.Launcher) *cobra.Command {
 			if err != nil {
 				return fmt.Errorf("unable to load config file: %s", err)
 			}
+			// We set fields in the raw file without writing rpk specific env
+			// or flag overrides. This command itself has all redpanda specific
+			// flags installed, and handles redpanda specific env vars itself.
+			// The magic `--set` flag is what modifies any redpanda.yaml fields.
+			// Thus, we can ignore any env / flags that would come from rpk
+			// configuration itself.
+			cfg = cfg.FileOrDefaults()
 
 			if len(configKvs) > 0 {
 				if err = setConfig(cfg, configKvs); err != nil {
diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/start_test.go b/src/go/rpk/pkg/cli/cmd/redpanda/start_test.go
index aa843be833d45..cfcfde0a3556d 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/start_test.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/start_test.go
@@ -14,9 +14,7 @@ package redpanda
 
 import (
 	"bytes"
-	"net"
 	"os"
-	"strconv"
 	"testing"
 
 	"github.com/redpanda-data/redpanda/src/go/rpk/pkg/config"
@@ -194,10 +192,6 @@ func TestStartCommand(t *testing.T) {
 				path,
 			)
 			c := config.Default()
-			// Adding unset default that get added on first load.
-			b0 := c.Redpanda.KafkaAPI[0]
-			c.Rpk.KafkaAPI.Brokers = []string{net.JoinHostPort(b0.Address, strconv.Itoa(b0.Port))}
-			c.Rpk.AdminAPI.Addresses = []string{"127.0.0.1:9644"}
 
 			conf, err := new(config.Params).Load(fs)
 			require.NoError(st, err)
diff --git a/tests/rptest/tests/rpk_config_test.py b/tests/rptest/tests/rpk_config_test.py
index 78711b6f8c4a0..c18d529697d52 100644
--- a/tests/rptest/tests/rpk_config_test.py
+++ b/tests/rptest/tests/rpk_config_test.py
@@ -53,15 +53,9 @@ def test_config_init(self):
           port: 9644
     developer_mode: true
 rpk:
-    admin_api:
-        addresses:
-            - 127.0.0.1:9644
     coredump_dir: /var/lib/redpanda/coredump
     enable_memory_locking: false
     enable_usage_stats: false
-    kafka_api:
-        brokers:
-            - 0.0.0.0:9092    
     overprovisioned: false
     tune_aio_events: false
     tune_ballast_file: false
@@ -182,9 +176,6 @@ def test_config_set_json(self):
         rpk.config_set(key, value, format='json')
 
         expected_config = yaml.full_load('''
-admin_api:
-    addresses:
-        - 127.0.0.1:9644
 coredump_dir: /var/lib/redpanda/coredump
 enable_memory_locking: false
 enable_usage_stats: false  
@@ -210,12 +201,6 @@ def test_config_set_json(self):
             with open(os.path.join(d, 'redpanda.yaml')) as f:
                 actual_config = yaml.full_load(f.read())
 
-                assert actual_config['rpk']['kafka_api'] is not None
-
-                # Delete 'kafka_api' so they can be compared since the
-                # brokers change depending on the container it's running
-                del actual_config['rpk']['kafka_api']
-
                 if actual_config['rpk'] != expected_config:
                     self.logger.error("Configs differ")
                     self.logger.error(

From d1b376934cd61d0dd9a7123c68d21df5757e2e68 Mon Sep 17 00:00:00 2001
From: Rogger Vasquez <rvasque3@gmail.com>
Date: Thu, 14 Jul 2022 11:47:36 -0500
Subject: [PATCH 111/201] rpk: use file or default in redpanda mode

---
 src/go/rpk/pkg/cli/cmd/redpanda/mode.go      | 1 +
 src/go/rpk/pkg/cli/cmd/redpanda/mode_test.go | 4 ----
 tests/rptest/tests/rpk_config_test.py        | 6 ------
 3 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/mode.go b/src/go/rpk/pkg/cli/cmd/redpanda/mode.go
index 4433f0948390d..a487737ecfd1f 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/mode.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/mode.go
@@ -56,6 +56,7 @@ func executeMode(fs afero.Fs, cmd *cobra.Command, mode string) error {
 	if err != nil {
 		return fmt.Errorf("unable to load config: %v", err)
 	}
+	cfg = cfg.FileOrDefaults() // we modify fields in the raw file without writing env / flag overrides
 	cfg, err = config.SetMode(mode, cfg)
 	if err != nil {
 		return err
diff --git a/src/go/rpk/pkg/cli/cmd/redpanda/mode_test.go b/src/go/rpk/pkg/cli/cmd/redpanda/mode_test.go
index 5d556bffc1329..85eec5c677384 100644
--- a/src/go/rpk/pkg/cli/cmd/redpanda/mode_test.go
+++ b/src/go/rpk/pkg/cli/cmd/redpanda/mode_test.go
@@ -42,10 +42,6 @@ func fillRpkConfig(path, mode string) *config.Config {
 		Overprovisioned:    !val,
 		TuneBallastFile:    val,
 	}
-	// Unset defaults that get added after command execution, needed to compare
-	// expected config with loaded config.
-	conf.Rpk.KafkaAPI = config.RpkKafkaAPI{Brokers: []string{"0.0.0.0:9092"}}
-	conf.Rpk.AdminAPI = config.RpkAdminAPI{Addresses: []string{"127.0.0.1:9644"}}
 	return conf
 }
 
diff --git a/tests/rptest/tests/rpk_config_test.py b/tests/rptest/tests/rpk_config_test.py
index c18d529697d52..ab42007b1431d 100644
--- a/tests/rptest/tests/rpk_config_test.py
+++ b/tests/rptest/tests/rpk_config_test.py
@@ -254,12 +254,6 @@ def test_config_change_mode_prod(self):
             with open(os.path.join(d, 'redpanda.yaml')) as f:
                 actual_config = yaml.full_load(f.read())
 
-                # Delete 'admin_api' and 'kafka_api' since they are not
-                # needed for this test and the brokers change depending
-                # on the container it's running.
-                del actual_config['rpk']['kafka_api']
-                del actual_config['rpk']['admin_api']
-
                 if actual_config['rpk'] != expected_config:
                     self.logger.error("Configs differ")
                     self.logger.error(

From 83f2ae59d8b7a40807a0f9d857c21a1f2fb3d6b9 Mon Sep 17 00:00:00 2001
From: Rogger Vasquez <rvasque3@gmail.com>
Date: Mon, 11 Jul 2022 17:13:15 -0500
Subject: [PATCH 112/201] rpk: add license commands (set and info)

This enables querying the admin API for license
information and setting a new license.
---
 src/go/rpk/pkg/api/admin/api_features.go      | 21 ++++++
 src/go/rpk/pkg/cli/cmd/cluster.go             | 13 ++--
 .../rpk/pkg/cli/cmd/cluster/license/info.go   | 68 ++++++++++++++++++
 .../pkg/cli/cmd/cluster/license/license.go    | 44 ++++++++++++
 src/go/rpk/pkg/cli/cmd/cluster/license/set.go | 71 +++++++++++++++++++
 5 files changed, 212 insertions(+), 5 deletions(-)
 create mode 100644 src/go/rpk/pkg/cli/cmd/cluster/license/info.go
 create mode 100644 src/go/rpk/pkg/cli/cmd/cluster/license/license.go
 create mode 100644 src/go/rpk/pkg/cli/cmd/cluster/license/set.go

diff --git a/src/go/rpk/pkg/api/admin/api_features.go b/src/go/rpk/pkg/api/admin/api_features.go
index 27ba6578c244b..15cce28e4a0f2 100644
--- a/src/go/rpk/pkg/api/admin/api_features.go
+++ b/src/go/rpk/pkg/api/admin/api_features.go
@@ -38,6 +38,18 @@ type FeaturesResponse struct {
 	Features       []Feature `json:"features"`
 }
 
+type License struct {
+	Loaded     bool              `json:"loaded"`
+	Properties LicenseProperties `json:"license"`
+}
+
+type LicenseProperties struct {
+	Version      int    `json:"format_version"`
+	Organization string `json:"org"`
+	Type         string `json:"type"`
+	Expires      int    `json:"expires"`
+}
+
 // GetFeatures returns information about the available features.
 func (a *AdminAPI) GetFeatures(ctx context.Context) (FeaturesResponse, error) {
 	var features FeaturesResponse
@@ -48,3 +60,12 @@ func (a *AdminAPI) GetFeatures(ctx context.Context) (FeaturesResponse, error) {
 		nil,
 		&features)
 }
+
+func (a *AdminAPI) GetLicenseInfo(ctx context.Context) (License, error) {
+	var license License
+	return license, a.sendAny(ctx, http.MethodGet, "/v1/features/license", nil, &license)
+}
+
+func (a *AdminAPI) SetLicense(ctx context.Context, license interface{}) error {
+	return a.sendToLeader(ctx, http.MethodPut, "/v1/features/license", license, nil)
+}
diff --git a/src/go/rpk/pkg/cli/cmd/cluster.go b/src/go/rpk/pkg/cli/cmd/cluster.go
index a3be6165a55b8..1c11c28ad7a22 100644
--- a/src/go/rpk/pkg/cli/cmd/cluster.go
+++ b/src/go/rpk/pkg/cli/cmd/cluster.go
@@ -12,6 +12,7 @@ package cmd
 import (
 	"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster"
 	"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster/config"
+	"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster/license"
 	"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/cluster/maintenance"
 	"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/common"
 	"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/group"
@@ -54,11 +55,13 @@ func NewClusterCommand(fs afero.Fs) *cobra.Command {
 	offsets.Deprecated = "replaced by 'rpk group describe'"
 	offsets.Hidden = true
 	offsets.Use = "offsets"
-	command.AddCommand(offsets)
-
-	command.AddCommand(config.NewConfigCommand(fs))
-	command.AddCommand(maintenance.NewMaintenanceCommand(fs))
-	command.AddCommand(cluster.NewHealthOverviewCommand(fs))
+	command.AddCommand(
+		cluster.NewHealthOverviewCommand(fs),
+		config.NewConfigCommand(fs),
+		license.NewLicenseCommand(fs),
+		maintenance.NewMaintenanceCommand(fs),
+		offsets,
+	)
 
 	return command
 }
diff --git a/src/go/rpk/pkg/cli/cmd/cluster/license/info.go b/src/go/rpk/pkg/cli/cmd/cluster/license/info.go
new file mode 100644
index 0000000000000..03153aa33f1e7
--- /dev/null
+++ b/src/go/rpk/pkg/cli/cmd/cluster/license/info.go
@@ -0,0 +1,68 @@
+package license
+
+import (
+	"encoding/json"
+	"fmt"
+
+	"github.com/redpanda-data/redpanda/src/go/rpk/pkg/api/admin"
+	"github.com/redpanda-data/redpanda/src/go/rpk/pkg/config"
+	"github.com/redpanda-data/redpanda/src/go/rpk/pkg/out"
+	"github.com/spf13/afero"
+	"github.com/spf13/cobra"
+)
+
+func newInfoCommand(fs afero.Fs) *cobra.Command {
+	var format string
+	command := &cobra.Command{
+		Use:   "info",
+		Args:  cobra.ExactArgs(0),
+		Short: "Retrieve license information",
+		Long: `Retrieve license information:
+
+    Organization:    Organization the license was generated for.
+    Type:            Type of license: free, enterprise, etc.
+    Expires:         Number of days the license is valid until or -1 if is expired.
+    Version:         License schema version.
+`,
+		Run: func(cmd *cobra.Command, args []string) {
+			p := config.ParamsFromCommand(cmd)
+			cfg, err := p.Load(fs)
+			out.MaybeDie(err, "unable to load config: %v", err)
+
+			cl, err := admin.NewClient(fs, cfg)
+			out.MaybeDie(err, "unable to initialize admin client: %v", err)
+
+			info, err := cl.GetLicenseInfo(cmd.Context())
+			out.MaybeDie(err, "unable to retrieve license info: %v", err)
+
+			if !info.Loaded {
+				out.Die("this cluster is missing a license")
+			}
+
+			if info.Properties != (admin.LicenseProperties{}) {
+				if format == "json" {
+					props, err := json.MarshalIndent(info.Properties, "", "  ")
+					out.MaybeDie(err, "unable to print license information as json: %v", err)
+					fmt.Printf("%s\n", props)
+				} else {
+					printLicenseInfo(info.Properties)
+				}
+			} else {
+				out.Die("no license loaded")
+			}
+		},
+	}
+
+	command.Flags().StringVar(&format, "format", "text", "Output format (text, json)")
+	return command
+}
+
+func printLicenseInfo(p admin.LicenseProperties) {
+	out.Section("LICENSE INFORMATION")
+	licenseFormat := `Organization:    %v
+Type:            %v
+Expires:         %v days
+Version:         %v
+`
+	fmt.Printf(licenseFormat, p.Organization, p.Type, p.Expires, p.Version)
+}
diff --git a/src/go/rpk/pkg/cli/cmd/cluster/license/license.go b/src/go/rpk/pkg/cli/cmd/cluster/license/license.go
new file mode 100644
index 0000000000000..7f5d02cd6196f
--- /dev/null
+++ b/src/go/rpk/pkg/cli/cmd/cluster/license/license.go
@@ -0,0 +1,44 @@
+package license
+
+import (
+	"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/cmd/common"
+	"github.com/redpanda-data/redpanda/src/go/rpk/pkg/config"
+	"github.com/spf13/afero"
+	"github.com/spf13/cobra"
+)
+
+func NewLicenseCommand(fs afero.Fs) *cobra.Command {
+	var (
+		adminURL       string
+		adminEnableTLS bool
+		adminCertFile  string
+		adminKeyFile   string
+		adminCAFile    string
+	)
+
+	cmd := &cobra.Command{
+		Use:   "license",
+		Args:  cobra.ExactArgs(0),
+		Short: "Manage cluster license.",
+	}
+
+	common.AddAdminAPITLSFlags(cmd,
+		&adminEnableTLS,
+		&adminCertFile,
+		&adminKeyFile,
+		&adminCAFile,
+	)
+
+	cmd.AddCommand(
+		newInfoCommand(fs),
+		newSetCommand(fs),
+	)
+
+	cmd.PersistentFlags().StringVar(
+		&adminURL,
+		config.FlagAdminHosts2,
+		"",
+		"Comma-separated list of admin API addresses (<IP>:<port>)")
+
+	return cmd
+}
diff --git a/src/go/rpk/pkg/cli/cmd/cluster/license/set.go b/src/go/rpk/pkg/cli/cmd/cluster/license/set.go
new file mode 100644
index 0000000000000..8a8b0ade048cf
--- /dev/null
+++ b/src/go/rpk/pkg/cli/cmd/cluster/license/set.go
@@ -0,0 +1,71 @@
+package license
+
+import (
+	"fmt"
+	"io"
+	"strings"
+
+	"github.com/redpanda-data/redpanda/src/go/rpk/pkg/api/admin"
+	"github.com/redpanda-data/redpanda/src/go/rpk/pkg/config"
+	"github.com/redpanda-data/redpanda/src/go/rpk/pkg/out"
+	"github.com/spf13/afero"
+	"github.com/spf13/cobra"
+)
+
+func newSetCommand(fs afero.Fs) *cobra.Command {
+	var licPath string
+	cmd := &cobra.Command{
+		Use:   "set",
+		Args:  cobra.MaximumNArgs(1),
+		Short: "Upload license to the cluster",
+		Long: `Upload license to the cluster
+
+You can either provide a path to a file containing the license:
+
+    rpk cluster license set --path /home/organization/redpanda.license
+
+Or inline the license string:
+
+    rpk cluster license set <license string>
+
+If neither are present, rpk will look for the license in the
+default location '/etc/redpanda/redpanda.license'.
+`,
+
+		Run: func(cmd *cobra.Command, args []string) {
+			if licPath != "" && len(args) > 0 {
+				out.Die("inline license cannot be passed if flag '--path' is set")
+			}
+			if licPath == "" && len(args) == 0 {
+				fmt.Println("Neither license file nor inline license was provided, checking '/etc/redpanda/redpanda.license'.")
+				licPath = "/etc/redpanda/redpanda.license"
+			}
+
+			p := config.ParamsFromCommand(cmd)
+			cfg, err := p.Load(fs)
+			out.MaybeDie(err, "unable to load config: %v", err)
+
+			cl, err := admin.NewClient(fs, cfg)
+			out.MaybeDie(err, "unable to initialize admin client: %v", err)
+
+			var r io.Reader
+			if licPath != "" {
+				file, err := fs.Open(licPath)
+				out.MaybeDie(err, "unable to open %q: %v", licPath, err)
+				defer file.Close()
+				r = file
+			} else {
+				r = strings.NewReader(args[0])
+			}
+
+			err = cl.SetLicense(cmd.Context(), r)
+			out.MaybeDie(err, "unable to set license: %v", err)
+
+			fmt.Println("Successfully uploaded license.")
+		},
+	}
+
+	cmd.Flags().StringVar(&licPath, "path", "", "Path to the license file")
+
+	return cmd
+}

From 6d88cbc2a7eb9b526ba2f11657a1afef9960c003 Mon Sep 17 00:00:00 2001
From: Rogger Vasquez <rvasque3@gmail.com>
Date: Thu, 14 Jul 2022 11:56:19 -0500
Subject: [PATCH 113/201] tests: add rpk cluster license ducktape tests

---
 tests/rptest/clients/rpk.py            | 23 ++++++++
 tests/rptest/tests/rpk_cluster_test.py | 76 ++++++++++++++++++++++++++
 tests/rptest/util.py                   | 12 ++++
 3 files changed, 111 insertions(+)

diff --git a/tests/rptest/clients/rpk.py b/tests/rptest/clients/rpk.py
index feb0be56fde81..c6c266ac89041 100644
--- a/tests/rptest/clients/rpk.py
+++ b/tests/rptest/clients/rpk.py
@@ -713,3 +713,26 @@ def cluster_metadata_id(self):
             return None
         else:
             return lines[2]
+
+    def license_set(self, path, license=""):
+        cmd = [
+            self._rpk_binary(), "--api-urls",
+            self._admin_host(), "cluster", "license", "set"
+        ]
+
+        if license:
+            cmd += [license]
+        if path:
+            cmd += ["--path", path]
+
+        return self._execute(cmd)
+
+    def license_info(self):
+
+        cmd = [
+            self._rpk_binary(), "--api-urls",
+            self._admin_host(), "cluster", "license", "info", "--format",
+            "json"
+        ]
+
+        return self._execute(cmd)
diff --git a/tests/rptest/tests/rpk_cluster_test.py b/tests/rptest/tests/rpk_cluster_test.py
index 4df7b9be69cca..a2d6f2ea30a8b 100644
--- a/tests/rptest/tests/rpk_cluster_test.py
+++ b/tests/rptest/tests/rpk_cluster_test.py
@@ -9,11 +9,14 @@
 
 import os
 import re
+import datetime
+import tempfile
 import zipfile
 import json
 
 from rptest.services.cluster import cluster
 from rptest.services.redpanda import RESTART_LOG_ALLOW_LIST
+from rptest.util import expect_exception, get_cluster_license
 from ducktape.utils.util import wait_until
 
 from rptest.tests.redpanda_test import RedpandaTest
@@ -184,3 +187,76 @@ def test_cluster_down(self):
             pass
         else:
             assert False, f"Unexpected success: '{r}'"
+
+    @cluster(num_nodes=3)
+    def test_upload_and_query_cluster_license_rpk(self):
+        """
+        Test uploading and retrieval of license via rpk
+        using --path option
+        """
+        license = get_cluster_license()
+        if license is None:
+            self.logger.info(
+                "Skipping test, REDPANDA_SAMPLE_LICENSE env var not found")
+            return
+
+        with tempfile.NamedTemporaryFile() as tf:
+            tf.write(bytes(license, 'UTF-8'))
+            tf.seek(0)
+            output = self._rpk.license_set(tf.name)
+            assert "Successfully uploaded license" in output
+
+        def get_license():
+            output = self._rpk.license_info()
+            resp = json.loads(output)
+            if resp['org'] == "redpanda-testing":
+                return True
+
+            return False
+
+        wait_until(get_license,
+                   timeout_sec=10,
+                   backoff_sec=1,
+                   retry_on_exc=True,
+                   err_msg="unable to retrieve license information")
+
+        expected_license = {
+            'expires':
+            (datetime.date(2122, 6, 6) - datetime.date.today()).days,
+            'format_version': 0,
+            'org': 'redpanda-testing',
+            'type': 'enterprise'
+        }
+        output = self._rpk.license_info()
+        assert expected_license == json.loads(output)
+
+    @cluster(num_nodes=3)
+    def test_upload_cluster_license_rpk(self):
+        """
+        Test uploading of license via rpk
+        using inline license option
+        """
+        license = get_cluster_license()
+        if license is None:
+            self.logger.info(
+                "Skipping test, REDPANDA_SAMPLE_LICENSE env var not found")
+            return
+
+        output = self._rpk.license_set("", license)
+        assert "Successfully uploaded license" in output
+
+    @cluster(num_nodes=3)
+    def test_upload_cluster_license_error(self):
+        with expect_exception(RpkException,
+                              lambda e: "Internal Server Error" in str(e)):
+            license = get_cluster_license()
+            if license is None:
+                self.logger.info(
+                    "Skipping test, REDPANDA_SAMPLE_LICENSE env var not found")
+                return
+
+            with tempfile.NamedTemporaryFile() as tf:
+                tf.write(bytes(license + 'r', 'UTF-8'))
+                tf.seek(0)
+
+                self._rpk.license_set(tf.name)
diff --git a/tests/rptest/util.py b/tests/rptest/util.py
index 0049cda0cc86c..5dc2cca9c34f5 100644
--- a/tests/rptest/util.py
+++ b/tests/rptest/util.py
@@ -215,6 +215,18 @@ def inject_remote_script(node, script_name):
     return remote_path
 
 
+def get_cluster_license():
+    license = os.environ.get("REDPANDA_SAMPLE_LICENSE", None)
+    if license is None:
+        is_ci = os.environ.get("CI", "false")
+        if is_ci == "true":
+            raise RuntimeError(
+                "Expected REDPANDA_SAMPLE_LICENSE variable to be set in this environment"
+            )
+
+    return license
+
+
 class firewall_blocked:
     """Temporary firewall barrier that isolates set of redpanda
     nodes from the ip-address"""

From 137f95cb61aba650d72213b8f71267b90e0c5588 Mon Sep 17 00:00:00 2001
From: Rogger Vasquez <rvasque3@gmail.com>
Date: Wed, 13 Jul 2022 16:52:31 -0500
Subject: [PATCH 114/201] rpk: print errors in configChecker failure

ConfigFileChecker was ignoring the error, so the
user would only get:

 Error: System check 'Config file valid' failed.
 Required: true, Current false

With this change we are now appending the errors.
---
 src/go/rpk/pkg/tuners/redpanda_checkers.go | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/go/rpk/pkg/tuners/redpanda_checkers.go b/src/go/rpk/pkg/tuners/redpanda_checkers.go
index f176a637dfdff..b907b94b6d40b 100644
--- a/src/go/rpk/pkg/tuners/redpanda_checkers.go
+++ b/src/go/rpk/pkg/tuners/redpanda_checkers.go
@@ -13,8 +13,10 @@ package tuners
 
 import (
 	"errors"
+	"fmt"
 	"time"
 
+	"github.com/hashicorp/go-multierror"
 	"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cloud"
 	"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cloud/gcp"
 	"github.com/redpanda-data/redpanda/src/go/rpk/pkg/config"
@@ -71,8 +73,14 @@ func NewConfigChecker(conf *config.Config) Checker {
 		Fatal,
 		true,
 		func() (interface{}, error) {
-			ok, _ := conf.Check()
-			return ok, nil
+			ok, errs := conf.Check()
+			var err error
+			if len(errs) > 0 {
+				s := multierror.ListFormatFunc(errs)
+				err = fmt.Errorf("config file checker error: %v", s)
+			}
+
+			return ok, err
 		})
 }
 

From 6d050959bac938e023225df76eecab8f5436004f Mon Sep 17 00:00:00 2001
From: Rogger Vasquez <rvasque3@gmail.com>
Date: Wed, 13 Jul 2022 17:10:36 -0500
Subject: [PATCH 115/201] rpk: improve error handling in tuner checks

---
 src/go/rpk/pkg/tuners/check.go | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/go/rpk/pkg/tuners/check.go b/src/go/rpk/pkg/tuners/check.go
index 453018698b854..f2f9eaff1e217 100644
--- a/src/go/rpk/pkg/tuners/check.go
+++ b/src/go/rpk/pkg/tuners/check.go
@@ -10,6 +10,7 @@
 package tuners
 
 import (
+	"fmt"
 	"path/filepath"
 	"sort"
 	"time"
@@ -32,14 +33,15 @@ func Check(
 
 	for _, checkers := range checkersMap {
 		for _, c := range checkers {
+			log.Debugf("Starting checker %q", c.GetDesc())
 			result := c.Check()
 			if result.Err != nil {
 				if c.GetSeverity() == Fatal {
-					return results, result.Err
+					return results, fmt.Errorf("fatal error during checker %q execution: %v", c.GetDesc(), result.Err)
 				}
-				log.Warnf("System check '%s' failed with non-fatal error '%s'", c.GetDesc(), result.Err)
+				fmt.Printf("System check %q failed with non-fatal error %q\n", c.GetDesc(), result.Err)
 			}
-			log.Debugf("Checker '%s' result %+v", c.GetDesc(), result)
+			log.Debugf("Finished checker %q; result %+v", c.GetDesc(), result)
 			results = append(results, *result)
 		}
 	}

From a3a330d2f8a61eed3986a6b19e51ac2157f58be5 Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Thu, 14 Jul 2022 19:26:03 +0100
Subject: [PATCH 116/201] controller/probe: Reduce lifecycle

The probe requires access to members, topics, and partition_leaders.

* Start the probe after all dependencies have been started
* Stop the probe before its dependencies are stopped

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/cluster/controller.cc       |  4 ++-
 src/v/cluster/controller_probe.cc | 45 ++++++++++++++++++-------------
 src/v/cluster/controller_probe.h  |  5 ++++
 3 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/src/v/cluster/controller.cc b/src/v/cluster/controller.cc
index 71cf181ecb74a..508429df43abe 100644
--- a/src/v/cluster/controller.cc
+++ b/src/v/cluster/controller.cc
@@ -92,7 +92,8 @@ ss::future<> controller::wire_up() {
           return _authorizer.start(
             []() { return config::shard_local_cfg().superusers.bind(); });
       })
-      .then([this] { return _tp_state.start(); });
+      .then([this] { return _tp_state.start(); })
+      .then([this] { _probe.start(); });
 }
 
 ss::future<> controller::start() {
@@ -377,6 +378,7 @@ ss::future<> controller::stop() {
         f = shutdown_input();
     }
 
+    _probe.stop();
     return f.then([this] {
         auto stop_leader_balancer = _leader_balancer ? _leader_balancer->stop()
                                                      : ss::now();
diff --git a/src/v/cluster/controller_probe.cc b/src/v/cluster/controller_probe.cc
index 833d76e47d22c..e930c239decd9 100644
--- a/src/v/cluster/controller_probe.cc
+++ b/src/v/cluster/controller_probe.cc
@@ -24,24 +24,33 @@
 namespace cluster {
 
 controller_probe::controller_probe(controller& c) noexcept
-  : _controller(c) {
-    _controller._raft_manager.local().register_leadership_notification(
-      [this](
-        raft::group_id group,
-        model::term_id /*term*/,
-        std::optional<model::node_id> leader_id) {
-          // We are only interested in notifications regarding the controller
-          // group.
-          if (_controller._raft0->group() != group) {
-              return;
-          }
-
-          if (leader_id != _controller.self()) {
-              _public_metrics.reset();
-          } else {
-              setup_metrics();
-          }
-      });
+  : _controller(c)
+  , _leadership_notification_handle{} {}
+
+void controller_probe::start() {
+    _leadership_notification_handle
+      = _controller._raft_manager.local().register_leadership_notification(
+        [this](
+          raft::group_id group,
+          model::term_id /*term*/,
+          std::optional<model::node_id> leader_id) {
+            // We are only interested in notifications regarding the controller
+            // group.
+            if (_controller._raft0->group() != group) {
+                return;
+            }
+
+            if (leader_id != _controller.self()) {
+                _public_metrics.reset();
+            } else {
+                setup_metrics();
+            }
+        });
+}
+
+void controller_probe::stop() {
+    _controller._raft_manager.local().unregister_leadership_notification(
+      _leadership_notification_handle);
 }
 
 void controller_probe::setup_metrics() {
diff --git a/src/v/cluster/controller_probe.h b/src/v/cluster/controller_probe.h
index 781f2de1ff68c..1688fb82496fd 100644
--- a/src/v/cluster/controller_probe.h
+++ b/src/v/cluster/controller_probe.h
@@ -12,6 +12,7 @@
 #pragma once
 
 #include "cluster/fwd.h"
+#include "cluster/types.h"
 #include "seastarx.h"
 
 #include <seastar/core/metrics_registration.hh>
@@ -22,11 +23,15 @@ class controller_probe {
 public:
     explicit controller_probe(cluster::controller&) noexcept;
 
+    void start();
+    void stop();
+
     void setup_metrics();
 
 private:
     cluster::controller& _controller;
     std::unique_ptr<ss::metrics::metric_groups> _public_metrics;
+    cluster::notification_id_type _leadership_notification_handle;
 };
 
 } // namespace cluster

From 984f1d3ece95476e0b8d4dd77d5a0e2daa717e85 Mon Sep 17 00:00:00 2001
From: NyaliaLui <nyalia@redpanda.com>
Date: Thu, 14 Jul 2022 16:01:33 -0400
Subject: [PATCH 117/201] Revert "build(deps): bump jackson-databind in
 /tests/java/e2e-verifiers"

This reverts commit d4aeba6ca8af737f22c70bc7de32b7fd07d6db7c.
---
 tests/java/e2e-verifiers/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/java/e2e-verifiers/pom.xml b/tests/java/e2e-verifiers/pom.xml
index abb1ab621d904..bd4841c852566 100644
--- a/tests/java/e2e-verifiers/pom.xml
+++ b/tests/java/e2e-verifiers/pom.xml
@@ -17,7 +17,7 @@
         <argparse4j.version>0.8.1</argparse4j.version>
         <slf4j.version>1.7.30</slf4j.version>
         <buildDir>${project.basedir}/target</buildDir>
-        <jackson.version>2.13.2.1</jackson.version>
+        <jackson.version>2.13.1</jackson.version>
         <log4j.version>1.2.17</log4j.version>
     </properties>
     <dependencies>

From eea0de57e7714748e353694addb3c8a22ce8cb24 Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Wed, 29 Jun 2022 15:19:11 -0700
Subject: [PATCH 118/201] Clarify behavior of process_next_response

The behavior of process_next_response is worth clarifying as the
returned future does not necessarily wait for all enqueued responses
to be finished before resolving.

We also rename the method to better reflect its purpose.
---
 src/v/kafka/server/connection_context.cc | 17 +++++++++++++++--
 src/v/kafka/server/connection_context.h  | 14 +++++++++++++-
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/src/v/kafka/server/connection_context.cc b/src/v/kafka/server/connection_context.cc
index 4847f601013bd..bffc5dd8a9d3a 100644
--- a/src/v/kafka/server/connection_context.cc
+++ b/src/v/kafka/server/connection_context.cc
@@ -367,7 +367,7 @@ connection_context::dispatch_method_once(request_header hdr, size_t size) {
                                               response_ptr r) mutable {
                                   r->set_correlation(correlation);
                                   _responses.insert({seq, std::move(r)});
-                                  return process_next_response();
+                                  return maybe_process_responses();
                               });
                           })
                           .handle_exception([self](std::exception_ptr e) {
@@ -410,7 +410,20 @@ connection_context::dispatch_method_once(request_header hdr, size_t size) {
     });
 }
 
-ss::future<> connection_context::process_next_response() {
+/**
+ * This method processes as many responses as possible, in request order. Since
+ * we proces the second stage asynchronously within a given connection, reponses
+ * may become ready out of order, but Kafka clients expect responses exactly in
+ * request order.
+ *
+ * The _responses queue handles that: responses are enqueued there in completion
+ * order, but only sent to the client in response order. So this method, called
+ * after every response is ready, may end up sending zero, one or more requests,
+ * depending on the completion order.
+ *
+ * @return ss::future<>
+ */
+ss::future<> connection_context::maybe_process_responses() {
     return ss::repeat([this]() mutable {
         auto it = _responses.find(_next_response);
         if (it == _responses.end()) {
diff --git a/src/v/kafka/server/connection_context.h b/src/v/kafka/server/connection_context.h
index 4276f8f9da652..5445480c72533 100644
--- a/src/v/kafka/server/connection_context.h
+++ b/src/v/kafka/server/connection_context.h
@@ -166,7 +166,19 @@ class connection_context final
 
     ss::future<> handle_mtls_auth();
     ss::future<> dispatch_method_once(request_header, size_t sz);
-    ss::future<> process_next_response();
+
+    /**
+     * Process zero or more ready responses in request order.
+     *
+     * The future<> returned by this method resolves when all ready *and*
+     * in-order responses have been processed, which is not the same as all
+     * ready responses. In particular, responses which are ready may not be
+     * processed if there are earlier (lower sequence number) responses which
+     * are not yet ready: they will be processed by a future invocation.
+     *
+     * @return ss::future<> a future which as described above.
+     */
+    ss::future<> maybe_process_responses();
     ss::future<> do_process(request_context);
 
     ss::future<> handle_auth_v0(size_t);

From 5f5c7c6beb636af516dd1df6d1031c8e51b12ebe Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Wed, 29 Jun 2022 15:55:44 -0700
Subject: [PATCH 119/201] Add comment to session resources

---
 src/v/kafka/server/connection_context.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/v/kafka/server/connection_context.h b/src/v/kafka/server/connection_context.h
index 5445480c72533..12ef73e40d18d 100644
--- a/src/v/kafka/server/connection_context.h
+++ b/src/v/kafka/server/connection_context.h
@@ -148,7 +148,14 @@ class connection_context final
     private:
         net::server_probe& _probe;
     };
-    // used to pass around some internal state
+
+    // Used to hold resources associated with a given request until
+    // the response has been send, as well as to track some statistics
+    // about the request.
+    //
+    // The resources in particular should be not be destroyed until
+    // the request is complete (e.g., all the information written to
+    // the socket so that no userspace buffers remain).
     struct session_resources {
         ss::lowres_clock::duration backpressure_delay;
         ss::semaphore_units<> memlocks;

From d2bac6cef4a959beda691dbc3bf3c79733ccfa50 Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Wed, 29 Jun 2022 16:57:29 -0700
Subject: [PATCH 120/201] Release resources after the response is written

Currently we release resources after the response is enqueued
in connection_context and response processing is called. However, the
response may not have been sent at that point: we require in-order
responses, but second-stage processing may complete out of order.

In this change, we instead tunnel the resource object through to
the place where the response is written, and release it there.

Fixes #5278.
---
 src/v/kafka/server/connection_context.cc | 47 +++++++++++++++---------
 src/v/kafka/server/connection_context.h  | 10 ++++-
 2 files changed, 39 insertions(+), 18 deletions(-)

diff --git a/src/v/kafka/server/connection_context.cc b/src/v/kafka/server/connection_context.cc
index bffc5dd8a9d3a..d046f5cbfdb0e 100644
--- a/src/v/kafka/server/connection_context.cc
+++ b/src/v/kafka/server/connection_context.cc
@@ -333,7 +333,7 @@ connection_context::dispatch_method_once(request_header hdr, size_t size) {
                                seq,
                                correlation,
                                self,
-                               s = std::move(sres)](ss::future<> d) mutable {
+                               sres = std::move(sres)](ss::future<> d) mutable {
                     /*
                      * if the dispatch/first stage failed, then we need to
                      * need to consume the second stage since it might be
@@ -362,13 +362,22 @@ connection_context::dispatch_method_once(request_header hdr, size_t size) {
                     ssx::background
                       = ssx::spawn_with_gate_then(
                           _rs.conn_gate(),
-                          [this, f = std::move(f), seq, correlation]() mutable {
-                              return f.then([this, seq, correlation](
-                                              response_ptr r) mutable {
-                                  r->set_correlation(correlation);
-                                  _responses.insert({seq, std::move(r)});
-                                  return maybe_process_responses();
-                              });
+                          [this,
+                           f = std::move(f),
+                           sres = std::move(sres),
+                           seq,
+                           correlation]() mutable {
+                              return f.then(
+                                [this,
+                                 sres = std::move(sres),
+                                 seq,
+                                 correlation](response_ptr r) mutable {
+                                    r->set_correlation(correlation);
+                                    response_and_resources randr{
+                                      std::move(r), std::move(sres)};
+                                    _responses.insert({seq, std::move(randr)});
+                                    return maybe_process_responses();
+                                });
                           })
                           .handle_exception([self](std::exception_ptr e) {
                               // ssx::spawn_with_gate already caught
@@ -397,8 +406,7 @@ connection_context::dispatch_method_once(request_header hdr, size_t size) {
 
                               self->_rs.probe().service_error();
                               self->_rs.conn->shutdown_input();
-                          })
-                          .finally([s = std::move(s), self] {});
+                          });
                     return d;
                 })
                 .handle_exception([self](std::exception_ptr e) {
@@ -433,20 +441,25 @@ ss::future<> connection_context::maybe_process_responses() {
         // found one; increment counter
         _next_response = _next_response + sequence_id(1);
 
-        auto r = std::move(it->second);
+        auto resp_and_res = std::move(it->second);
+
         _responses.erase(it);
 
-        if (r->is_noop()) {
+        if (resp_and_res.response->is_noop()) {
             return ss::make_ready_future<ss::stop_iteration>(
               ss::stop_iteration::no);
         }
 
-        auto msg = response_as_scattered(std::move(r));
+        auto msg = response_as_scattered(std::move(resp_and_res.response));
         try {
-            return _rs.conn->write(std::move(msg)).then([] {
-                return ss::make_ready_future<ss::stop_iteration>(
-                  ss::stop_iteration::no);
-            });
+            return _rs.conn->write(std::move(msg))
+              .then([] {
+                  return ss::make_ready_future<ss::stop_iteration>(
+                    ss::stop_iteration::no);
+              })
+              // release the resources only once it has been written to the
+              // connection.
+              .finally([resources = std::move(resp_and_res.resources)] {});
         } catch (...) {
             vlog(
               klog.debug,
diff --git a/src/v/kafka/server/connection_context.h b/src/v/kafka/server/connection_context.h
index 12ef73e40d18d..e0b5766827cfe 100644
--- a/src/v/kafka/server/connection_context.h
+++ b/src/v/kafka/server/connection_context.h
@@ -191,8 +191,16 @@ class connection_context final
     ss::future<> handle_auth_v0(size_t);
 
 private:
+    /**
+     * Bundles together a response and its associated resources.
+     */
+    struct response_and_resources {
+        response_ptr response;
+        session_resources resources;
+    };
+
     using sequence_id = named_type<uint64_t, struct kafka_protocol_sequence>;
-    using map_t = absl::flat_hash_map<sequence_id, response_ptr>;
+    using map_t = absl::flat_hash_map<sequence_id, response_and_resources>;
 
     class ctx_log {
     public:

From 5def3b733178657ee5ee9bb99e4020e16848516f Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Wed, 29 Jun 2022 20:40:06 -0700
Subject: [PATCH 121/201] Improve documentation of throttle related methods

---
 src/v/kafka/server/connection_context.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/v/kafka/server/connection_context.h b/src/v/kafka/server/connection_context.h
index e0b5766827cfe..9de753f27b243 100644
--- a/src/v/kafka/server/connection_context.h
+++ b/src/v/kafka/server/connection_context.h
@@ -164,10 +164,18 @@ class connection_context final
         std::unique_ptr<request_tracker> tracker;
     };
 
-    /// called by throttle_request
+    // Reserve units from memory from the memory semaphore in proportion
+    // to the number of bytes the request procesisng is expected to
+    // take.
     ss::future<ss::semaphore_units<>> reserve_request_units(size_t size);
 
-    /// apply correct backpressure sequence
+    // Apply backpressure sequence, where the request processing may be
+    // delayed for various reasons, including throttling but also because
+    // too few server resources are available to accomodate the request
+    // currently.
+    // When the returned future resolves, the throttling period is over and
+    // the associated resouces have been obtained and are tracked by the
+    // contained session_resources object.
     ss::future<session_resources>
     throttle_request(const request_header&, size_t sz);
 

From 79fe3a55f701515e16741da370d704b97b41ad3a Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Mon, 4 Jul 2022 17:59:33 -0700
Subject: [PATCH 122/201] Move max_api_key function to handlers header.

This lets us share it with the request processing code which
would also like to do type list based manipulation of the
request types.
---
 src/v/kafka/server/flex_versions.cc    | 7 -------
 src/v/kafka/server/handlers/handlers.h | 7 +++++++
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/v/kafka/server/flex_versions.cc b/src/v/kafka/server/flex_versions.cc
index 1a3565b41a715..b0e6bd2cbea21 100644
--- a/src/v/kafka/server/flex_versions.cc
+++ b/src/v/kafka/server/flex_versions.cc
@@ -17,13 +17,6 @@ namespace kafka {
 /// requests will map to a value of api_key(-2)
 static constexpr api_version invalid_api = api_version(-2);
 
-template<typename... RequestTypes>
-static constexpr size_t max_api_key(type_list<RequestTypes...>) {
-    /// Black magic here is an overload of std::max() that takes an
-    /// std::initializer_list
-    return std::max({RequestTypes::api::key()...});
-}
-
 template<typename... RequestTypes>
 static constexpr auto
 get_flexible_request_min_versions_list(type_list<RequestTypes...> r) {
diff --git a/src/v/kafka/server/handlers/handlers.h b/src/v/kafka/server/handlers/handlers.h
index 80cb2d22ebfbf..d98c4d0885f01 100644
--- a/src/v/kafka/server/handlers/handlers.h
+++ b/src/v/kafka/server/handlers/handlers.h
@@ -87,4 +87,11 @@ using request_types = make_request_types<
   end_txn_handler,
   create_partitions_handler,
   offset_for_leader_epoch_handler>;
+
+template<typename... RequestTypes>
+static constexpr size_t max_api_key(type_list<RequestTypes...>) {
+    /// Black magic here is an overload of std::max() that takes an
+    /// std::initializer_list
+    return std::max({RequestTypes::api::key()...});
+}
 } // namespace kafka

From 2e4dd222dc452a02c2ebdde0374162d0c2a673ff Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Mon, 4 Jul 2022 22:51:25 -0700
Subject: [PATCH 123/201] Introduce KafkaApiHandlerAny concept

We already had concepts for one-phase and two-phase handlers,
and this concept is simply the union of those two handler
concepts, i.e., "any" type of handler.
---
 src/v/kafka/server/handlers/handler.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/v/kafka/server/handlers/handler.h b/src/v/kafka/server/handlers/handler.h
index a4b87a7545863..8d289536344e6 100644
--- a/src/v/kafka/server/handlers/handler.h
+++ b/src/v/kafka/server/handlers/handler.h
@@ -45,4 +45,7 @@ concept KafkaApiTwoPhaseHandler = KafkaApi<typename T::api> && requires(
     { T::handle(std::move(ctx), g) } -> std::same_as<process_result_stages>;
 };
 
+template<typename T>
+concept KafkaApiHandlerAny = KafkaApiHandler<T> || KafkaApiTwoPhaseHandler<T>;
+
 } // namespace kafka

From 0c8a6e94f516cd5cdd88b3e1a5a72243172fcae1 Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Thu, 14 Jul 2022 11:29:31 -0700
Subject: [PATCH 124/201] Rename handler to single_stage_handler

We wish to reclaim the name handler for the generic handler interface
introduced in the next change.
---
 src/v/kafka/server/handlers/add_offsets_to_txn.h        | 3 ++-
 src/v/kafka/server/handlers/add_partitions_to_txn.h     | 3 ++-
 src/v/kafka/server/handlers/alter_configs.h             | 2 +-
 src/v/kafka/server/handlers/api_versions.h              | 3 ++-
 src/v/kafka/server/handlers/create_acls.h               | 2 +-
 src/v/kafka/server/handlers/create_partitions.h         | 3 ++-
 src/v/kafka/server/handlers/create_topics.h             | 2 +-
 src/v/kafka/server/handlers/delete_acls.h               | 2 +-
 src/v/kafka/server/handlers/delete_groups.h             | 2 +-
 src/v/kafka/server/handlers/delete_topics.h             | 2 +-
 src/v/kafka/server/handlers/describe_acls.h             | 2 +-
 src/v/kafka/server/handlers/describe_configs.h          | 3 ++-
 src/v/kafka/server/handlers/describe_groups.h           | 2 +-
 src/v/kafka/server/handlers/describe_log_dirs.h         | 3 ++-
 src/v/kafka/server/handlers/end_txn.h                   | 2 +-
 src/v/kafka/server/handlers/fetch.h                     | 2 +-
 src/v/kafka/server/handlers/find_coordinator.h          | 3 ++-
 src/v/kafka/server/handlers/handler.h                   | 2 +-
 src/v/kafka/server/handlers/heartbeat.h                 | 2 +-
 src/v/kafka/server/handlers/incremental_alter_configs.h | 2 +-
 src/v/kafka/server/handlers/init_producer_id.h          | 3 ++-
 src/v/kafka/server/handlers/leave_group.h               | 2 +-
 src/v/kafka/server/handlers/list_groups.h               | 2 +-
 src/v/kafka/server/handlers/list_offsets.h              | 2 +-
 src/v/kafka/server/handlers/metadata.h                  | 2 +-
 src/v/kafka/server/handlers/offset_fetch.h              | 2 +-
 src/v/kafka/server/handlers/offset_for_leader_epoch.h   | 2 +-
 src/v/kafka/server/handlers/sasl_authenticate.h         | 3 ++-
 src/v/kafka/server/handlers/sasl_handshake.h            | 2 +-
 src/v/kafka/server/handlers/txn_offset_commit.h         | 3 ++-
 30 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/src/v/kafka/server/handlers/add_offsets_to_txn.h b/src/v/kafka/server/handlers/add_offsets_to_txn.h
index e4b1669b970a7..fbc9fc1324e24 100644
--- a/src/v/kafka/server/handlers/add_offsets_to_txn.h
+++ b/src/v/kafka/server/handlers/add_offsets_to_txn.h
@@ -14,6 +14,7 @@
 
 namespace kafka {
 
-using add_offsets_to_txn_handler = handler<add_offsets_to_txn_api, 0, 1>;
+using add_offsets_to_txn_handler
+  = single_stage_handler<add_offsets_to_txn_api, 0, 1>;
 
 }
diff --git a/src/v/kafka/server/handlers/add_partitions_to_txn.h b/src/v/kafka/server/handlers/add_partitions_to_txn.h
index 5b0f2523b4f36..aee85586deee2 100644
--- a/src/v/kafka/server/handlers/add_partitions_to_txn.h
+++ b/src/v/kafka/server/handlers/add_partitions_to_txn.h
@@ -14,6 +14,7 @@
 
 namespace kafka {
 
-using add_partitions_to_txn_handler = handler<add_partitions_to_txn_api, 0, 2>;
+using add_partitions_to_txn_handler
+  = single_stage_handler<add_partitions_to_txn_api, 0, 2>;
 
 }
diff --git a/src/v/kafka/server/handlers/alter_configs.h b/src/v/kafka/server/handlers/alter_configs.h
index 7edcf2f987ad4..d61eb8c3f472d 100644
--- a/src/v/kafka/server/handlers/alter_configs.h
+++ b/src/v/kafka/server/handlers/alter_configs.h
@@ -14,6 +14,6 @@
 
 namespace kafka {
 
-using alter_configs_handler = handler<alter_configs_api, 0, 1>;
+using alter_configs_handler = single_stage_handler<alter_configs_api, 0, 1>;
 
 }
diff --git a/src/v/kafka/server/handlers/api_versions.h b/src/v/kafka/server/handlers/api_versions.h
index c131036d3f2a0..7ab8921561ea8 100644
--- a/src/v/kafka/server/handlers/api_versions.h
+++ b/src/v/kafka/server/handlers/api_versions.h
@@ -14,7 +14,8 @@
 
 namespace kafka {
 
-struct api_versions_handler : public handler<api_versions_api, 0, 3> {
+struct api_versions_handler
+  : public single_stage_handler<api_versions_api, 0, 3> {
     static constexpr api_version min_flexible = api_version(3);
 
     static ss::future<response_ptr>
diff --git a/src/v/kafka/server/handlers/create_acls.h b/src/v/kafka/server/handlers/create_acls.h
index e9719121d6dfc..d9a6161b71a1a 100644
--- a/src/v/kafka/server/handlers/create_acls.h
+++ b/src/v/kafka/server/handlers/create_acls.h
@@ -14,6 +14,6 @@
 
 namespace kafka {
 
-using create_acls_handler = handler<create_acls_api, 0, 1>;
+using create_acls_handler = single_stage_handler<create_acls_api, 0, 1>;
 
 }
diff --git a/src/v/kafka/server/handlers/create_partitions.h b/src/v/kafka/server/handlers/create_partitions.h
index 4102398e8d8bd..16b1dcc9de27c 100644
--- a/src/v/kafka/server/handlers/create_partitions.h
+++ b/src/v/kafka/server/handlers/create_partitions.h
@@ -14,6 +14,7 @@
 
 namespace kafka {
 
-using create_partitions_handler = handler<create_partitions_api, 0, 1>;
+using create_partitions_handler
+  = single_stage_handler<create_partitions_api, 0, 1>;
 
 }
diff --git a/src/v/kafka/server/handlers/create_topics.h b/src/v/kafka/server/handlers/create_topics.h
index e3c3958584566..2ab493358dd02 100644
--- a/src/v/kafka/server/handlers/create_topics.h
+++ b/src/v/kafka/server/handlers/create_topics.h
@@ -14,6 +14,6 @@
 
 namespace kafka {
 
-using create_topics_handler = handler<create_topics_api, 0, 5>;
+using create_topics_handler = single_stage_handler<create_topics_api, 0, 5>;
 
 }
diff --git a/src/v/kafka/server/handlers/delete_acls.h b/src/v/kafka/server/handlers/delete_acls.h
index d19ab798cf467..8e45cc5679fa6 100644
--- a/src/v/kafka/server/handlers/delete_acls.h
+++ b/src/v/kafka/server/handlers/delete_acls.h
@@ -14,6 +14,6 @@
 
 namespace kafka {
 
-using delete_acls_handler = handler<delete_acls_api, 0, 1>;
+using delete_acls_handler = single_stage_handler<delete_acls_api, 0, 1>;
 
 }
diff --git a/src/v/kafka/server/handlers/delete_groups.h b/src/v/kafka/server/handlers/delete_groups.h
index 85d01eb8c29bd..d9858140b83a1 100644
--- a/src/v/kafka/server/handlers/delete_groups.h
+++ b/src/v/kafka/server/handlers/delete_groups.h
@@ -14,6 +14,6 @@
 
 namespace kafka {
 
-using delete_groups_handler = handler<delete_groups_api, 0, 1>;
+using delete_groups_handler = single_stage_handler<delete_groups_api, 0, 1>;
 
 }
diff --git a/src/v/kafka/server/handlers/delete_topics.h b/src/v/kafka/server/handlers/delete_topics.h
index 7dd8d66d2f157..e9b6606cbe004 100644
--- a/src/v/kafka/server/handlers/delete_topics.h
+++ b/src/v/kafka/server/handlers/delete_topics.h
@@ -14,6 +14,6 @@
 
 namespace kafka {
 
-using delete_topics_handler = handler<delete_topics_api, 0, 3>;
+using delete_topics_handler = single_stage_handler<delete_topics_api, 0, 3>;
 
 }
diff --git a/src/v/kafka/server/handlers/describe_acls.h b/src/v/kafka/server/handlers/describe_acls.h
index 3377ac8a28582..996c6fa230aad 100644
--- a/src/v/kafka/server/handlers/describe_acls.h
+++ b/src/v/kafka/server/handlers/describe_acls.h
@@ -14,6 +14,6 @@
 
 namespace kafka {
 
-using describe_acls_handler = handler<describe_acls_api, 0, 1>;
+using describe_acls_handler = single_stage_handler<describe_acls_api, 0, 1>;
 
 }
diff --git a/src/v/kafka/server/handlers/describe_configs.h b/src/v/kafka/server/handlers/describe_configs.h
index 27f7235dc762a..97199e628b559 100644
--- a/src/v/kafka/server/handlers/describe_configs.h
+++ b/src/v/kafka/server/handlers/describe_configs.h
@@ -14,6 +14,7 @@
 
 namespace kafka {
 
-using describe_configs_handler = handler<describe_configs_api, 0, 2>;
+using describe_configs_handler
+  = single_stage_handler<describe_configs_api, 0, 2>;
 
 }
diff --git a/src/v/kafka/server/handlers/describe_groups.h b/src/v/kafka/server/handlers/describe_groups.h
index 6f804548b8937..b62004ae9fa60 100644
--- a/src/v/kafka/server/handlers/describe_groups.h
+++ b/src/v/kafka/server/handlers/describe_groups.h
@@ -14,6 +14,6 @@
 
 namespace kafka {
 
-using describe_groups_handler = handler<describe_groups_api, 0, 4>;
+using describe_groups_handler = single_stage_handler<describe_groups_api, 0, 4>;
 
 }
diff --git a/src/v/kafka/server/handlers/describe_log_dirs.h b/src/v/kafka/server/handlers/describe_log_dirs.h
index 13d11c440ad73..1731e88621a92 100644
--- a/src/v/kafka/server/handlers/describe_log_dirs.h
+++ b/src/v/kafka/server/handlers/describe_log_dirs.h
@@ -14,6 +14,7 @@
 
 namespace kafka {
 
-using describe_log_dirs_handler = handler<describe_log_dirs_api, 0, 1>;
+using describe_log_dirs_handler
+  = single_stage_handler<describe_log_dirs_api, 0, 1>;
 
 }
diff --git a/src/v/kafka/server/handlers/end_txn.h b/src/v/kafka/server/handlers/end_txn.h
index 72362cb00fed0..cd80b0d41c255 100644
--- a/src/v/kafka/server/handlers/end_txn.h
+++ b/src/v/kafka/server/handlers/end_txn.h
@@ -14,6 +14,6 @@
 
 namespace kafka {
 
-using end_txn_handler = handler<end_txn_api, 0, 2>;
+using end_txn_handler = single_stage_handler<end_txn_api, 0, 2>;
 
 }
diff --git a/src/v/kafka/server/handlers/fetch.h b/src/v/kafka/server/handlers/fetch.h
index d43b4e9b0b33b..8f44d89f1451d 100644
--- a/src/v/kafka/server/handlers/fetch.h
+++ b/src/v/kafka/server/handlers/fetch.h
@@ -17,7 +17,7 @@
 
 namespace kafka {
 
-using fetch_handler = handler<fetch_api, 4, 11>;
+using fetch_handler = single_stage_handler<fetch_api, 4, 11>;
 
 /*
  * Fetch operation context
diff --git a/src/v/kafka/server/handlers/find_coordinator.h b/src/v/kafka/server/handlers/find_coordinator.h
index 8e3d83bfe4d67..1f5ff07fb97f4 100644
--- a/src/v/kafka/server/handlers/find_coordinator.h
+++ b/src/v/kafka/server/handlers/find_coordinator.h
@@ -14,6 +14,7 @@
 
 namespace kafka {
 
-using find_coordinator_handler = handler<find_coordinator_api, 0, 2>;
+using find_coordinator_handler
+  = single_stage_handler<find_coordinator_api, 0, 2>;
 
 } // namespace kafka
diff --git a/src/v/kafka/server/handlers/handler.h b/src/v/kafka/server/handlers/handler.h
index 8d289536344e6..8895026c4e32c 100644
--- a/src/v/kafka/server/handlers/handler.h
+++ b/src/v/kafka/server/handlers/handler.h
@@ -22,7 +22,7 @@ template<
   typename RequestApi,
   api_version::type MinSupported,
   api_version::type MaxSupported>
-struct handler {
+struct single_stage_handler {
     using api = RequestApi;
     static constexpr api_version min_supported = api_version(MinSupported);
     static constexpr api_version max_supported = api_version(MaxSupported);
diff --git a/src/v/kafka/server/handlers/heartbeat.h b/src/v/kafka/server/handlers/heartbeat.h
index 27a4c22b1cbd9..437279760a549 100644
--- a/src/v/kafka/server/handlers/heartbeat.h
+++ b/src/v/kafka/server/handlers/heartbeat.h
@@ -14,6 +14,6 @@
 
 namespace kafka {
 
-using heartbeat_handler = handler<heartbeat_api, 0, 3>;
+using heartbeat_handler = single_stage_handler<heartbeat_api, 0, 3>;
 
 }
diff --git a/src/v/kafka/server/handlers/incremental_alter_configs.h b/src/v/kafka/server/handlers/incremental_alter_configs.h
index 8e902f5da6b36..9dbfde6be92e6 100644
--- a/src/v/kafka/server/handlers/incremental_alter_configs.h
+++ b/src/v/kafka/server/handlers/incremental_alter_configs.h
@@ -15,6 +15,6 @@
 namespace kafka {
 
 using incremental_alter_configs_handler
-  = handler<incremental_alter_configs_api, 0, 0>;
+  = single_stage_handler<incremental_alter_configs_api, 0, 0>;
 
 }
diff --git a/src/v/kafka/server/handlers/init_producer_id.h b/src/v/kafka/server/handlers/init_producer_id.h
index 5068f4325684b..be4c7d0a080a7 100644
--- a/src/v/kafka/server/handlers/init_producer_id.h
+++ b/src/v/kafka/server/handlers/init_producer_id.h
@@ -14,6 +14,7 @@
 
 namespace kafka {
 
-using init_producer_id_handler = handler<init_producer_id_api, 0, 1>;
+using init_producer_id_handler
+  = single_stage_handler<init_producer_id_api, 0, 1>;
 
 }
diff --git a/src/v/kafka/server/handlers/leave_group.h b/src/v/kafka/server/handlers/leave_group.h
index a959c6dc4ddd3..61adf1450dec7 100644
--- a/src/v/kafka/server/handlers/leave_group.h
+++ b/src/v/kafka/server/handlers/leave_group.h
@@ -14,6 +14,6 @@
 
 namespace kafka {
 
-using leave_group_handler = handler<leave_group_api, 0, 3>;
+using leave_group_handler = single_stage_handler<leave_group_api, 0, 3>;
 
 }
diff --git a/src/v/kafka/server/handlers/list_groups.h b/src/v/kafka/server/handlers/list_groups.h
index efe1657ae0827..b345f794a0e99 100644
--- a/src/v/kafka/server/handlers/list_groups.h
+++ b/src/v/kafka/server/handlers/list_groups.h
@@ -14,6 +14,6 @@
 
 namespace kafka {
 
-using list_groups_handler = handler<list_groups_api, 0, 2>;
+using list_groups_handler = single_stage_handler<list_groups_api, 0, 2>;
 
 }
diff --git a/src/v/kafka/server/handlers/list_offsets.h b/src/v/kafka/server/handlers/list_offsets.h
index 896d0344b42aa..bb88af1b1a7e0 100644
--- a/src/v/kafka/server/handlers/list_offsets.h
+++ b/src/v/kafka/server/handlers/list_offsets.h
@@ -14,6 +14,6 @@
 
 namespace kafka {
 
-using list_offsets_handler = handler<list_offsets_api, 0, 4>;
+using list_offsets_handler = single_stage_handler<list_offsets_api, 0, 4>;
 
 }
diff --git a/src/v/kafka/server/handlers/metadata.h b/src/v/kafka/server/handlers/metadata.h
index 89445b193fd0f..8d2336218e5b8 100644
--- a/src/v/kafka/server/handlers/metadata.h
+++ b/src/v/kafka/server/handlers/metadata.h
@@ -14,6 +14,6 @@
 
 namespace kafka {
 
-using metadata_handler = handler<metadata_api, 0, 7>;
+using metadata_handler = single_stage_handler<metadata_api, 0, 7>;
 
 }
diff --git a/src/v/kafka/server/handlers/offset_fetch.h b/src/v/kafka/server/handlers/offset_fetch.h
index 5cb87438b02ba..64ff13891db5e 100644
--- a/src/v/kafka/server/handlers/offset_fetch.h
+++ b/src/v/kafka/server/handlers/offset_fetch.h
@@ -17,6 +17,6 @@ namespace kafka {
 // in version 0 kafka stores offsets in zookeeper. if we ever need to
 // support version 0 then we need to do some code review to see if this has
 // any implications on semantics.
-using offset_fetch_handler = handler<offset_fetch_api, 1, 7>;
+using offset_fetch_handler = single_stage_handler<offset_fetch_api, 1, 7>;
 
 } // namespace kafka
diff --git a/src/v/kafka/server/handlers/offset_for_leader_epoch.h b/src/v/kafka/server/handlers/offset_for_leader_epoch.h
index e191aabf2a4c1..0c0e047b10a64 100644
--- a/src/v/kafka/server/handlers/offset_for_leader_epoch.h
+++ b/src/v/kafka/server/handlers/offset_for_leader_epoch.h
@@ -15,5 +15,5 @@
 namespace kafka {
 
 using offset_for_leader_epoch_handler
-  = handler<offset_for_leader_epoch_api, 0, 3>;
+  = single_stage_handler<offset_for_leader_epoch_api, 0, 3>;
 }
diff --git a/src/v/kafka/server/handlers/sasl_authenticate.h b/src/v/kafka/server/handlers/sasl_authenticate.h
index d86e3152223c4..5165e094db17a 100644
--- a/src/v/kafka/server/handlers/sasl_authenticate.h
+++ b/src/v/kafka/server/handlers/sasl_authenticate.h
@@ -14,6 +14,7 @@
 
 namespace kafka {
 
-using sasl_authenticate_handler = handler<sasl_authenticate_api, 0, 1>;
+using sasl_authenticate_handler
+  = single_stage_handler<sasl_authenticate_api, 0, 1>;
 
 }
diff --git a/src/v/kafka/server/handlers/sasl_handshake.h b/src/v/kafka/server/handlers/sasl_handshake.h
index d5a9343f939bc..7d5e5a3867f2a 100644
--- a/src/v/kafka/server/handlers/sasl_handshake.h
+++ b/src/v/kafka/server/handlers/sasl_handshake.h
@@ -14,6 +14,6 @@
 
 namespace kafka {
 
-using sasl_handshake_handler = handler<sasl_handshake_api, 0, 1>;
+using sasl_handshake_handler = single_stage_handler<sasl_handshake_api, 0, 1>;
 
 }
diff --git a/src/v/kafka/server/handlers/txn_offset_commit.h b/src/v/kafka/server/handlers/txn_offset_commit.h
index dcb1fc5786182..c7cebe25954fe 100644
--- a/src/v/kafka/server/handlers/txn_offset_commit.h
+++ b/src/v/kafka/server/handlers/txn_offset_commit.h
@@ -14,6 +14,7 @@
 
 namespace kafka {
 
-using txn_offset_commit_handler = handler<txn_offset_commit_api, 0, 3>;
+using txn_offset_commit_handler
+  = single_stage_handler<txn_offset_commit_api, 0, 3>;
 
 }

From d533df6f0526b3583bc0a8ca774b176f629584ea Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Mon, 4 Jul 2022 22:59:02 -0700
Subject: [PATCH 125/201] Introduce type-erased handler interface

This is a polymorphic handler class (abstract class with virtual
methods), as well as concrete instantiations of the interface for each
of the existing handlers, which can be looked up by API key.

This lets us treat handlers generically without resorting to template
functions which must be specialized for each handler type.

This reduces code bloat significantly as we do not duplicate code paths
for our ~45 handler types.

For example, requests.cc.o drops from ~11 MB to ~5 MB after it is
switched to the any_handler approach.
---
 src/v/kafka/CMakeLists.txt                    |   1 +
 .../server/handlers/handler_interface.cc      | 126 ++++++++++++++++++
 .../kafka/server/handlers/handler_interface.h |  91 +++++++++++++
 3 files changed, 218 insertions(+)
 create mode 100644 src/v/kafka/server/handlers/handler_interface.cc
 create mode 100644 src/v/kafka/server/handlers/handler_interface.h

diff --git a/src/v/kafka/CMakeLists.txt b/src/v/kafka/CMakeLists.txt
index 6e8da95e85afc..067fc72b9cd00 100644
--- a/src/v/kafka/CMakeLists.txt
+++ b/src/v/kafka/CMakeLists.txt
@@ -34,6 +34,7 @@ set(handlers_srcs
   server/handlers/delete_acls.cc
   server/handlers/create_partitions.cc
   server/handlers/offset_for_leader_epoch.cc
+  server/handlers/handler_interface.cc
   server/handlers/topics/types.cc
   server/handlers/topics/topic_utils.cc
 )
diff --git a/src/v/kafka/server/handlers/handler_interface.cc b/src/v/kafka/server/handlers/handler_interface.cc
new file mode 100644
index 0000000000000..140271e41cf53
--- /dev/null
+++ b/src/v/kafka/server/handlers/handler_interface.cc
@@ -0,0 +1,126 @@
+/*
+ * Copyright 2022 Redpanda Data, Inc.
+ *
+ * Use of this software is governed by the Business Source License
+ * included in the file licenses/BSL.md
+ *
+ * As of the Change Date specified in that file, in accordance with
+ * the Business Source License, use of this software will be governed
+ * by the Apache License, Version 2.0
+ */
+#include "kafka/server/handlers/handler_interface.h"
+
+#include "kafka/server/handlers/handlers.h"
+#include "kafka/server/handlers/produce.h"
+#include "kafka/types.h"
+
+#include <optional>
+
+namespace kafka {
+
+/**
+ * @brief Packages together basic information common to every handler.
+ */
+struct handler_info {
+    handler_info(
+      api_key key,
+      const char* name,
+      api_version min_api,
+      api_version max_api) noexcept
+      : _key(key)
+      , _name(name)
+      , _min_api(min_api)
+      , _max_api(max_api) {}
+
+    api_key _key;
+    const char* _name;
+    api_version _min_api, _max_api;
+};
+
+/**
+ * @brief Creates a type-erased handler implementation given info and a handle
+ * method.
+ *
+ * There are only two variants of this handler, for one and two pass
+ * implementations.
+ * This keeps the generated code duplication to a minimum, compared to
+ * templating this on the handler type.
+ *
+ * @tparam is_two_pass true if the handler is two-pass
+ */
+template<bool is_two_pass>
+struct handler_base final : public handler_interface {
+    using single_pass_handler
+      = ss::future<response_ptr>(request_context, ss::smp_service_group);
+    using two_pass_handler
+      = process_result_stages(request_context, ss::smp_service_group);
+    using fn_type
+      = std::conditional_t<is_two_pass, two_pass_handler, single_pass_handler>;
+
+    handler_base(const handler_info& info, fn_type* handle_fn) noexcept
+      : _info(info)
+      , _handle_fn(handle_fn) {}
+
+    api_version min_supported() const override { return _info._min_api; }
+    api_version max_supported() const override { return _info._max_api; }
+
+    api_key key() const override { return _info._key; }
+    const char* name() const override { return _info._name; }
+
+    /**
+     * Only handle varies with one or two pass, since one pass handlers
+     * must pass through single_stage() to covert them to two-pass.
+     */
+    process_result_stages
+    handle(request_context&& rc, ss::smp_service_group g) const override {
+        if constexpr (is_two_pass) {
+            return _handle_fn(std::move(rc), g);
+        } else {
+            return process_result_stages::single_stage(
+              _handle_fn(std::move(rc), g));
+        }
+    }
+
+private:
+    handler_info _info;
+    fn_type* _handle_fn;
+};
+
+/**
+ * @brief Instance holder for the handler_base.
+ *
+ * Given a handler type H, exposes a static instance of the associated handler
+ * base object.
+ *
+ * @tparam H the handler type.
+ */
+template<KafkaApiHandlerAny H>
+struct handler_holder {
+    static const inline handler_base<KafkaApiTwoPhaseHandler<H>> instance{
+      handler_info{
+        H::api::key, H::api::name, H::min_supported, H::max_supported},
+      H::handle};
+};
+
+template<typename... Ts>
+constexpr auto make_lut(type_list<Ts...>) {
+    constexpr int max_index = std::max({Ts::api::key...});
+    static_assert(max_index < sizeof...(Ts) * 10, "LUT is too sparse");
+
+    std::array<handler, max_index + 1> lut{};
+    ((lut[Ts::api::key] = &handler_holder<Ts>::instance), ...);
+
+    return lut;
+}
+
+std::optional<handler> handler_for_key(kafka::api_key key) {
+    static constexpr auto lut = make_lut(request_types{});
+    if (key >= (short)0 && key < (short)lut.size()) {
+        if (auto handler = lut[key]) {
+            return handler;
+        }
+    }
+    return std::nullopt;
+}
+
+} // namespace kafka
diff --git a/src/v/kafka/server/handlers/handler_interface.h b/src/v/kafka/server/handlers/handler_interface.h
new file mode 100644
index 0000000000000..a88b49490d75e
--- /dev/null
+++ b/src/v/kafka/server/handlers/handler_interface.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2022 Redpanda Data, Inc.
+ *
+ * Use of this software is governed by the Business Source License
+ * included in the file licenses/BSL.md
+ *
+ * As of the Change Date specified in that file, in accordance with
+ * the Business Source License, use of this software will be governed
+ * by the Apache License, Version 2.0
+ */
+#pragma once
+#include "kafka/server/fwd.h"
+#include "kafka/server/response.h"
+#include "kafka/types.h"
+
+namespace kafka {
+/**
+ * @brief Runtime polymorphic handler type.
+ *
+ * Allows access to all kafka request handling implementations through a
+ * type erased interface. This avoids the need to bring every handler
+ * type into scope and make everything that touches the handler a template
+ * function on the handler type.
+ *
+ */
+struct handler_interface {
+    /**
+     * @brief The minimum supported API version, inclusive.
+     */
+    virtual api_version min_supported() const = 0;
+
+    /**
+     * @brief The maximum supported API version, inclusive.
+     */
+    virtual api_version max_supported() const = 0;
+
+    /**
+     * @brief The name of the API method.
+     */
+    virtual const char* name() const = 0;
+
+    /**
+     * @brief The API key associated with the method.
+     */
+    virtual api_key key() const = 0;
+
+    /**
+     * @brief Handles the request.
+     *
+     * Invokes the request handler with the given request context
+     * (which will be moved from) and smp_service_groups.
+     *
+     * The result stages object contains futures for both the initial
+     * dispatch phase and the final response. For API methods which
+     * are implemented in a single phase, the same type is returned, but
+     * the response future will complete as soon as the dispatch one does.
+     *
+     * @return process_result_stages representing the future completion of
+     * the handler.
+     */
+    virtual process_result_stages
+    handle(request_context&&, ss::smp_service_group) const = 0;
+
+    virtual ~handler_interface() = default;
+};
+
+/**
+ * @brief Pointer to a handler.
+ *
+ * Most code will use handler objects, which are simply pointers
+ * to handlers, generally const objects with static storage duration
+ * obtained from handler_for_key.
+ */
+using handler = const handler_interface*;
+
+/**
+ * @brief Return a handler for the given key, if any.
+ *
+ * Returns a pointer to a constant singleton handler for the given
+ * key, or an empty optional if no such handler exists. The contained
+ * handler is guaranteed to be non-null if the optional has a value.
+ *
+ * This method looks up the handler in a table populated by all handlers
+ * in kafka::request_types.
+ *
+ * @param key the API key for the handler
+ * @return std::optional<handler> the handler, if any
+ */
+std::optional<handler> handler_for_key(api_key key);
+
+} // namespace kafka

From f80e99db75186dcbbb5f1f17ae7ae964a2efcb70 Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Mon, 4 Jul 2022 23:15:37 -0700
Subject: [PATCH 126/201] Add handler_interface unit tests

The handler_interface already gets good functional coverage as it is
added to the core request path in requests.cc, but we also include
this unit test with basic coverage.
---
 src/v/kafka/server/tests/CMakeLists.txt       |  4 +-
 .../server/tests/handler_interface_test.cc    | 49 +++++++++++++++++++
 2 files changed, 52 insertions(+), 1 deletion(-)
 create mode 100644 src/v/kafka/server/tests/handler_interface_test.cc

diff --git a/src/v/kafka/server/tests/CMakeLists.txt b/src/v/kafka/server/tests/CMakeLists.txt
index 3630ba7e10ff6..08298dec60459 100644
--- a/src/v/kafka/server/tests/CMakeLists.txt
+++ b/src/v/kafka/server/tests/CMakeLists.txt
@@ -8,11 +8,13 @@ rp_test(
     timeouts_conversion_test.cc
     types_conversion_tests.cc
     topic_utils_test.cc
+    handler_interface_test.cc
   DEFINITIONS BOOST_TEST_DYN_LINK
-  LIBRARIES Boost::unit_test_framework v::kafka
+  LIBRARIES Boost::unit_test_framework v::kafka v::coproc
   LABELS kafka
 )
 
+
 set(srcs
   consumer_groups_test.cc
   member_test.cc
diff --git a/src/v/kafka/server/tests/handler_interface_test.cc b/src/v/kafka/server/tests/handler_interface_test.cc
new file mode 100644
index 0000000000000..ab0a011f2805d
--- /dev/null
+++ b/src/v/kafka/server/tests/handler_interface_test.cc
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2022 Redpanda Data, Inc.
+ *
+ * Use of this software is governed by the Business Source License
+ * included in the file licenses/BSL.md
+ *
+ * As of the Change Date specified in that file, in accordance with
+ * the Business Source License, use of this software will be governed
+ * by the Apache License, Version 2.0
+ */
+#include "kafka/server/handlers/handler_interface.h"
+#include "kafka/server/handlers/handlers.h"
+
+#include <boost/test/unit_test.hpp>
+
+template<kafka::KafkaApiHandlerAny H>
+void check_any_vs_static() {
+    BOOST_TEST_INFO("Testing " << H::api::name);
+    auto hopt = kafka::handler_for_key(H::api::key);
+    BOOST_REQUIRE(hopt.has_value());
+    auto h = *hopt;
+    BOOST_CHECK_EQUAL(h->min_supported(), H::min_supported);
+    BOOST_CHECK_EQUAL(h->max_supported(), H::max_supported);
+    BOOST_CHECK_EQUAL(h->key(), H::api::key);
+    BOOST_CHECK_EQUAL(h->name(), H::api::name);
+}
+
+template<typename... Ts>
+void check_all_types(kafka::type_list<Ts...>) {
+    (check_any_vs_static<Ts>(), ...);
+}
+
+BOOST_AUTO_TEST_CASE(handler_all_types) {
+    check_all_types(kafka::request_types{});
+}
+
+BOOST_AUTO_TEST_CASE(handler_handler_for_key) {
+    // key too low
+    BOOST_CHECK(!kafka::handler_for_key(kafka::api_key(-1)).has_value());
+    // key too high
+    const auto max_key = kafka::max_api_key(kafka::request_types{});
+    BOOST_CHECK(
+      !kafka::handler_for_key(kafka::api_key(max_key + 1)).has_value());
+    // last key should be present
+    BOOST_CHECK(kafka::handler_for_key(kafka::api_key(max_key)).has_value());
+    // 34 is AlterReplicaLogDirs which we don't currently support, use it as a
+    // test case for handlers which fall in the valid range but we don't support
+    BOOST_CHECK(!kafka::handler_for_key(kafka::api_key(34)).has_value());
+}

From 12d43793d343d264c282523a3e93dfba2189ce38 Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Tue, 5 Jul 2022 11:31:08 -0700
Subject: [PATCH 127/201] Update request handling to use polymorphic handler

Preceding changes in this series introduced a runtime polymorphic
handler class, and this change switches most of the request handling
to use it.

In particular, we replace the large switch on API key, which dispatched
to a template method, with a lookup of the handler and virtual
dispatch.

Some handlers that need special processing like the authentication/SASL
related ones still use the old approach for now.
---
 src/v/kafka/server/requests.cc | 168 ++++++++++-----------------------
 1 file changed, 50 insertions(+), 118 deletions(-)

diff --git a/src/v/kafka/server/requests.cc b/src/v/kafka/server/requests.cc
index 8aec87556a3ed..5d5fcbd0b7daf 100644
--- a/src/v/kafka/server/requests.cc
+++ b/src/v/kafka/server/requests.cc
@@ -7,8 +7,13 @@
 // the Business Source License, use of this software will be governed
 // by the Apache License, Version 2.0
 
-#include "kafka/server/handlers/handlers.h"
-#include "kafka/server/handlers/produce.h"
+#include "kafka/protocol/schemata/api_versions_request.h"
+#include "kafka/protocol/schemata/fetch_request.h"
+#include "kafka/protocol/schemata/produce_request.h"
+#include "kafka/server/handlers/api_versions.h"
+#include "kafka/server/handlers/handler_interface.h"
+#include "kafka/server/handlers/sasl_authenticate.h"
+#include "kafka/server/handlers/sasl_handshake.h"
 #include "kafka/server/request_context.h"
 #include "kafka/types.h"
 #include "utils/to_string.h"
@@ -33,53 +38,6 @@ struct process_dispatch { // clang-format on
     }
 };
 
-/**
- * api_versions request processed in one stage however this template
- * specialization exists so that the return value of the request can be examined
- * by the connection layer.
- */
-template<>
-struct process_dispatch<api_versions_handler> {
-    static process_result_stages
-    process(request_context&& ctx, ss::smp_service_group g) {
-        return process_result_stages::single_stage(
-          api_versions_handler::handle(std::move(ctx), g));
-    }
-};
-
-/**
- * Requests processed in two stages
- */
-template<>
-struct process_dispatch<produce_handler> {
-    static process_result_stages
-    process(request_context&& ctx, ss::smp_service_group g) {
-        return produce_handler::handle(std::move(ctx), g);
-    }
-};
-
-template<>
-struct process_dispatch<offset_commit_handler> {
-    static process_result_stages
-    process(request_context&& ctx, ss::smp_service_group g) {
-        return offset_commit_handler::handle(std::move(ctx), g);
-    }
-};
-template<>
-struct process_dispatch<join_group_handler> {
-    static process_result_stages
-    process(request_context&& ctx, ss::smp_service_group g) {
-        return join_group_handler::handle(std::move(ctx), g);
-    }
-};
-template<>
-struct process_dispatch<sync_group_handler> {
-    static process_result_stages
-    process(request_context&& ctx, ss::smp_service_group g) {
-        return sync_group_handler::handle(std::move(ctx), g);
-    }
-};
-
 class kafka_api_version_not_supported_exception : public std::runtime_error {
 public:
     explicit kafka_api_version_not_supported_exception(const std::string& m)
@@ -121,6 +79,35 @@ requires(KafkaApiHandler<Request> || KafkaApiTwoPhaseHandler<Request>)
     return process_dispatch<Request>::process(std::move(ctx), g);
 }
 
+process_result_stages process_generic(
+  handler handler, request_context&& ctx, ss::smp_service_group g) {
+    vlog(
+      klog.trace,
+      "[{}:{}] processing name:{}, key:{}, version:{} for {}",
+      ctx.connection()->client_host(),
+      ctx.connection()->client_port(),
+      handler->name(),
+      ctx.header().key,
+      ctx.header().version,
+      ctx.header().client_id.value_or(std::string_view("unset-client-id")));
+
+    // We do a version check for most API requests, but skip it for
+    // api_versions requests: the client does not yet know what versions
+    // this server supports when it sends one, since the api_versions
+    // request is exactly what a client uses to query this
+    // information.
+    if (ctx.header().key != api_versions_api::key &&
+      (ctx.header().version < handler->min_supported() ||
+       ctx.header().version > handler->max_supported())) {
+        throw std::runtime_error(fmt::format(
+          "Unsupported version {} for {} API",
+          ctx.header().version,
+          handler->name()));
+    }
+
+    return handler->handle(std::move(ctx), g);
+}
+
 class kafka_authentication_exception : public std::runtime_error {
 public:
     explicit kafka_authentication_exception(const std::string& m)
@@ -161,7 +148,7 @@ handle_auth_handshake(request_context&& ctx, ss::smp_service_group g) {
 static ss::future<response_ptr>
 handle_auth_initial(request_context&& ctx, ss::smp_service_group g) {
     switch (ctx.header().key) {
-    case api_versions_handler::api::key: {
+    case api_versions_api::key: {
         auto r = api_versions_handler::handle_raw(ctx);
         if (r.data.error_code == error_code::none) {
             ctx.sasl().set_state(security::sasl_server::sasl_state::handshake);
@@ -247,8 +234,8 @@ handle_auth(request_context&& ctx, ss::smp_service_group g) {
 // only track latency for push and fetch requests
 bool track_latency(api_key key) {
     switch (key) {
-    case fetch_handler::api::key:
-    case produce_handler::api::key:
+    case fetch_api::key:
+    case produce_api::key:
         return true;
     default:
         return false;
@@ -274,47 +261,14 @@ process_request(request_context&& ctx, ss::smp_service_group g) {
             }));
     }
 
-    switch (ctx.header().key) {
-    case api_versions_handler::api::key:
-        return do_process<api_versions_handler>(std::move(ctx), g);
-    case metadata_handler::api::key:
-        return do_process<metadata_handler>(std::move(ctx), g);
-    case list_groups_handler::api::key:
-        return do_process<list_groups_handler>(std::move(ctx), g);
-    case find_coordinator_handler::api::key:
-        return do_process<find_coordinator_handler>(std::move(ctx), g);
-    case offset_fetch_handler::api::key:
-        return do_process<offset_fetch_handler>(std::move(ctx), g);
-    case produce_handler::api::key:
-        return do_process<produce_handler>(std::move(ctx), g);
-    case list_offsets_handler::api::key:
-        return do_process<list_offsets_handler>(std::move(ctx), g);
-    case offset_commit_handler::api::key:
-        return do_process<offset_commit_handler>(std::move(ctx), g);
-    case fetch_handler::api::key:
-        return do_process<fetch_handler>(std::move(ctx), g);
-    case join_group_handler::api::key:
-        return do_process<join_group_handler>(std::move(ctx), g);
-    case heartbeat_handler::api::key:
-        return do_process<heartbeat_handler>(std::move(ctx), g);
-    case leave_group_handler::api::key:
-        return do_process<leave_group_handler>(std::move(ctx), g);
-    case sync_group_handler::api::key:
-        return do_process<sync_group_handler>(std::move(ctx), g);
-    case create_topics_handler::api::key:
-        return do_process<create_topics_handler>(std::move(ctx), g);
-    case describe_configs_handler::api::key:
-        return do_process<describe_configs_handler>(std::move(ctx), g);
-    case alter_configs_handler::api::key:
-        return do_process<alter_configs_handler>(std::move(ctx), g);
-    case delete_topics_handler::api::key:
-        return do_process<delete_topics_handler>(std::move(ctx), g);
-    case describe_groups_handler::api::key:
-        return do_process<describe_groups_handler>(std::move(ctx), g);
-    case sasl_handshake_handler::api::key:
+    auto& key = ctx.header().key;
+
+    if (key == sasl_handshake_handler::api::key) {
         return process_result_stages::single_stage(ctx.respond(
           sasl_handshake_response(error_code::illegal_sasl_state, {})));
-    case sasl_authenticate_handler::api::key: {
+    }
+
+    if (key == sasl_authenticate_handler::api::key) {
         sasl_authenticate_response_data data{
           .error_code = error_code::illegal_sasl_state,
           .error_message = "Authentication process already completed",
@@ -322,33 +276,11 @@ process_request(request_context&& ctx, ss::smp_service_group g) {
         return process_result_stages::single_stage(
           ctx.respond(sasl_authenticate_response(std::move(data))));
     }
-    case init_producer_id_handler::api::key:
-        return do_process<init_producer_id_handler>(std::move(ctx), g);
-    case incremental_alter_configs_handler::api::key:
-        return do_process<incremental_alter_configs_handler>(std::move(ctx), g);
-    case delete_groups_handler::api::key:
-        return do_process<delete_groups_handler>(std::move(ctx), g);
-    case describe_acls_handler::api::key:
-        return do_process<describe_acls_handler>(std::move(ctx), g);
-    case describe_log_dirs_handler::api::key:
-        return do_process<describe_log_dirs_handler>(std::move(ctx), g);
-    case create_acls_handler::api::key:
-        return do_process<create_acls_handler>(std::move(ctx), g);
-    case delete_acls_handler::api::key:
-        return do_process<delete_acls_handler>(std::move(ctx), g);
-    case add_partitions_to_txn_handler::api::key:
-        return do_process<add_partitions_to_txn_handler>(std::move(ctx), g);
-    case txn_offset_commit_handler::api::key:
-        return do_process<txn_offset_commit_handler>(std::move(ctx), g);
-    case add_offsets_to_txn_handler::api::key:
-        return do_process<add_offsets_to_txn_handler>(std::move(ctx), g);
-    case end_txn_handler::api::key:
-        return do_process<end_txn_handler>(std::move(ctx), g);
-    case create_partitions_handler::api::key:
-        return do_process<create_partitions_handler>(std::move(ctx), g);
-    case offset_for_leader_epoch_handler::api::key:
-        return do_process<offset_for_leader_epoch_handler>(std::move(ctx), g);
-    };
+
+    if (auto handler = handler_for_key(key)) {
+        return process_generic(*handler, std::move(ctx), g);
+    }
+
     throw std::runtime_error(
       fmt::format("Unsupported API {}", ctx.header().key));
 }

From 38be81c6485cea2b841fc9de4200ac09e62a5e97 Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Tue, 5 Jul 2022 17:44:12 -0700
Subject: [PATCH 128/201] Add support for memory estimation to handlers

Currently we use a single memory estimate for all kafka request types,
but different API calls may use wildly different amounts of memory.

This change allows each handler to perform an API-specific calculation
instead.
---
 src/v/kafka/server/handlers/handler.h         | 12 +++++++++-
 .../server/handlers/handler_interface.cc      | 17 +++++++++++---
 .../kafka/server/handlers/handler_interface.h | 21 ++++++++++++++++++
 src/v/kafka/server/handlers/join_group.h      |  3 +++
 src/v/kafka/server/handlers/offset_commit.h   |  4 ++++
 src/v/kafka/server/handlers/produce.h         |  3 +++
 src/v/kafka/server/handlers/sync_group.h      |  3 +++
 src/v/kafka/server/response.h                 | 22 +++++++++++++++++++
 8 files changed, 81 insertions(+), 4 deletions(-)

diff --git a/src/v/kafka/server/handlers/handler.h b/src/v/kafka/server/handlers/handler.h
index 8895026c4e32c..b6b3efa8aea8d 100644
--- a/src/v/kafka/server/handlers/handler.h
+++ b/src/v/kafka/server/handlers/handler.h
@@ -18,16 +18,26 @@
 
 namespace kafka {
 
+using memory_estimate_fn = size_t(size_t);
+
 template<
   typename RequestApi,
   api_version::type MinSupported,
-  api_version::type MaxSupported>
+  api_version::type MaxSupported,
+  memory_estimate_fn MemEstimator = default_memory_estimate>
 struct single_stage_handler {
     using api = RequestApi;
     static constexpr api_version min_supported = api_version(MinSupported);
     static constexpr api_version max_supported = api_version(MaxSupported);
     static ss::future<response_ptr>
       handle(request_context, ss::smp_service_group);
+    /**
+     * See handler_interface::memory_estimate for a description of this
+     * function.
+     */
+    static size_t memory_estimate(size_t request_size) {
+        return MemEstimator(request_size);
+    }
 };
 
 template<typename T>
diff --git a/src/v/kafka/server/handlers/handler_interface.cc b/src/v/kafka/server/handlers/handler_interface.cc
index 140271e41cf53..56721e8331968 100644
--- a/src/v/kafka/server/handlers/handler_interface.cc
+++ b/src/v/kafka/server/handlers/handler_interface.cc
@@ -12,6 +12,7 @@
 
 #include "kafka/server/handlers/handlers.h"
 #include "kafka/server/handlers/produce.h"
+#include "kafka/server/response.h"
 #include "kafka/types.h"
 
 #include <optional>
@@ -26,15 +27,18 @@ struct handler_info {
       api_key key,
       const char* name,
       api_version min_api,
-      api_version max_api) noexcept
+      api_version max_api,
+      memory_estimate_fn* mem_estimate) noexcept
       : _key(key)
       , _name(name)
       , _min_api(min_api)
-      , _max_api(max_api) {}
+      , _max_api(max_api)
+      , _mem_estimate(mem_estimate) {}
 
     api_key _key;
     const char* _name;
     api_version _min_api, _max_api;
+    memory_estimate_fn* _mem_estimate;
 };
 
 /**
@@ -67,6 +71,9 @@ struct handler_base final : public handler_interface {
     api_key key() const override { return _info._key; }
     const char* name() const override { return _info._name; }
 
+    size_t memory_estimate(size_t request_size) const override {
+        return _info._mem_estimate(request_size);
+    }
     /**
      * Only handle varies with one or two pass, since one pass handlers
      * must pass through single_stage() to covert them to two-pass.
@@ -98,7 +105,11 @@ template<KafkaApiHandlerAny H>
 struct handler_holder {
     static const inline handler_base<KafkaApiTwoPhaseHandler<H>> instance{
       handler_info{
-        H::api::key, H::api::name, H::min_supported, H::max_supported},
+        H::api::key,
+        H::api::name,
+        H::min_supported,
+        H::max_supported,
+        H::memory_estimate},
       H::handle};
 };
 
diff --git a/src/v/kafka/server/handlers/handler_interface.h b/src/v/kafka/server/handlers/handler_interface.h
index a88b49490d75e..05e21ec497e06 100644
--- a/src/v/kafka/server/handlers/handler_interface.h
+++ b/src/v/kafka/server/handlers/handler_interface.h
@@ -44,6 +44,27 @@ struct handler_interface {
      */
     virtual api_key key() const = 0;
 
+    /**
+     * @brief Estimates the memory used to process the request.
+     *
+     * Returns an estimate of the memory needed to process a request. This is
+     * used to block the request until sufficient memory is available using the
+     * "memory units" semaphore. Ideally this should be a conservative request
+     * (i.e., a possible overestimate in cases where the memory use may vary
+     * significantly) as the result of a too-small estimate may be an
+     * out-of-memory condition, while a too-large estimate will "merely" reduce
+     * performance.
+     *
+     * Handlers may also return an initial, small estimate here covering the
+     * first part of processing, then dynamically increase their memory
+     * allocation later on during processing when the full memory size is known.
+     *
+     * Unfortunately, this estimate happens early in the decoding process, after
+     * only the request size and header has been read, so handlers don't have
+     * as much information as they may like to make this decision.
+     */
+    virtual size_t memory_estimate(size_t request_size) const = 0;
+
     /**
      * @brief Handles the request.
      *
diff --git a/src/v/kafka/server/handlers/join_group.h b/src/v/kafka/server/handlers/join_group.h
index 1d3ec508350a7..5f4ebf3220277 100644
--- a/src/v/kafka/server/handlers/join_group.h
+++ b/src/v/kafka/server/handlers/join_group.h
@@ -19,5 +19,8 @@ struct join_group_handler {
     static constexpr api_version min_supported = api_version(0);
     static constexpr api_version max_supported = api_version(5);
     static process_result_stages handle(request_context, ss::smp_service_group);
+    static size_t memory_estimate(size_t request_size) {
+        return default_memory_estimate(request_size);
+    }
 };
 } // namespace kafka
diff --git a/src/v/kafka/server/handlers/offset_commit.h b/src/v/kafka/server/handlers/offset_commit.h
index 2355c580aeac2..7026c855ce597 100644
--- a/src/v/kafka/server/handlers/offset_commit.h
+++ b/src/v/kafka/server/handlers/offset_commit.h
@@ -11,6 +11,7 @@
 #pragma once
 #include "kafka/protocol/offset_commit.h"
 #include "kafka/server/handlers/handler.h"
+#include "kafka/server/response.h"
 
 namespace kafka {
 
@@ -22,5 +23,8 @@ struct offset_commit_handler {
     static constexpr api_version min_supported = api_version(1);
     static constexpr api_version max_supported = api_version(7);
     static process_result_stages handle(request_context, ss::smp_service_group);
+    static size_t memory_estimate(size_t request_size) {
+        return default_memory_estimate(request_size);
+    }
 };
 } // namespace kafka
diff --git a/src/v/kafka/server/handlers/produce.h b/src/v/kafka/server/handlers/produce.h
index ae7673858d77b..fbdace0fb23fa 100644
--- a/src/v/kafka/server/handlers/produce.h
+++ b/src/v/kafka/server/handlers/produce.h
@@ -20,6 +20,9 @@ struct produce_handler {
     static constexpr api_version max_supported = api_version(7);
     static process_result_stages handle(request_context, ss::smp_service_group);
     static constexpr auto despam_interval = std::chrono::minutes(5);
+    static size_t memory_estimate(size_t request_size) {
+        return default_memory_estimate(request_size);
+    }
 };
 
 } // namespace kafka
diff --git a/src/v/kafka/server/handlers/sync_group.h b/src/v/kafka/server/handlers/sync_group.h
index 711cca63bae92..88b85acbb14ad 100644
--- a/src/v/kafka/server/handlers/sync_group.h
+++ b/src/v/kafka/server/handlers/sync_group.h
@@ -19,6 +19,9 @@ struct sync_group_handler {
     static constexpr api_version min_supported = api_version(0);
     static constexpr api_version max_supported = api_version(3);
     static process_result_stages handle(request_context, ss::smp_service_group);
+    static size_t memory_estimate(size_t request_size) {
+        return default_memory_estimate(request_size);
+    }
 };
 
 } // namespace kafka
diff --git a/src/v/kafka/server/response.h b/src/v/kafka/server/response.h
index c7be3de993d36..0bbe52bc978e7 100644
--- a/src/v/kafka/server/response.h
+++ b/src/v/kafka/server/response.h
@@ -105,4 +105,26 @@ struct process_result_stages {
     ss::future<response_ptr> response;
 };
 
+/**
+ * @brief The default memory size estimate.
+ *
+ * Requests must make an up-front estimate of the amount of memory they will
+ * use, in order to obtain the corresponding number of units from the memory
+ * semaphore (blocking if they are not available). Each request type can use
+ * its own estimation approach, but if not specified this default estimator
+ * will be used.
+ *
+ * Now, this estimator is very poor for many request types: it only applies a
+ * multiplier to the request size, so only makes
+ * sense for requests (such as produce) where the size of the request is a
+ * good indicator of the total memory size. For requests with a small request
+ * but a large response (fetch, metadata, etc), it is not appropriate.
+ *
+ * @return size_t the estimated size required to process the request
+ */
+constexpr size_t default_memory_estimate(size_t request_size) {
+    // Allow for extra copies and bookkeeping
+    return request_size * 2 + 8000; // NOLINT
+}
+
 } // namespace kafka

From 5a9ca081b3e42ad01065abd9d08d4e9eaf3a3aa4 Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Tue, 5 Jul 2022 21:07:27 -0700
Subject: [PATCH 129/201] Use handler specific memory estimate

In connection_context, we now use the handler-specific initial memory
use estimate, rather than a single estimate for every handler type.
---
 src/v/kafka/server/connection_context.cc | 25 ++++++++++++++++--------
 src/v/kafka/server/connection_context.h  |  3 ++-
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/src/v/kafka/server/connection_context.cc b/src/v/kafka/server/connection_context.cc
index d046f5cbfdb0e..e8546672a17ff 100644
--- a/src/v/kafka/server/connection_context.cc
+++ b/src/v/kafka/server/connection_context.cc
@@ -13,10 +13,12 @@
 #include "bytes/iobuf.h"
 #include "config/configuration.h"
 #include "kafka/protocol/sasl_authenticate.h"
+#include "kafka/server/handlers/handler_interface.h"
 #include "kafka/server/protocol.h"
 #include "kafka/server/protocol_utils.h"
 #include "kafka/server/quota_manager.h"
 #include "kafka/server/request_context.h"
+#include "kafka/server/response.h"
 #include "security/exceptions.h"
 #include "units.h"
 #include "vlog.h"
@@ -236,8 +238,9 @@ connection_context::throttle_request(
     }
     auto track = track_latency(hdr.key);
     return fut
-      .then(
-        [this, request_size] { return reserve_request_units(request_size); })
+      .then([this, key = hdr.key, request_size] {
+          return reserve_request_units(key, request_size);
+      })
       .then([this, delay, track, tracker = std::move(tracker)](
               ss::semaphore_units<> units) mutable {
           return server().get_request_unit().then(
@@ -262,15 +265,21 @@ connection_context::throttle_request(
 }
 
 ss::future<ss::semaphore_units<>>
-connection_context::reserve_request_units(size_t size) {
-    // Allow for extra copies and bookkeeping
-    auto mem_estimate = size * 2 + 8000; // NOLINT
-    if (mem_estimate >= (size_t)std::numeric_limits<int32_t>::max()) {
+connection_context::reserve_request_units(api_key key, size_t size) {
+    // Defer to the handler for the request type for the memory estimate, but
+    // if the request isn't found, use the default estimate (although in that
+    // case the request is likely for an API we don't support or malformed, so
+    // it is likely to fail shortly anyway).
+    auto handler = handler_for_key(key);
+    auto mem_estimate = handler ? (*handler)->memory_estimate(size)
+                                : default_memory_estimate(size);
+    if (unlikely(mem_estimate >= (size_t)std::numeric_limits<int32_t>::max())) {
         // TODO: Create error response using the specific API?
         throw std::runtime_error(fmt::format(
-          "request too large > 1GB (size: {}; estimate: {})",
+          "request too large > 1GB (size: {}, estimate: {}, API: {})",
           size,
-          mem_estimate));
+          mem_estimate,
+          handler ? (*handler)->name() : "<bad key>"));
     }
     auto fut = ss::get_units(_rs.memory(), mem_estimate);
     if (_rs.memory().waiters()) {
diff --git a/src/v/kafka/server/connection_context.h b/src/v/kafka/server/connection_context.h
index 9de753f27b243..2b18788fcbf8e 100644
--- a/src/v/kafka/server/connection_context.h
+++ b/src/v/kafka/server/connection_context.h
@@ -167,7 +167,8 @@ class connection_context final
     // Reserve units from memory from the memory semaphore in proportion
     // to the number of bytes the request procesisng is expected to
     // take.
-    ss::future<ss::semaphore_units<>> reserve_request_units(size_t size);
+    ss::future<ss::semaphore_units<>>
+    reserve_request_units(api_key key, size_t size);
 
     // Apply backpressure sequence, where the request processing may be
     // delayed for various reasons, including throttling but also because

From 958b9330bcbb69f16e9cb2502417c403845d45fc Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Sun, 10 Jul 2022 16:22:20 -0700
Subject: [PATCH 130/201] Move session_resources object to top level

The session_resources type was a private member of
connection_context, but as we want to use it more
broadly, move it out as a standalone public class.

Additionally, pass it by shared_pointer in preparation
for later changes which will feed it into requests.
---
 src/v/kafka/server/connection_context.cc | 17 ++++--
 src/v/kafka/server/connection_context.h  | 71 ++++++++++++------------
 src/v/kafka/server/request_context.h     |  3 +-
 src/v/kafka/server/requests.cc           | 19 +++++--
 4 files changed, 63 insertions(+), 47 deletions(-)

diff --git a/src/v/kafka/server/connection_context.cc b/src/v/kafka/server/connection_context.cc
index e8546672a17ff..e94b4af156883 100644
--- a/src/v/kafka/server/connection_context.cc
+++ b/src/v/kafka/server/connection_context.cc
@@ -25,6 +25,7 @@
 
 #include <seastar/core/coroutine.hh>
 #include <seastar/core/scattered_message.hh>
+#include <seastar/core/shared_ptr.hh>
 #include <seastar/core/sleep.hh>
 #include <seastar/core/with_timeout.hh>
 
@@ -184,8 +185,9 @@ ss::future<> connection_context::handle_auth_v0(const size_t size) {
           },
           std::move(request_buf),
           0s);
+        auto sres = session_resources{};
         auto resp = co_await kafka::process_request(
-                      std::move(ctx), _proto.smp_group())
+                      std::move(ctx), _proto.smp_group(), sres)
                       .response;
         auto data = std::move(*resp).release();
         response.decode(std::move(data), version);
@@ -215,8 +217,7 @@ bool connection_context::is_finished_parsing() const {
     return _rs.conn->input().eof() || _rs.abort_requested();
 }
 
-ss::future<connection_context::session_resources>
-connection_context::throttle_request(
+ss::future<session_resources> connection_context::throttle_request(
   const request_header& hdr, size_t request_size) {
     // update the throughput tracker for this client using the
     // size of the current request and return any computed delay
@@ -291,11 +292,15 @@ connection_context::reserve_request_units(api_key key, size_t size) {
 ss::future<>
 connection_context::dispatch_method_once(request_header hdr, size_t size) {
     return throttle_request(hdr, size).then([this, hdr = std::move(hdr), size](
-                                              session_resources sres) mutable {
+                                              session_resources
+                                                sres_in) mutable {
         if (_rs.abort_requested()) {
             // protect against shutdown behavior
             return ss::make_ready_future<>();
         }
+
+        auto sres = ss::make_lw_shared(std::move(sres_in));
+
         auto remaining = size - request_header_size
                          - hdr.client_id_buffer.size() - hdr.tags_size_bytes;
         return read_iobuf_exactly(_rs.conn->input(), remaining)
@@ -307,7 +312,7 @@ connection_context::dispatch_method_once(request_header hdr, size_t size) {
               }
               auto self = shared_from_this();
               auto rctx = request_context(
-                self, std::move(hdr), std::move(buf), sres.backpressure_delay);
+                self, std::move(hdr), std::move(buf), sres->backpressure_delay);
               /*
                * we process requests in order since all subsequent requests
                * are dependent on authentication having completed.
@@ -332,7 +337,7 @@ connection_context::dispatch_method_once(request_header hdr, size_t size) {
               const sequence_id seq = _seq_idx;
               _seq_idx = _seq_idx + sequence_id(1);
               auto res = kafka::process_request(
-                std::move(rctx), _proto.smp_group());
+                std::move(rctx), _proto.smp_group(), *sres);
               /**
                * first stage processed in a foreground.
                */
diff --git a/src/v/kafka/server/connection_context.h b/src/v/kafka/server/connection_context.h
index 2b18788fcbf8e..10eb4d6977560 100644
--- a/src/v/kafka/server/connection_context.h
+++ b/src/v/kafka/server/connection_context.h
@@ -11,6 +11,7 @@
 #pragma once
 #include "kafka/server/protocol.h"
 #include "kafka/server/response.h"
+#include "kafka/types.h"
 #include "net/server.h"
 #include "seastarx.h"
 #include "security/acl.h"
@@ -37,6 +38,41 @@ using authz_quiet = ss::bool_class<struct authz_quiet_tag>;
 struct request_header;
 class request_context;
 
+// used to track number of pending requests
+class request_tracker {
+public:
+    explicit request_tracker(net::server_probe& probe) noexcept
+      : _probe(probe) {
+        _probe.request_received();
+    }
+    request_tracker(const request_tracker&) = delete;
+    request_tracker(request_tracker&&) = delete;
+    request_tracker& operator=(const request_tracker&) = delete;
+    request_tracker& operator=(request_tracker&&) = delete;
+
+    ~request_tracker() noexcept { _probe.request_completed(); }
+
+private:
+    net::server_probe& _probe;
+};
+
+// Used to hold resources associated with a given request until
+// the response has been sent, as well as to track some statistics
+// about the request.
+//
+// The resources in particular should not be destroyed until
+// the request is complete (e.g., all the information written to
+// the socket so that no userspace buffers remain).
+struct session_resources {
+    using pointer = ss::lw_shared_ptr<session_resources>;
+
+    ss::lowres_clock::duration backpressure_delay;
+    ss::semaphore_units<> memlocks;
+    ss::semaphore_units<> queue_units;
+    std::unique_ptr<hdr_hist::measurement> method_latency;
+    std::unique_ptr<request_tracker> tracker;
+};
+
 class connection_context final
   : public ss::enable_lw_shared_from_this<connection_context> {
 public:
@@ -131,39 +167,6 @@ class connection_context final
     }
 
 private:
-    // used to track number of pending requests
-    class request_tracker {
-    public:
-        explicit request_tracker(net::server_probe& probe) noexcept
-          : _probe(probe) {
-            _probe.request_received();
-        }
-        request_tracker(const request_tracker&) = delete;
-        request_tracker(request_tracker&&) = delete;
-        request_tracker& operator=(const request_tracker&) = delete;
-        request_tracker& operator=(request_tracker&&) = delete;
-
-        ~request_tracker() noexcept { _probe.request_completed(); }
-
-    private:
-        net::server_probe& _probe;
-    };
-
-    // Used to hold resources associated with a given request until
-    // the response has been send, as well as to track some statistics
-    // about the request.
-    //
-    // The resources in particular should be not be destroyed until
-    // the request is complete (e.g., all the information written to
-    // the socket so that no userspace buffers remain).
-    struct session_resources {
-        ss::lowres_clock::duration backpressure_delay;
-        ss::semaphore_units<> memlocks;
-        ss::semaphore_units<> queue_units;
-        std::unique_ptr<hdr_hist::measurement> method_latency;
-        std::unique_ptr<request_tracker> tracker;
-    };
-
     // Reserve units from memory from the memory semaphore in proportion
     // to the number of bytes the request procesisng is expected to
     // take.
@@ -205,7 +208,7 @@ class connection_context final
      */
     struct response_and_resources {
         response_ptr response;
-        session_resources resources;
+        session_resources::pointer resources;
     };
 
     using sequence_id = named_type<uint64_t, struct kafka_protocol_sequence>;
diff --git a/src/v/kafka/server/request_context.h b/src/v/kafka/server/request_context.h
index 1f7b1f35e7668..ddee4747a43f3 100644
--- a/src/v/kafka/server/request_context.h
+++ b/src/v/kafka/server/request_context.h
@@ -219,7 +219,8 @@ class request_context {
 };
 
 // Executes the API call identified by the specified request_context.
-process_result_stages process_request(request_context&&, ss::smp_service_group);
+process_result_stages process_request(
+  request_context&&, ss::smp_service_group, const session_resources&);
 
 bool track_latency(api_key);
 
diff --git a/src/v/kafka/server/requests.cc b/src/v/kafka/server/requests.cc
index 5d5fcbd0b7daf..950ff2ca2d628 100644
--- a/src/v/kafka/server/requests.cc
+++ b/src/v/kafka/server/requests.cc
@@ -10,6 +10,7 @@
 #include "kafka/protocol/schemata/api_versions_request.h"
 #include "kafka/protocol/schemata/fetch_request.h"
 #include "kafka/protocol/schemata/produce_request.h"
+#include "kafka/server/connection_context.h"
 #include "kafka/server/handlers/api_versions.h"
 #include "kafka/server/handlers/handler_interface.h"
 #include "kafka/server/handlers/sasl_authenticate.h"
@@ -80,16 +81,20 @@ requires(KafkaApiHandler<Request> || KafkaApiTwoPhaseHandler<Request>)
 }
 
 process_result_stages process_generic(
-  handler handler, request_context&& ctx, ss::smp_service_group g) {
+  handler handler,
+  request_context&& ctx,
+  ss::smp_service_group g,
+  const session_resources& sres) {
     vlog(
       klog.trace,
-      "[{}:{}] processing name:{}, key:{}, version:{} for {}",
+      "[{}:{}] processing name:{}, key:{}, version:{} for {}, mem_units: {}",
       ctx.connection()->client_host(),
       ctx.connection()->client_port(),
       handler->name(),
       ctx.header().key,
       ctx.header().version,
-      ctx.header().client_id.value_or(std::string_view("unset-client-id")));
+      ctx.header().client_id.value_or(std::string_view("unset-client-id")),
+      sres.memlocks.count());
 
     // We do a version check for most API requests, but for api_version
     // requests we skip them. We do not apply them for api_versions,
@@ -242,8 +247,10 @@ bool track_latency(api_key key) {
     }
 }
 
-process_result_stages
-process_request(request_context&& ctx, ss::smp_service_group g) {
+process_result_stages process_request(
+  request_context&& ctx,
+  ss::smp_service_group g,
+  const session_resources& sres) {
     /*
      * requests are handled as normal when auth is disabled. otherwise no
      * request is handled until the auth process has completed.
@@ -278,7 +285,7 @@ process_request(request_context&& ctx, ss::smp_service_group g) {
     }
 
     if (auto handler = handler_for_key(key)) {
-        return process_generic(*handler, std::move(ctx), g);
+        return process_generic(*handler, std::move(ctx), g, sres);
     }
 
     throw std::runtime_error(

From 213b6716d6aa2bb8e1f29623074868017db19deb Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Wed, 13 Jul 2022 16:16:10 -0700
Subject: [PATCH 131/201] Sort kafka/server forward includes

---
 src/v/kafka/server/fwd.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/v/kafka/server/fwd.h b/src/v/kafka/server/fwd.h
index a893be15ed167..4aa0a30909ec1 100644
--- a/src/v/kafka/server/fwd.h
+++ b/src/v/kafka/server/fwd.h
@@ -13,13 +13,14 @@
 
 namespace kafka {
 
+// sorted
 class coordinator_ntp_mapper;
 class fetch_session_cache;
 class group_manager;
 class group_router;
+class quota_manager;
+class request_context;
 class rm_group_frontend;
 class rm_group_proxy_impl;
-class request_context;
-class quota_manager;
 
 } // namespace kafka

From d645c7a78d548706e4f2b9d79ea48115e765566c Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Wed, 13 Jul 2022 20:15:15 -0700
Subject: [PATCH 132/201] Introduce handler template for two-stage handlers

Single-stage handlers have a handler template which means that handler
objects can be declared in a single line specifying their api object,
min and max API versions.

This change extends this nice concept to two-stage handlers as well.
---
 src/v/kafka/server/handlers/handler.h        | 54 ++++++++++++++++++--
 src/v/kafka/server/handlers/join_group.cc    |  1 +
 src/v/kafka/server/handlers/join_group.h     | 11 +---
 src/v/kafka/server/handlers/offset_commit.cc |  1 +
 src/v/kafka/server/handlers/offset_commit.h  | 11 +---
 src/v/kafka/server/handlers/produce.cc       |  3 ++
 src/v/kafka/server/handlers/produce.h        | 11 +---
 src/v/kafka/server/handlers/sync_group.cc    |  1 +
 src/v/kafka/server/handlers/sync_group.h     | 10 +---
 9 files changed, 62 insertions(+), 41 deletions(-)

diff --git a/src/v/kafka/server/handlers/handler.h b/src/v/kafka/server/handlers/handler.h
index b6b3efa8aea8d..749943cd6077f 100644
--- a/src/v/kafka/server/handlers/handler.h
+++ b/src/v/kafka/server/handlers/handler.h
@@ -20,17 +20,28 @@ namespace kafka {
 
 using memory_estimate_fn = size_t(size_t);
 
+/**
+ * Handlers are generally specializations of this template, via one of the
+ * two aliases (single_stage_handler or two_phase_handler) declared below,
+ * though it is not strictly necessary (only conforming to one of the two
+ * KafkaApi* concepts is needed).
+ *
+ * The benefit of this template is that it takes care of most of the
+ * handler boilerplate.
+ */
 template<
   typename RequestApi,
   api_version::type MinSupported,
   api_version::type MaxSupported,
-  memory_estimate_fn MemEstimator = default_memory_estimate>
-struct single_stage_handler {
+  typename HandleRetType,
+  memory_estimate_fn MemEstimator>
+struct handler_template {
     using api = RequestApi;
     static constexpr api_version min_supported = api_version(MinSupported);
     static constexpr api_version max_supported = api_version(MaxSupported);
-    static ss::future<response_ptr>
-      handle(request_context, ss::smp_service_group);
+
+    static HandleRetType handle(request_context, ss::smp_service_group);
+
     /**
      * See handler_interface::memory_estimate for a description of this
      * function.
@@ -40,6 +51,41 @@ struct single_stage_handler {
     }
 };
 
+/**
+ * A single-stage handler implements the entire request handling in the initial
+ * stage which occurs before any subsequent request is processed.
+ */
+template<
+  typename RequestApi,
+  api_version::type MinSupported,
+  api_version::type MaxSupported,
+  memory_estimate_fn MemEstimator = default_memory_estimate>
+using single_stage_handler = handler_template<
+  RequestApi,
+  MinSupported,
+  MaxSupported,
+  ss::future<response_ptr>,
+  MemEstimator>;
+
+/**
+ * A two-stage handler has an initial stage which happens before any other
+ * request can start processing (as in a single-stage handler) but then also has
+ * a second stage which is processed in the background allowing other requests
+ * on the same connection to start their handler. Responses are still sent in
+ * order, but processing is out-of-order.
+ */
+template<
+  typename RequestApi,
+  api_version::type MinSupported,
+  api_version::type MaxSupported,
+  memory_estimate_fn MemEstimator = default_memory_estimate>
+using two_phase_handler = handler_template<
+  RequestApi,
+  MinSupported,
+  MaxSupported,
+  process_result_stages,
+  MemEstimator>;
+
 template<typename T>
 concept KafkaApiHandler = KafkaApi<typename T::api> && requires(
   T h, request_context&& ctx, ss::smp_service_group g) {
diff --git a/src/v/kafka/server/handlers/join_group.cc b/src/v/kafka/server/handlers/join_group.cc
index f7bc53b80a4e4..57fe29ca93623 100644
--- a/src/v/kafka/server/handlers/join_group.cc
+++ b/src/v/kafka/server/handlers/join_group.cc
@@ -35,6 +35,7 @@ static void decode_request(request_context& ctx, join_group_request& req) {
       fmt::format("{}", ctx.connection()->client_host()));
 }
 
+template<>
 process_result_stages join_group_handler::handle(
   request_context ctx, [[maybe_unused]] ss::smp_service_group g) {
     join_group_request request;
diff --git a/src/v/kafka/server/handlers/join_group.h b/src/v/kafka/server/handlers/join_group.h
index 5f4ebf3220277..1830badc2f277 100644
--- a/src/v/kafka/server/handlers/join_group.h
+++ b/src/v/kafka/server/handlers/join_group.h
@@ -14,13 +14,6 @@
 
 namespace kafka {
 
-struct join_group_handler {
-    using api = join_group_api;
-    static constexpr api_version min_supported = api_version(0);
-    static constexpr api_version max_supported = api_version(5);
-    static process_result_stages handle(request_context, ss::smp_service_group);
-    static size_t memory_estimate(size_t request_size) {
-        return default_memory_estimate(request_size);
-    }
-};
+using join_group_handler = two_phase_handler<join_group_api, 0, 5>;
+
 } // namespace kafka
diff --git a/src/v/kafka/server/handlers/offset_commit.cc b/src/v/kafka/server/handlers/offset_commit.cc
index ddaa68616aaba..7c1c4d10976a0 100644
--- a/src/v/kafka/server/handlers/offset_commit.cc
+++ b/src/v/kafka/server/handlers/offset_commit.cc
@@ -53,6 +53,7 @@ struct offset_commit_ctx {
       , ssg(ssg) {}
 };
 
+template<>
 process_result_stages
 offset_commit_handler::handle(request_context ctx, ss::smp_service_group ssg) {
     offset_commit_request request;
diff --git a/src/v/kafka/server/handlers/offset_commit.h b/src/v/kafka/server/handlers/offset_commit.h
index 7026c855ce597..5a0512d4d043c 100644
--- a/src/v/kafka/server/handlers/offset_commit.h
+++ b/src/v/kafka/server/handlers/offset_commit.h
@@ -18,13 +18,6 @@ namespace kafka {
 // in version 0 kafka stores offsets in zookeeper. if we ever need to
 // support version 0 then we need to do some code review to see if this has
 // any implications on semantics.
-struct offset_commit_handler {
-    using api = offset_commit_api;
-    static constexpr api_version min_supported = api_version(1);
-    static constexpr api_version max_supported = api_version(7);
-    static process_result_stages handle(request_context, ss::smp_service_group);
-    static size_t memory_estimate(size_t request_size) {
-        return default_memory_estimate(request_size);
-    }
-};
+using offset_commit_handler = two_phase_handler<offset_commit_api, 1, 7>;
+
 } // namespace kafka
diff --git a/src/v/kafka/server/handlers/produce.cc b/src/v/kafka/server/handlers/produce.cc
index 8e9a2b051ecd9..29b2dfcde8a0d 100644
--- a/src/v/kafka/server/handlers/produce.cc
+++ b/src/v/kafka/server/handlers/produce.cc
@@ -43,6 +43,8 @@
 
 namespace kafka {
 
+static constexpr auto despam_interval = std::chrono::minutes(5);
+
 produce_response produce_request::make_error_response(error_code error) const {
     produce_response response;
 
@@ -464,6 +466,7 @@ static std::vector<topic_produce_stages> produce_topics(produce_ctx& octx) {
     return topics;
 }
 
+template<>
 process_result_stages
 produce_handler::handle(request_context ctx, ss::smp_service_group ssg) {
     produce_request request;
diff --git a/src/v/kafka/server/handlers/produce.h b/src/v/kafka/server/handlers/produce.h
index fbdace0fb23fa..617b7cb1c98f7 100644
--- a/src/v/kafka/server/handlers/produce.h
+++ b/src/v/kafka/server/handlers/produce.h
@@ -14,15 +14,6 @@
 
 namespace kafka {
 
-struct produce_handler {
-    using api = produce_api;
-    static constexpr api_version min_supported = api_version(0);
-    static constexpr api_version max_supported = api_version(7);
-    static process_result_stages handle(request_context, ss::smp_service_group);
-    static constexpr auto despam_interval = std::chrono::minutes(5);
-    static size_t memory_estimate(size_t request_size) {
-        return default_memory_estimate(request_size);
-    }
-};
+using produce_handler = two_phase_handler<produce_api, 0, 7>;
 
 } // namespace kafka
diff --git a/src/v/kafka/server/handlers/sync_group.cc b/src/v/kafka/server/handlers/sync_group.cc
index fcbfe4a1c7437..0ddee9864020e 100644
--- a/src/v/kafka/server/handlers/sync_group.cc
+++ b/src/v/kafka/server/handlers/sync_group.cc
@@ -21,6 +21,7 @@
 
 namespace kafka {
 
+template<>
 process_result_stages sync_group_handler::handle(
   request_context ctx, [[maybe_unused]] ss::smp_service_group g) {
     sync_group_request request;
diff --git a/src/v/kafka/server/handlers/sync_group.h b/src/v/kafka/server/handlers/sync_group.h
index 88b85acbb14ad..b23ceb79578aa 100644
--- a/src/v/kafka/server/handlers/sync_group.h
+++ b/src/v/kafka/server/handlers/sync_group.h
@@ -14,14 +14,6 @@
 
 namespace kafka {
 
-struct sync_group_handler {
-    using api = sync_group_api;
-    static constexpr api_version min_supported = api_version(0);
-    static constexpr api_version max_supported = api_version(3);
-    static process_result_stages handle(request_context, ss::smp_service_group);
-    static size_t memory_estimate(size_t request_size) {
-        return default_memory_estimate(request_size);
-    }
-};
+using sync_group_handler = two_phase_handler<sync_group_api, 0, 3>;
 
 } // namespace kafka

From bbea197bdc3c611d342d1c5b46ea43d563fa165c Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Wed, 13 Jul 2022 20:54:46 -0700
Subject: [PATCH 133/201] Add the connection context to the memory estimator

Passing the connection context to the estimator allows the
estimator to use the various subsystems to estimate the memory
use of a given request.
---
 src/v/kafka/server/connection_context.cc        |  2 +-
 src/v/kafka/server/fwd.h                        |  1 +
 src/v/kafka/server/handlers/handler.h           | 17 ++++++++++++-----
 .../kafka/server/handlers/handler_interface.cc  |  5 +++--
 src/v/kafka/server/handlers/handler_interface.h | 11 +++++------
 src/v/kafka/server/handlers/metadata.cc         |  1 +
 6 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/src/v/kafka/server/connection_context.cc b/src/v/kafka/server/connection_context.cc
index e94b4af156883..0b263d09c8d69 100644
--- a/src/v/kafka/server/connection_context.cc
+++ b/src/v/kafka/server/connection_context.cc
@@ -272,7 +272,7 @@ connection_context::reserve_request_units(api_key key, size_t size) {
     // case the request is likely for an API we don't support or malformed, so
     // it is likely to fail shortly anyway).
     auto handler = handler_for_key(key);
-    auto mem_estimate = handler ? (*handler)->memory_estimate(size)
+    auto mem_estimate = handler ? (*handler)->memory_estimate(size, *this)
                                 : default_memory_estimate(size);
     if (unlikely(mem_estimate >= (size_t)std::numeric_limits<int32_t>::max())) {
         // TODO: Create error response using the specific API?
diff --git a/src/v/kafka/server/fwd.h b/src/v/kafka/server/fwd.h
index 4aa0a30909ec1..e034d9f650847 100644
--- a/src/v/kafka/server/fwd.h
+++ b/src/v/kafka/server/fwd.h
@@ -14,6 +14,7 @@
 namespace kafka {
 
 // sorted
+class connection_context;
 class coordinator_ntp_mapper;
 class fetch_session_cache;
 class group_manager;
diff --git a/src/v/kafka/server/handlers/handler.h b/src/v/kafka/server/handlers/handler.h
index 749943cd6077f..6e294af073c50 100644
--- a/src/v/kafka/server/handlers/handler.h
+++ b/src/v/kafka/server/handlers/handler.h
@@ -10,6 +10,7 @@
  */
 #pragma once
 #include "kafka/protocol/types.h"
+#include "kafka/server/fwd.h"
 #include "kafka/server/request_context.h"
 #include "kafka/server/response.h"
 #include "kafka/types.h"
@@ -18,7 +19,12 @@
 
 namespace kafka {
 
-using memory_estimate_fn = size_t(size_t);
+using memory_estimate_fn = size_t(size_t, connection_context&);
+
+constexpr size_t
+default_estimate_adaptor(size_t request_size, connection_context&) {
+    return default_memory_estimate(request_size);
+}
 
 /**
  * Handlers are generally specializations of this template, via one of the
@@ -46,8 +52,9 @@ struct handler_template {
      * See handler_interface::memory_estimate for a description of this
      * function.
      */
-    static size_t memory_estimate(size_t request_size) {
-        return MemEstimator(request_size);
+    static size_t
+    memory_estimate(size_t request_size, connection_context& conn_ctx) {
+        return MemEstimator(request_size, conn_ctx);
     }
 };
 
@@ -59,7 +66,7 @@ template<
   typename RequestApi,
   api_version::type MinSupported,
   api_version::type MaxSupported,
-  memory_estimate_fn MemEstimator = default_memory_estimate>
+  memory_estimate_fn MemEstimator = default_estimate_adaptor>
 using single_stage_handler = handler_template<
   RequestApi,
   MinSupported,
@@ -78,7 +85,7 @@ template<
   typename RequestApi,
   api_version::type MinSupported,
   api_version::type MaxSupported,
-  memory_estimate_fn MemEstimator = default_memory_estimate>
+  memory_estimate_fn MemEstimator = default_estimate_adaptor>
 using two_phase_handler = handler_template<
   RequestApi,
   MinSupported,
diff --git a/src/v/kafka/server/handlers/handler_interface.cc b/src/v/kafka/server/handlers/handler_interface.cc
index 56721e8331968..156f305bb51b1 100644
--- a/src/v/kafka/server/handlers/handler_interface.cc
+++ b/src/v/kafka/server/handlers/handler_interface.cc
@@ -71,8 +71,9 @@ struct handler_base final : public handler_interface {
     api_key key() const override { return _info._key; }
     const char* name() const override { return _info._name; }
 
-    size_t memory_estimate(size_t request_size) const override {
-        return _info._mem_estimate(request_size);
+    size_t memory_estimate(
+      size_t request_size, connection_context& conn_ctx) const override {
+        return _info._mem_estimate(request_size, conn_ctx);
     }
     /**
      * Only handle varies with one or two pass, since one pass handlers
diff --git a/src/v/kafka/server/handlers/handler_interface.h b/src/v/kafka/server/handlers/handler_interface.h
index 05e21ec497e06..6f5b368984c84 100644
--- a/src/v/kafka/server/handlers/handler_interface.h
+++ b/src/v/kafka/server/handlers/handler_interface.h
@@ -55,15 +55,14 @@ struct handler_interface {
      * out-of-memory condition, while a too-large estimate will "merely" reduce
      * performance.
      *
-     * Handers may also return an initial, small estimate here covering the
-     * first part of processing, then dynamically increase their memory
-     * allocation later on during processing when the full memory size is known.
-     *
      * Unfortunately, this estimate happens early in the decoding process, after
      * only the request size and header has been read, so handlers don't have
-     * as much information as they may like to make this decision.
+     * as much information as they may like to make this decision. The
+     * connection_context for the associated connection is passed to give access
+     * to global state which may be useful in making the estimate.
      */
-    virtual size_t memory_estimate(size_t request_size) const = 0;
+    virtual size_t memory_estimate(
+      size_t request_size, connection_context& conn_ctx) const = 0;
 
     /**
      * @brief Handles the request.
diff --git a/src/v/kafka/server/handlers/metadata.cc b/src/v/kafka/server/handlers/metadata.cc
index 43813d0e5931a..4865bb5401414 100644
--- a/src/v/kafka/server/handlers/metadata.cc
+++ b/src/v/kafka/server/handlers/metadata.cc
@@ -15,6 +15,7 @@
 #include "config/configuration.h"
 #include "config/node_config.h"
 #include "kafka/server/errors.h"
+#include "kafka/server/fwd.h"
 #include "kafka/server/handlers/details/leader_epoch.h"
 #include "kafka/server/handlers/details/security.h"
 #include "kafka/server/handlers/topics/topic_utils.h"

From c218c2bb3c122a1dedfdb451416113ad48a61a74 Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Sun, 10 Jul 2022 22:27:57 -0700
Subject: [PATCH 134/201] Use a better estimator for metadata requests

Currently we estimate that metadata requests take 8000 + rsize * 2 bytes
of memory to process, where rsize is the size of the request. Since
metadata requests are very small, this ends up being roughly 8000 bytes.

However, metadata requests which return information about every
partition and replica may easily be several MBs in size.

To fix this for metadata requests specifically, we use a new more
conservative estimate which uses the current topic and partition
configuration to give an upper bound on the size.
---
 src/v/kafka/server/handlers/metadata.cc | 65 +++++++++++++++++++++++++
 src/v/kafka/server/handlers/metadata.h  | 15 +++++-
 2 files changed, 78 insertions(+), 2 deletions(-)

diff --git a/src/v/kafka/server/handlers/metadata.cc b/src/v/kafka/server/handlers/metadata.cc
index 4865bb5401414..92e034fb4af87 100644
--- a/src/v/kafka/server/handlers/metadata.cc
+++ b/src/v/kafka/server/handlers/metadata.cc
@@ -19,6 +19,7 @@
 #include "kafka/server/handlers/details/leader_epoch.h"
 #include "kafka/server/handlers/details/security.h"
 #include "kafka/server/handlers/topics/topic_utils.h"
+#include "kafka/server/response.h"
 #include "kafka/types.h"
 #include "likely.h"
 #include "model/metadata.h"
@@ -423,4 +424,68 @@ ss::future<response_ptr> metadata_handler::handle(
     co_return co_await ctx.respond(std::move(reply));
 }
 
+size_t
+metadata_memory_estimator(size_t request_size, connection_context& conn_ctx) {
+    // We cannot make a precise estimate of the size of a metadata response by
+    // examining only the size of the request (nor even by examining the entire
+    // request) since the response depends on the number of partitions in the
+    // cluster. Instead, we return a conservative estimate based on the current
+    // number of topics & partitions in the cluster.
+
+    // Essentially we need to estimate the size taken by a "maximum size"
+    // metadata_response_data response. The maximum size is when metadata for
+    // all topics is returned, which is also a common case in practice. This
+    // involves calculating the size for each topic's portion of the response,
+    // since the size varies both based on the number of partitions and the
+    // replica count.
+
+    // We start with a base estimate of 10K and then proceed to ignore
+    // everything other than the topic/partition part of the response, since
+    // that's what takes space in large responses and we assume the remaining
+    // part of the response (the broker list being the second largest part) will
+    // fit in this 10000k slush fund.
+    size_t size_estimate = 10000;
+
+    auto& md = conn_ctx.server().metadata_cache().all_topics_metadata();
+
+    for (auto& [tp_ns, topic_metadata] : md) {
+        // metadata_response_topic
+        size_estimate += sizeof(kafka::metadata_response_topic);
+        size_estimate += tp_ns.tp().size();
+
+        using partition = kafka::metadata_response_partition;
+
+        // Base number of bytes needed to represent each partition, ignoring the
+        // variable part attributable to the replica count, we just take as the
+        // size of the partition response structure.
+        constexpr size_t bytes_per_partition = sizeof(partition);
+
+        // Then, we need the number of additional bytes per replica, per
+        // partition, associated with storing the replica list in
+        // metadata_response_partition::replicas/isr_nodes, which we take to
+        // be the size of the elements in those lists (4 bytes each).
+        constexpr size_t bytes_per_replica = sizeof(partition::replica_nodes[0])
+                                             + sizeof(partition::isr_nodes[0]);
+
+        // The actual partition and replica count for this topic.
+        int32_t pcount = topic_metadata.get_configuration().partition_count;
+        int32_t rcount = topic_metadata.get_configuration().replication_factor;
+
+        size_estimate += pcount
+                         * (bytes_per_partition + bytes_per_replica * rcount);
+    }
+
+    // Finally, we double the estimate, because the highwater mark for memory
+    // use comes when the in-memory structures (metadata_response_data and
+    // subobjects) exist on the heap and they are encoded into the response,
+    // which will also exist on the heap. The calculation above handles the
+    // first size, and the encoded response ends up being very similar in size,
+    // so we double the estimate to account for both.
+    size_estimate *= 2;
+
+    // We still add on the default_estimate to handle the size of the request
+    // itself and miscellaneous other processing (this is a small adjustment,
+    // generally ~8000 bytes).
+    return default_memory_estimate(request_size) + size_estimate;
+}
 } // namespace kafka
diff --git a/src/v/kafka/server/handlers/metadata.h b/src/v/kafka/server/handlers/metadata.h
index 8d2336218e5b8..bd0e78bb70039 100644
--- a/src/v/kafka/server/handlers/metadata.h
+++ b/src/v/kafka/server/handlers/metadata.h
@@ -14,6 +14,17 @@
 
 namespace kafka {
 
-using metadata_handler = single_stage_handler<metadata_api, 0, 7>;
+/**
+ * Estimate the size of a metadata request.
+ *
+ * Metadata requests are generally very small (a request for *all* metadata
+ * about a cluster is less than 30 bytes) but the response may be very large, so
+ * the default estimator is unsuitable. See the implementation for further
+ * notes.
+ */
+memory_estimate_fn metadata_memory_estimator;
+
+using metadata_handler
+  = single_stage_handler<metadata_api, 0, 7, metadata_memory_estimator>;
 
-}
+} // namespace kafka

From 6cb5a71e009803a818400db6ad6336b01b24171b Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Thu, 14 Jul 2022 13:38:51 -0700
Subject: [PATCH 135/201] Include broker list in metadata estimation

Prior to this change, we used only the topic and partition data to estimate
the size of the metadata response. Now, we also include the approximate
size of the broker metadata portion of the response, which may
be important if the list of brokers is very large or they have very long
hostnames.
---
 src/v/kafka/server/handlers/metadata.cc | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/v/kafka/server/handlers/metadata.cc b/src/v/kafka/server/handlers/metadata.cc
index 92e034fb4af87..2667b11176f56 100644
--- a/src/v/kafka/server/handlers/metadata.cc
+++ b/src/v/kafka/server/handlers/metadata.cc
@@ -14,6 +14,7 @@
 #include "cluster/types.h"
 #include "config/configuration.h"
 #include "config/node_config.h"
+#include "kafka/protocol/schemata/metadata_response.h"
 #include "kafka/server/errors.h"
 #include "kafka/server/fwd.h"
 #include "kafka/server/handlers/details/leader_epoch.h"
@@ -446,9 +447,21 @@ metadata_memory_estimator(size_t request_size, connection_context& conn_ctx) {
     // fit in this 10000k slush fund.
     size_t size_estimate = 10000;
 
-    auto& md = conn_ctx.server().metadata_cache().all_topics_metadata();
+    auto& md_cache = conn_ctx.server().metadata_cache();
 
-    for (auto& [tp_ns, topic_metadata] : md) {
+    // The size will vary with the number of brokers, though this effect is
+    // probably small if there are large numbers of partitions
+
+    // This covers the variable part of the broker response, i.e., the broker
+    // hostname + rack. We just hope these are less than this amount, because we
+    // don't want to execute the relatively complex logic to guess the listener
+    // just for the size estimate.
+    constexpr size_t extra_bytes_per_broker = 200;
+    size_estimate
+      += md_cache.all_brokers().size()
+         * (sizeof(metadata_response_broker) + extra_bytes_per_broker);
+
+    for (auto& [tp_ns, topic_metadata] : md_cache.all_topics_metadata()) {
         // metadata_response_topic
         size_estimate += sizeof(kafka::metadata_response_topic);
         size_estimate += tp_ns.tp().size();

From f27d25af1e01ee04f6409fb03084340bea7be70a Mon Sep 17 00:00:00 2001
From: Rogger Vasquez <rvasque3@gmail.com>
Date: Thu, 14 Jul 2022 15:42:32 -0500
Subject: [PATCH 136/201] rpk: make redpanda_checkers run in order

---
 src/go/rpk/pkg/tuners/check.go | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/go/rpk/pkg/tuners/check.go b/src/go/rpk/pkg/tuners/check.go
index 453018698b854..cf7a10a7f0ed2 100644
--- a/src/go/rpk/pkg/tuners/check.go
+++ b/src/go/rpk/pkg/tuners/check.go
@@ -30,7 +30,16 @@ func Check(
 		return results, err
 	}
 
-	for _, checkers := range checkersMap {
+	// We use a sorted list of the checker's ID present in the checkersMap to
+	// run in a consistent order.
+	var ids []int
+	for id := range checkersMap {
+		ids = append(ids, int(id))
+	}
+	sort.Ints(ids)
+
+	for _, id := range ids {
+		checkers := checkersMap[CheckerID(id)]
 		for _, c := range checkers {
 			result := c.Check()
 			if result.Err != nil {

From f649f7f7f46abd37fa413e5d36ecf032976d3ab9 Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Thu, 14 Jul 2022 14:26:23 -0700
Subject: [PATCH 137/201] Add noexcept to handler_for_key.

---
 src/v/kafka/server/handlers/handler_interface.cc | 2 +-
 src/v/kafka/server/handlers/handler_interface.h  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/v/kafka/server/handlers/handler_interface.cc b/src/v/kafka/server/handlers/handler_interface.cc
index 156f305bb51b1..0551cef7ffb85 100644
--- a/src/v/kafka/server/handlers/handler_interface.cc
+++ b/src/v/kafka/server/handlers/handler_interface.cc
@@ -125,7 +125,7 @@ constexpr auto make_lut(type_list<Ts...>) {
     return lut;
 }
 
-std::optional<handler> handler_for_key(kafka::api_key key) {
+std::optional<handler> handler_for_key(kafka::api_key key) noexcept {
     static constexpr auto lut = make_lut(request_types{});
     if (key >= (short)0 && key < (short)lut.size()) {
         if (auto handler = lut[key]) {
diff --git a/src/v/kafka/server/handlers/handler_interface.h b/src/v/kafka/server/handlers/handler_interface.h
index 6f5b368984c84..dc4857e9f9933 100644
--- a/src/v/kafka/server/handlers/handler_interface.h
+++ b/src/v/kafka/server/handlers/handler_interface.h
@@ -106,6 +106,6 @@ using handler = const handler_interface*;
  * @param key the API key for the handler
  * @return std::optional<handler> the handler, if any
  */
-std::optional<handler> handler_for_key(api_key key);
+std::optional<handler> handler_for_key(api_key key) noexcept;
 
 } // namespace kafka

From 1bd2a548fa751624c8cebc7fd3f3093db6fbd211 Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Thu, 14 Jul 2022 22:29:40 +0100
Subject: [PATCH 138/201] controller/probe: Disable metrics on stop()

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/cluster/controller_probe.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/v/cluster/controller_probe.cc b/src/v/cluster/controller_probe.cc
index e930c239decd9..6063b322757a1 100644
--- a/src/v/cluster/controller_probe.cc
+++ b/src/v/cluster/controller_probe.cc
@@ -49,6 +49,7 @@ void controller_probe::start() {
 }
 
 void controller_probe::stop() {
+    _public_metrics.reset();
     _controller._raft_manager.local().unregister_leadership_notification(
       _leadership_notification_handle);
 }

From fd6353f40efc22e433e79b14aa0fabd2e68cabb9 Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@vectorized.io>
Date: Thu, 14 Jul 2022 14:30:19 -0700
Subject: [PATCH 139/201] Add NOLINT to use of operator[]

We already check the bounds so the cpp core guideline presumably
does not apply and we don't want to pay the price for the additional
bounds check inside at().
---
 src/v/kafka/server/handlers/handler_interface.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/v/kafka/server/handlers/handler_interface.cc b/src/v/kafka/server/handlers/handler_interface.cc
index 0551cef7ffb85..44593b0f96e42 100644
--- a/src/v/kafka/server/handlers/handler_interface.cc
+++ b/src/v/kafka/server/handlers/handler_interface.cc
@@ -128,6 +128,9 @@ constexpr auto make_lut(type_list<Ts...>) {
 std::optional<handler> handler_for_key(kafka::api_key key) noexcept {
     static constexpr auto lut = make_lut(request_types{});
     if (key >= (short)0 && key < (short)lut.size()) {
+        // We have already checked the bounds above so it is safe to use []
+        // instead of at()
+        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index)
         if (auto handler = lut[key]) {
             return handler;
         }

From e5090d25a7dd6acb9e1d00448aef1eac2ffd0144 Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Thu, 14 Jul 2022 22:30:34 +0100
Subject: [PATCH 140/201] controller/probe: Prefer the public API

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/cluster/controller_probe.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/v/cluster/controller_probe.cc b/src/v/cluster/controller_probe.cc
index 6063b322757a1..44c0fcd78f860 100644
--- a/src/v/cluster/controller_probe.cc
+++ b/src/v/cluster/controller_probe.cc
@@ -87,7 +87,7 @@ void controller_probe::setup_metrics() {
           "partitions",
           [this] {
               const auto& leaders_table
-                = _controller._partition_leaders.local();
+                = _controller.get_partition_leaders().local();
 
               auto partitions_count = 0;
               leaders_table.for_each_leader(
@@ -102,7 +102,7 @@ void controller_probe::setup_metrics() {
           "unavailable_partitions",
           [this] {
               const auto& leaders_table
-                = _controller._partition_leaders.local();
+                = _controller.get_partition_leaders().local();
               auto unavailable_partitions_count = 0;
 
               leaders_table.for_each_leader([&unavailable_partitions_count](

From 48ab0af9867b04ad911d24014779fbd94cd3748c Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Thu, 14 Jul 2022 22:31:27 +0100
Subject: [PATCH 141/201] controller/probe: Avoid a null raft0

This avoids a known crash

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/cluster/controller_probe.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/v/cluster/controller_probe.cc b/src/v/cluster/controller_probe.cc
index 44c0fcd78f860..1c2ad9a4bc1bd 100644
--- a/src/v/cluster/controller_probe.cc
+++ b/src/v/cluster/controller_probe.cc
@@ -36,7 +36,7 @@ void controller_probe::start() {
           std::optional<model::node_id> leader_id) {
             // We are only interested in notifications regarding the controller
             // group.
-            if (_controller._raft0->group() != group) {
+            if (!_controller._raft0 || _controller._raft0->group() != group) {
                 return;
             }
 

From 54f650e2d02641e8b24b2d09eda4893c875c6bde Mon Sep 17 00:00:00 2001
From: Andrew Wong <awong@redpanda.com>
Date: Tue, 12 Jul 2022 21:15:44 -0700
Subject: [PATCH 142/201] tests: re-use installs in upgrade tests

Tests that use the RedpandaInstaller are bandwidth-intensive and often
take several minutes to complete on account of the download of hundreds
of MBs worth of tarballs.

This commit mitigates this in local ducktape by having all test
containers share a single bind mount. The installer now uses a lock file
to prevent concurrent operations on the mount (e.g. when downloading
binaries, checking to see what binaries exist, etc).

With this commit, regardless of whether in local or clustered ducktape,
we also no longer get rid of downloaded binaries between test runs.
Instead, after a test completes, we just revert any changes to the
original binaries, and leave the rest be.
---
 tests/docker/docker-compose.yml             |   1 +
 tests/rptest/services/redpanda.py           |  24 +-
 tests/rptest/services/redpanda_installer.py | 291 ++++++++++++++------
 tests/rptest/tests/fix_5355_upgrade_test.py |   1 -
 tests/rptest/tests/redpanda_test.py         |   3 -
 tests/rptest/tests/upgrade_test.py          |   8 +-
 6 files changed, 220 insertions(+), 108 deletions(-)

diff --git a/tests/docker/docker-compose.yml b/tests/docker/docker-compose.yml
index c4687c13e74d7..14de928b1ba02 100644
--- a/tests/docker/docker-compose.yml
+++ b/tests/docker/docker-compose.yml
@@ -42,5 +42,6 @@ services:
     - minio
     volumes:
     - '${BUILD_ROOT}:${BUILD_ROOT}'
+    - '${BUILD_ROOT}/redpanda_installs:/opt/redpanda_installs'
     networks:
     - redpanda-test
diff --git a/tests/rptest/services/redpanda.py b/tests/rptest/services/redpanda.py
index e7109b4fe857b..f132ea62b92bc 100644
--- a/tests/rptest/services/redpanda.py
+++ b/tests/rptest/services/redpanda.py
@@ -447,7 +447,6 @@ def __init__(self,
                  environment: Optional[dict[str, str]] = None,
                  security: SecurityConfig = SecurityConfig(),
                  node_ready_timeout_s=None,
-                 enable_installer=False,
                  superuser: Optional[SaslCredentials] = None):
         super(RedpandaService, self).__init__(context, num_nodes=num_brokers)
         self._context = context
@@ -456,9 +455,7 @@ def __init__(self,
         self._enable_pp = enable_pp
         self._enable_sr = enable_sr
         self._security = security
-        self._installer: Optional[RedpandaInstaller] = None
-        if enable_installer:
-            self._installer = RedpandaInstaller(self)
+        self._installer: RedpandaInstaller = RedpandaInstaller(self)
 
         if superuser is None:
             superuser = self.SUPERUSER_CREDENTIALS
@@ -601,7 +598,7 @@ def start(self, nodes=None, clean_nodes=True, start_si=True):
                     # Expected usage is that we may install new binaries before
                     # starting the cluster, and installation-cleaning happened
                     # when we started the installer.
-                    self.clean_node(node, clean_installs=False)
+                    self.clean_node(node, preserve_current_install=True)
                 else:
                     self.logger.debug("%s: skip cleaning node" %
                                       self.who_am_i(node))
@@ -1134,7 +1131,7 @@ def decode_backtraces(self):
                 self.logger.exception("Failed to run seastar-addr2line")
 
     def rp_install_path(self):
-        if self._installer and self._installer._started:
+        if self._installer._started:
             # The installer sets up binaries to always use /opt/redpanda.
             return "/opt/redpanda"
         return self._context.globals.get("rp_install_path_root", None)
@@ -1207,7 +1204,10 @@ def clean(self, **kwargs):
         if self._s3client:
             self.delete_bucket_from_si()
 
-    def clean_node(self, node, preserve_logs=False, clean_installs=True):
+    def clean_node(self,
+                   node,
+                   preserve_logs=False,
+                   preserve_current_install=False):
         # These are allow_fail=True to allow for a race where kill_process finds
         # the PID, but then the process has died before it sends the SIGKILL.  This
         # should be safe against actual failures to of the process to stop, because
@@ -1235,9 +1235,11 @@ def clean_node(self, node, preserve_logs=False, clean_installs=True):
                 self.EXECUTABLE_SAVE_PATH):
             node.account.remove(self.EXECUTABLE_SAVE_PATH)
 
-        if clean_installs and self._installer is not None:
-            # Get rid of any installed packages.
-            self._installer.clean(node)
+        if not preserve_current_install or not self._installer._started:
+            # Reset the binaries to use the original binaries.
+            # NOTE: if the installer hasn't been started, there is no
+            # installation to preserve!
+            self._installer.reset_current_install([node])
 
     def remove_local_data(self, node):
         node.account.remove(f"{RedpandaService.PERSISTENT_ROOT}/data/*")
@@ -1728,7 +1730,7 @@ def save_executable(self):
         # Any node will do. Even in a mixed-version upgrade test, we should
         # still have the original binaries available.
         node = self.nodes[0]
-        if self._installer and self._installer._started:
+        if self._installer._started:
             head_root_path = self._installer.path_for_version(
                 RedpandaInstaller.HEAD)
             binary = f"{head_root_path}/libexec/redpanda"
diff --git a/tests/rptest/services/redpanda_installer.py b/tests/rptest/services/redpanda_installer.py
index e9ab2c2f2cbbf..445d6e9fe8f61 100644
--- a/tests/rptest/services/redpanda_installer.py
+++ b/tests/rptest/services/redpanda_installer.py
@@ -7,8 +7,11 @@
 # the Business Source License, use of this software will be governed
 # by the Apache License, Version 2.0
 
+import errno
+import os
 import re
 import requests
+
 from ducktape.utils.util import wait_until
 
 # Match any version that may result from a redpanda binary, which may not be a
@@ -49,9 +52,19 @@ class RedpandaInstaller:
     # Represents the binaries installed at the time of the call to start(). It
     # is expected that this is identical across all nodes initially.
     HEAD = "head"
+
+    # Directory to which binaries are downloaded.
+    #
+    # In local deployments it is expected that this is shared by all nodes in a
+    # cluster, and that directories therein are only ever created (never
+    # deleted) during the lifetime of the RedpandaInstaller.
     INSTALLER_ROOT = "/opt/redpanda_installs"
     TGZ_URL_TEMPLATE = "https://packages.vectorized.io/qSZR7V26sJx7tCXe/redpanda/raw/names/redpanda-{arch}/versions/{version}/redpanda-{version}-{arch}.tar.gz"
 
+    # File path to be used for locking to prevent multiple local test processes
+    # from operating on the same volume mounts.
+    INSTALLER_LOCK_PATH = f"{INSTALLER_ROOT}/install_lock"
+
     @staticmethod
     def root_for_version(version):
         """
@@ -78,16 +91,96 @@ def __init__(self, redpanda):
         """
         self._started = False
         self._redpanda = redpanda
-        self._installed_per_node = dict()
 
-        # Keep track if the original install path is /opt/redpanda is used, as
-        # is the case for package-deployed clusters. Since the installer uses
-        # this directory, we'll need to be mindful not to mess with the
-        # original binaries.
+        # Keep track if the original install path is /opt/redpanda, as is the
+        # case for package-deployed clusters. Since the installer uses this
+        # directory, we'll need to be mindful not to mess with the original
+        # binaries.
         rp_install_path_root = self._redpanda._context.globals.get(
             "rp_install_path_root", None)
         self._head_backed_up = rp_install_path_root == "/opt/redpanda"
 
+        # Whether the nodes are expected to share a single mounted volume for
+        # their installs. If so, care should be taken to coordinate operations
+        # on the installer root.
+        self._nodes_share_installs = rp_install_path_root != "/opt/redpanda"
+
+        # File descriptor used to coordinate access to the installer root when
+        # multiple test processes are running on the same machine.
+        # Must be acquired when operating on the contents of the installer root
+        # (i.e. root_for_version(), etc).
+        self._install_lock_fd = None
+
+    def _acquire_install_lock(self, timeout_sec=600):
+        """
+        Attempt to take the install lock, preventing other test processes from
+        operating an installer.
+
+        Serves to prevent concurrent operations to the same local mountpoint.
+        """
+        if not self._nodes_share_installs:
+            self._redpanda.logger.debug(
+                "Nodes don't share installs; no locking needed")
+            return
+
+        def _lock():
+            try:
+                self._redpanda.logger.debug(
+                    f"Acquiring install lock {self.INSTALLER_LOCK_PATH}")
+                fd = os.open(self.INSTALLER_LOCK_PATH,
+                             os.O_CREAT | os.O_EXCL | os.O_RDWR)
+                self._install_lock_fd = fd
+            except OSError as e:
+                if e.errno != errno.EEXIST:
+                    raise
+                # Another process holds the lock.
+                return False
+            return True
+
+        wait_until(lambda: _lock(), timeout_sec=timeout_sec)
+        self._redpanda.logger.debug(
+            f"Acquired install lock {self.INSTALLER_LOCK_PATH}")
+
+    def _release_install_lock(self):
+        """
+        Releases the install lock, allowing other test processes running
+        locally to perform downloads.
+        """
+        if not self._nodes_share_installs:
+            self._redpanda.logger.debug(
+                "Nodes don't share installs; no locking needed")
+            return
+
+        if not self._install_lock_fd:
+            self._redpanda.logger.debug("Installer lock not held")
+            return True
+        os.close(self._install_lock_fd)
+        os.unlink(self.INSTALLER_LOCK_PATH)
+        self._redpanda.logger.debug("Released install lock")
+
+    def _setup_head_roots_unlocked(self):
+        """
+        Sets up the head roots on each node such that they contain or point to
+        the original binaries installed at 'rp_install_path_root'.
+
+        Expects that the install lock has been acquired before calling.
+        """
+        nodes = self._redpanda.nodes
+        head_root_path = RedpandaInstaller.root_for_version(
+            RedpandaInstaller.HEAD)
+        rp_install_path_root = self._redpanda._context.globals.get(
+            "rp_install_path_root", None)
+        for node in nodes:
+            # Always end up with binaries at 'head_root_path', so we can
+            # continue to use root_for_version() to reference the head root.
+            cmd = None
+            if self._head_backed_up:
+                cmd = f"mv /opt/redpanda {head_root_path}"
+            elif not node.account.exists(head_root_path):
+                cmd = f"ln -s {rp_install_path_root} {head_root_path}"
+            if cmd:
+                node.account.ssh_output(cmd)
+
     def start(self):
         """
         Validates that all nodes in the service have installed the same
@@ -97,6 +190,9 @@ def start(self):
         if self._started:
             return
 
+        # In case a previous test was aborted, do some cleanup.
+        self.reset_current_install(self._redpanda.nodes)
+
         initial_version = None
         nodes = self._redpanda.nodes
 
@@ -107,37 +203,23 @@ def start(self):
                 initial_version = vers
             assert initial_version == vers, \
                 f"Mismatch version {node.account.hostname} has {vers}, {nodes[0].account.hostname} has {initial_version}"
+            node.account.ssh_output(f"mkdir -p {self.INSTALLER_ROOT}")
 
-        # Clean up the installer root directory so we start out clean.
-        for node in nodes:
-            if node.account.exists(RedpandaInstaller.INSTALLER_ROOT):
-                node.account.remove(f"{RedpandaInstaller.INSTALLER_ROOT}/*",
-                                    allow_fail=True)
-            else:
-                node.account.mkdir(RedpandaInstaller.INSTALLER_ROOT)
+        try:
+            self._acquire_install_lock()
+            self._setup_head_roots_unlocked()
+        finally:
+            self._release_install_lock()
 
-        # Now that we're at a sane starting point, set up our install path for
-        # ease of jumping between versions.
+        # Start out pointing /opt/redpanda at the current installation.
         ssh_setup_head_per_node = dict()
-        head_root_path = RedpandaInstaller.root_for_version(
-            RedpandaInstaller.HEAD)
-        rp_install_path_root = self._redpanda._context.globals.get(
-            "rp_install_path_root", None)
+        head_root_path = self.root_for_version(RedpandaInstaller.HEAD)
         for node in nodes:
-            # For simplicity's sake, always end up with binaries at
-            # 'head_root_path', so we can continue to use root_for_version() to
-            # reference the head root.
-            head_cmd = ""
-            if self._head_backed_up:
-                head_cmd = f"mv /opt/redpanda {head_root_path}"
-            else:
-                head_cmd = f"ln -s {rp_install_path_root} {head_root_path}"
-
-            cmd = f"{head_cmd} && ln -s {head_root_path} /opt/redpanda"
-            ssh_setup_head_per_node[node] = node.account.ssh_capture(cmd)
-            self._installed_per_node[node] = set()
+            if not node.account.exists("/opt/redpanda"):
+                cmd = f"ln -s {head_root_path} /opt/redpanda"
+                ssh_setup_head_per_node[node] = node.account.ssh_capture(cmd)
         self.wait_for_async_ssh(self._redpanda.logger, ssh_setup_head_per_node,
-                                "Setting up head binaries")
+                                "Setting up /opt/redpanda")
 
         def int_tuple(str_tuple):
             return (int(str_tuple[0]), int(str_tuple[1]), int(str_tuple[2]))
@@ -178,8 +260,8 @@ def highest_from_prior_feature_version(self, version):
 
     def install(self, nodes, version):
         """
-        Installs the release on the given node such that the next time the node
-        is restarted, it will use the newly installed bits.
+        Installs the release on the given nodes such that the next time the
+        nodes are restarted, they will use the newly installed bits.
 
         TODO: abstract 'version' into a more generic installation that doesn't
         necessarily correspond to a released version. E.g. a custom build
@@ -187,64 +269,99 @@ def install(self, nodes, version):
         """
         if not self._started:
             self.start()
+
+        try:
+            self._acquire_install_lock()
+            self._install_unlocked(nodes, version)
+        finally:
+            self._release_install_lock()
+
+    def _install_unlocked(self, nodes, version):
+        """
+        Like above but expects the install lock to have been taken before
+        calling.
+        """
         assert version == RedpandaInstaller.HEAD or version in self._released_versions, \
             f"Can't find installation for {version}"
-        ssh_install_per_node = dict()
+        version_root = self.root_for_version(version)
+
+        nodes_to_download = nodes
+        if self._nodes_share_installs:
+            nodes_to_download = [nodes[0]]
+
+        ssh_download_per_node = dict()
+        for node in nodes_to_download:
+            if not version == RedpandaInstaller.HEAD and not node.account.exists(
+                    version_root):
+                ssh_download_per_node[
+                    node] = self._async_download_on_node_unlocked(
+                        node, version)
+        self.wait_for_async_ssh(self._redpanda.logger, ssh_download_per_node,
+                                "Finished downloading binaries")
+
+        # Regardless of whether we downloaded anything, adjust the
+        # /opt/redpanda link to point to the appropriate version on all nodes.
+        relink_cmd = f"unlink /opt/redpanda && ln -s {version_root} /opt/redpanda"
         for node in nodes:
-            # If we already have this version installed, just adjust the
-            # symlinks.
-            version_root = self.root_for_version(version)
-            relink_cmd = f"unlink /opt/redpanda && ln -s {version_root} /opt/redpanda"
-            if version == RedpandaInstaller.HEAD or version in self._installed_per_node[
-                    node]:
-                ssh_install_per_node[node] = node.account.ssh_capture(
-                    relink_cmd)
-                continue
-
-            arch = "amd64"
-            uname = str(node.account.ssh_output("uname -m"))
-            if "aarch" in uname or "arm" in uname:
-                arch = "arm64"
-            self._redpanda.logger.debug(
-                f"{node.account.hostname} uname output: {uname}")
-
-            self._installed_per_node[node].add(version)
-            url = RedpandaInstaller.TGZ_URL_TEMPLATE.format( \
-                arch=arch, version=f"{version[0]}.{version[1]}.{version[2]}")
-            tgz = "redpanda.tar.gz"
-            cmd = f"curl -fsSL {url} --create-dir --output-dir {version_root} -o {tgz} && gunzip -c {version_root}/{tgz} | tar -xf - -C {version_root} && rm {version_root}/{tgz} && {relink_cmd}"
-            ssh_install_per_node[node] = node.account.ssh_capture(cmd)
+            node.account.ssh_output(relink_cmd)
 
-        self.wait_for_async_ssh(self._redpanda.logger, ssh_install_per_node,
-                                "Finished installing binaries")
-
-    def clean(self, node):
+    def _async_download_on_node_unlocked(self, node, version):
         """
-        Cleans the node such that only the original installation remains.
+        Asynchronously downloads Redpanda of the given version on the given
+        node. Returns an iterator to the results.
 
-        This should only be called once there is no longer a need to run the
-        RedpandaService.
+        Expects the install lock to have been taken before calling.
         """
-        if not self._started:
-            self._redpanda.logger.debug(
-                "Ignoring cleanup, installer not started")
-            return
-
-        # Allow failures so the entire cleanup can proceed even on failure.
-        head_root_path = RedpandaInstaller.root_for_version(
-            RedpandaInstaller.HEAD)
-        if self._head_backed_up:
-            cmd = f"unlink /opt/redpanda && mv {head_root_path} /opt/redpanda"
-            node.account.ssh(cmd, allow_fail=True)
-        else:
-            cmd = f"unlink /opt/redpanda && unlink {head_root_path}"
-            node.account.ssh(cmd, allow_fail=True)
-
-        # Also clean up all the downloaded published binaries.
-        roots_to_rm = [
-            RedpandaInstaller.root_for_version(v)
-            for v in self._installed_per_node[node]
-        ]
-        if len(roots_to_rm) == 0:
-            return
-        node.account.remove(' '.join(roots_to_rm), allow_fail=True)
+        version_root = self.root_for_version(version)
+        arch = "amd64"
+        uname = str(node.account.ssh_output("uname -m"))
+        if "aarch" in uname or "arm" in uname:
+            arch = "arm64"
+        self._redpanda.logger.debug(
+            f"{node.account.hostname} uname output: {uname}")
+
+        url = RedpandaInstaller.TGZ_URL_TEMPLATE.format( \
+            arch=arch, version=f"{version[0]}.{version[1]}.{version[2]}")
+        tgz = "redpanda.tar.gz"
+        cmd = f"curl -fsSL {url} --create-dir --output-dir {version_root} -o {tgz} && gunzip -c {version_root}/{tgz} | tar -xf - -C {version_root} && rm {version_root}/{tgz}"
+        return node.account.ssh_capture(cmd)
+
+    def reset_current_install(self, nodes):
+        """
+        WARNING: should not be used to upgrade to the originally installed
+        binaries; use 'install(RedpandaInstaller.HEAD)' for that. This should
+        only be used to clean up a node to its expected starting state (the
+        state of the world before the first call to 'start()').
+
+        Resets any /opt/redpanda symlink to instead be real binaries if they
+        exist. This is a best-effort attempt to revert the installs to their
+        original state (i.e. the state before installing other versions).
+
+        Upon returning, either:
+        - this is a packaged deployment (CDT) and we are left with a real
+          /opt/redpanda directory (not a symlink) if possible, or
+        - this is a local deployment and we are left with no links to head
+          binaries
+        """
+        head_root_path = self.root_for_version(RedpandaInstaller.HEAD)
+        for node in nodes:
+            host = node.account.hostname
+            if self._head_backed_up:
+                assert not self._nodes_share_installs
+                # NOTE: no locking required since installs aren't shared.
+                head_root_path_exists = node.account.exists(head_root_path)
+                opt_redpanda_exists = node.account.exists("/opt/redpanda")
+                if opt_redpanda_exists:
+                    if not node.account.islink("/opt/redpanda"):
+                        assert not head_root_path_exists, \
+                            f"{host}: {head_root_path} exists and /opt/redpanda exists but is not a link; unclear which to use"
+                        continue
+                    node.account.ssh_output("unlink /opt/redpanda",
+                                            allow_fail=True)
+
+                assert head_root_path_exists, f"{host}: neither {head_root_path} nor /opt/redpanda exists"
+                node.account.ssh_output(f"mv {head_root_path} /opt/redpanda",
+                                        allow_fail=True)
+            else:
+                node.account.ssh_output("unlink /opt/redpanda",
+                                        allow_fail=True)
diff --git a/tests/rptest/tests/fix_5355_upgrade_test.py b/tests/rptest/tests/fix_5355_upgrade_test.py
index 455154950163d..d7776299ba7df 100644
--- a/tests/rptest/tests/fix_5355_upgrade_test.py
+++ b/tests/rptest/tests/fix_5355_upgrade_test.py
@@ -39,7 +39,6 @@ def __init__(self, test_context):
         }
         super(Fix5355UpgradeTest, self).__init__(test_context=test_context,
                                                  num_brokers=3,
-                                                 enable_installer=True,
                                                  extra_rp_conf=extra_rp_conf)
         self.installer = self.redpanda._installer
 
diff --git a/tests/rptest/tests/redpanda_test.py b/tests/rptest/tests/redpanda_test.py
index 3b69e5b44bb3a..f81f7c75b1a22 100644
--- a/tests/rptest/tests/redpanda_test.py
+++ b/tests/rptest/tests/redpanda_test.py
@@ -34,7 +34,6 @@ def __init__(self,
                  enable_pp=False,
                  enable_sr=False,
                  si_settings=None,
-                 enable_installer=False,
                  **kwargs):
         """
         Any trailing keyword arguments are passed through to the
@@ -43,7 +42,6 @@ def __init__(self,
         super(RedpandaTest, self).__init__(test_context)
         self.scale = Scale(test_context)
         self.si_settings = si_settings
-        self.enable_installer = enable_installer
 
         if num_brokers is None:
             # Default to a 3 node cluster if sufficient nodes are available, else
@@ -65,7 +63,6 @@ def __init__(self,
                                         enable_pp=enable_pp,
                                         enable_sr=enable_sr,
                                         si_settings=self.si_settings,
-                                        enable_installer=enable_installer,
                                         **kwargs)
         self._client = DefaultClient(self.redpanda)
 
diff --git a/tests/rptest/tests/upgrade_test.py b/tests/rptest/tests/upgrade_test.py
index 7745e1ebccaf5..5da4b85973638 100644
--- a/tests/rptest/tests/upgrade_test.py
+++ b/tests/rptest/tests/upgrade_test.py
@@ -23,9 +23,7 @@ class UpgradeFromSpecificVersion(RedpandaTest):
     """
     def __init__(self, test_context):
         super(UpgradeFromSpecificVersion,
-              self).__init__(test_context=test_context,
-                             num_brokers=3,
-                             enable_installer=True)
+              self).__init__(test_context=test_context, num_brokers=3)
         self.installer = self.redpanda._installer
 
     def setUp(self):
@@ -69,9 +67,7 @@ class UpgradeFromPriorFeatureVersionTest(RedpandaTest):
     """
     def __init__(self, test_context):
         super(UpgradeFromPriorFeatureVersionTest,
-              self).__init__(test_context=test_context,
-                             num_brokers=1,
-                             enable_installer=True)
+              self).__init__(test_context=test_context, num_brokers=1)
         self.installer = self.redpanda._installer
 
     def setUp(self):

From cd3e8c3e5d18c74aff4b3a09e256082d1adf975f Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Tue, 5 Jul 2022 11:03:07 +0200
Subject: [PATCH 143/201] s/segment: fixed typos

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/storage/segment.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/v/storage/segment.cc b/src/v/storage/segment.cc
index 21f5f63e5be49..4f96aa6667bab 100644
--- a/src/v/storage/segment.cc
+++ b/src/v/storage/segment.cc
@@ -179,7 +179,7 @@ ss::future<> segment::release_appender(readers_cache* readers_cache) {
      * An exception safe variant of try write lock is simulated since seastar
      * does not have such primitives available on the semaphore. The fast path
      * of try_write_lock is combined with immediately releasing the lock (which
-     * will not also not signal any waiters--there cannot be any!) to guarnatee
+     * will also not signal any waiters--there cannot be any!) to guarantee
      * that the blocking get_units version will find the lock uncontested.
      *
      * TODO: we should upstream get_units try-variants for semaphore and rwlock.
@@ -447,7 +447,7 @@ ss::future<append_result> segment::append(const model::record_batch& b) {
           auto index_err = std::move(index_fut).get_exception();
           vlog(
             stlog.error,
-            "segment::append index: {}. ignorning append: {}",
+            "segment::append index: {}. ignoring append: {}",
             index_err,
             ret);
           return ss::make_exception_future<append_result>(index_err);

From 3877d2ab3a3d113bf3aeb2fe65a266fed9168aa9 Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Tue, 5 Jul 2022 19:02:51 +0200
Subject: [PATCH 144/201] s/compacted_index: improve naming of compacted index
 recovery state

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/storage/compacted_index.h | 28 +++++++++++++++-------
 src/v/storage/segment_utils.cc  | 41 +++++++--------------------------
 src/v/storage/types.cc          | 15 ++++++++++++
 3 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/src/v/storage/compacted_index.h b/src/v/storage/compacted_index.h
index 6906243899134..39bc7b9a2656f 100644
--- a/src/v/storage/compacted_index.h
+++ b/src/v/storage/compacted_index.h
@@ -55,14 +55,23 @@ struct compacted_index {
         }
     };
     enum class recovery_state {
-        // happens during a crash
-        missing,
-        // needs rebuilding - when user 'touch' a file or during a crash
-        needsrebuild,
-        // already recovered - nothing to do - after a reboot
-        recovered,
-        // we need to compact next
-        nonrecovered
+        /**
+         * Index may be missing when it was either deleted or not stored when
+         * redpanda crashed
+         */
+        index_missing,
+        /**
+         * Index may need a rebuild when it is corrupted
+         */
+        index_needs_rebuild,
+        /**
+         * Segment is already compacted
+         */
+        already_compacted,
+        /**
+         * Compaction index is recovered, ready for compaction
+         */
+        index_recovered
     };
     static constexpr size_t footer_size = sizeof(footer::size)
                                           + sizeof(footer::keys)
@@ -83,6 +92,9 @@ struct compacted_index {
         int32_t delta;
     };
 };
+
+std::ostream& operator<<(std::ostream&, compacted_index::recovery_state);
+
 [[gnu::always_inline]] inline compacted_index::footer_flags
 operator|(compacted_index::footer_flags a, compacted_index::footer_flags b) {
     return compacted_index::footer_flags(
diff --git a/src/v/storage/segment_utils.cc b/src/v/storage/segment_utils.cc
index df6eec938b835..c6248d8a0f60f 100644
--- a/src/v/storage/segment_utils.cc
+++ b/src/v/storage/segment_utils.cc
@@ -54,31 +54,6 @@
 #include <fmt/format.h>
 #include <roaring/roaring.hh>
 
-template<>
-struct fmt::formatter<storage::compacted_index::recovery_state> {
-    using recovery_state = storage::compacted_index::recovery_state;
-    constexpr auto parse(format_parse_context& ctx) { return ctx.end(); }
-    template<typename FormatContext>
-    auto format(const recovery_state& s, FormatContext& ctx) const {
-        std::string_view str = "unknown";
-        switch (s) {
-        case recovery_state::missing:
-            str = "missing";
-            break;
-        case recovery_state::needsrebuild:
-            str = "needsrebuild";
-            break;
-        case recovery_state::recovered:
-            str = "recovered";
-            break;
-        case recovery_state::nonrecovered:
-            str = "nonrecovered";
-            break;
-        }
-        return format_to(ctx.out(), "{}", str);
-    }
-};
-
 namespace storage::internal {
 using namespace storage; // NOLINT
 
@@ -306,9 +281,9 @@ ss::future<compacted_index::recovery_state> do_detect_compaction_index_state(
             .then([reader]() mutable { return reader.load_footer(); })
             .then([](compacted_index::footer footer) {
                 if (bool(footer.flags & flags::self_compaction)) {
-                    return compacted_index::recovery_state::recovered;
+                    return compacted_index::recovery_state::already_compacted;
                 }
-                return compacted_index::recovery_state::nonrecovered;
+                return compacted_index::recovery_state::index_recovered;
             })
             .finally([reader]() mutable { return reader.close(); });
       })
@@ -318,7 +293,7 @@ ss::future<compacted_index::recovery_state> do_detect_compaction_index_state(
             "detected error while attempting recovery, {}. marking as 'needs "
             "rebuild'. Common situation during crashes or hard shutdowns.",
             e);
-          return compacted_index::recovery_state::needsrebuild;
+          return compacted_index::recovery_state::index_needs_rebuild;
       });
 }
 
@@ -329,7 +304,7 @@ detect_compaction_index_state(std::filesystem::path p, compaction_config cfg) {
             return do_detect_compaction_index_state(p, cfg);
         }
         return ss::make_ready_future<compacted_index::recovery_state>(
-          compacted_index::recovery_state::missing);
+          compacted_index::recovery_state::index_missing);
     });
 }
 
@@ -560,20 +535,20 @@ ss::future<compaction_result> self_compact_segment(
               compacted_index::recovery_state state) mutable {
           vlog(gclog.trace, "segment {} compaction state: {}", idx_path, state);
           switch (state) {
-          case compacted_index::recovery_state::recovered: {
+          case compacted_index::recovery_state::already_compacted: {
               vlog(gclog.debug, "detected {} is already compacted", idx_path);
               return ss::make_ready_future<compaction_result>(s->size_bytes());
           }
-          case compacted_index::recovery_state::nonrecovered:
+          case compacted_index::recovery_state::index_recovered:
               return do_self_compact_segment(
                        s, cfg, pb, readers_cache, resources)
                 .then([before = s->size_bytes(), &pb](size_t sz_after) {
                     pb.segment_compacted();
                     return compaction_result(before, sz_after);
                 });
-          case compacted_index::recovery_state::missing:
+          case compacted_index::recovery_state::index_missing:
               [[fallthrough]];
-          case compacted_index::recovery_state::needsrebuild: {
+          case compacted_index::recovery_state::index_needs_rebuild: {
               vlog(gclog.info, "Rebuilding index file... ({})", idx_path);
               pb.corrupted_compaction_index();
               return s->read_lock()
diff --git a/src/v/storage/types.cc b/src/v/storage/types.cc
index 65bc010f929be..b3d76868da19a 100644
--- a/src/v/storage/types.cc
+++ b/src/v/storage/types.cc
@@ -9,6 +9,7 @@
 
 #include "storage/types.h"
 
+#include "storage/compacted_index.h"
 #include "storage/ntp_config.h"
 #include "utils/human.h"
 #include "utils/to_string.h"
@@ -169,4 +170,18 @@ std::ostream& operator<<(std::ostream& o, const compaction_result& r) {
     return o;
 }
 
+std::ostream&
+operator<<(std::ostream& o, compacted_index::recovery_state state) {
+    switch (state) {
+    case compacted_index::recovery_state::index_missing:
+        return o << "index_missing";
+    case compacted_index::recovery_state::already_compacted:
+        return o << "already_compacted";
+    case compacted_index::recovery_state::index_needs_rebuild:
+        return o << "index_needs_rebuild";
+    case compacted_index::recovery_state::index_recovered:
+        return o << "index_recovered";
+    }
+    __builtin_unreachable();
+}
 } // namespace storage

From 274d8c2a8c320fae02ccb6c0bdf3c128dbcd7b87 Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Tue, 5 Jul 2022 19:11:23 +0200
Subject: [PATCH 145/201] s/compacted_index: introduced `compaction_key` type

Introduced the `compaction_key` type representing a record key prefixed with
a byte of a record batch type. The introduction of the `compaction_key` type
allows us to distinguish keys that are already prefixed with the batch type
at the index writer API level.

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/storage/compacted_index.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/v/storage/compacted_index.h b/src/v/storage/compacted_index.h
index 39bc7b9a2656f..51f2b6c48f104 100644
--- a/src/v/storage/compacted_index.h
+++ b/src/v/storage/compacted_index.h
@@ -20,6 +20,14 @@
 namespace storage {
 // simple types shared among readers and writers
 
+/**
+ * Type representing a record key prefixed with batch_type
+ */
+struct compaction_key : bytes {
+    explicit compaction_key(bytes b)
+      : bytes(std::move(b)) {}
+};
+
 struct compacted_index {
     static constexpr const size_t max_entry_size = size_t(
       std::numeric_limits<uint16_t>::max());

From 0bd0b4adffae2a0c06ce2abaa9adeda97d19edcf Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Tue, 5 Jul 2022 19:16:35 +0200
Subject: [PATCH 146/201] s/segment: do not store not compactible batches in
 index

We do not need to store batches of raft configuration and archival stm
metadata in the index as they are not compacted anyway.

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/storage/compaction_reducers.cc | 7 +------
 src/v/storage/segment.cc             | 5 +++++
 src/v/storage/segment_utils.h        | 9 +++++++++
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/v/storage/compaction_reducers.cc b/src/v/storage/compaction_reducers.cc
index 6ac4eb8836528..991421c8abcfd 100644
--- a/src/v/storage/compaction_reducers.cc
+++ b/src/v/storage/compaction_reducers.cc
@@ -118,12 +118,7 @@ std::optional<model::record_batch>
 copy_data_segment_reducer::filter(model::record_batch&& batch) {
     // do not compact raft configuration and archival metadata as they shift
     // offset translation
-    if (
-      batch.header().type == model::record_batch_type::raft_configuration
-      || batch.header().type == model::record_batch_type::archival_metadata
-      || batch.header().type == model::record_batch_type::group_abort_tx
-      || batch.header().type == model::record_batch_type::group_commit_tx
-      || batch.header().type == model::record_batch_type::group_prepare_tx) {
+    if (!is_compactible(batch)) {
         return std::move(batch);
     }
 
diff --git a/src/v/storage/segment.cc b/src/v/storage/segment.cc
index 4f96aa6667bab..549b8a7e9e13c 100644
--- a/src/v/storage/segment.cc
+++ b/src/v/storage/segment.cc
@@ -365,6 +365,11 @@ ss::future<> segment::compaction_index_batch(const model::record_batch& b) {
     if (!has_compaction_index()) {
         return ss::now();
     }
+    // do not index not compactible batches
+    if (!internal::is_compactible(b)) {
+        return ss::now();
+    }
+
     if (!b.compressed()) {
         return do_compaction_index_batch(b);
     }
diff --git a/src/v/storage/segment_utils.h b/src/v/storage/segment_utils.h
index 711af52bda1a1..233d343fd3db8 100644
--- a/src/v/storage/segment_utils.h
+++ b/src/v/storage/segment_utils.h
@@ -194,4 +194,13 @@ struct clean_segment_value
     ss::sstring segment_name;
 };
 
+inline bool is_compactible(const model::record_batch& b) {
+    return !(
+      b.header().type == model::record_batch_type::raft_configuration
+      || b.header().type == model::record_batch_type::archival_metadata
+      || b.header().type == model::record_batch_type::group_abort_tx
+      || b.header().type == model::record_batch_type::group_commit_tx
+      || b.header().type == model::record_batch_type::group_prepare_tx);
+}
+
 } // namespace storage::internal

From 2a9d9ab95bdc4550d7f3da60278879f48279077e Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Tue, 5 Jul 2022 19:20:45 +0200
Subject: [PATCH 147/201] s/compacted_index: prefix indexed key with batch type

Added batch type prefix to key stored in compaction index, both on disk
and in memory. This way a key stored in an index is actually a tuple
consisting of (batch_type, key_payload). Thanks to this approach
compaction logic is able to compact keys per batch type instead of
disregarding the batch type completely.

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/storage/compacted_index.h               | 27 ++++-
 src/v/storage/compacted_index_chunk_reader.cc |  3 +-
 src/v/storage/compacted_index_writer.h        | 32 ++++--
 src/v/storage/compaction_reducers.cc          |  8 +-
 src/v/storage/segment.cc                      |  6 +-
 src/v/storage/spill_key_index.cc              | 27 +++--
 src/v/storage/spill_key_index.h               | 13 ++-
 src/v/storage/tests/compaction_idx_bench.cc   |  5 +-
 .../tests/compaction_index_format_tests.cc    | 99 ++++++++++++++-----
 9 files changed, 161 insertions(+), 59 deletions(-)

diff --git a/src/v/storage/compacted_index.h b/src/v/storage/compacted_index.h
index 51f2b6c48f104..05f9325328f6b 100644
--- a/src/v/storage/compacted_index.h
+++ b/src/v/storage/compacted_index.h
@@ -12,6 +12,7 @@
 #pragma once
 #include "bytes/bytes.h"
 #include "model/fundamental.h"
+#include "model/record_batch_types.h"
 
 #include <cstddef>
 #include <cstdint>
@@ -28,6 +29,20 @@ struct compaction_key : bytes {
       : bytes(std::move(b)) {}
 };
 
+inline compaction_key
+prefix_with_batch_type(model::record_batch_type type, bytes_view key) {
+    auto bt_le = ss::cpu_to_le(
+      static_cast<std::underlying_type<model::record_batch_type>::type>(type));
+    auto enriched_key = ss::uninitialized_string<bytes>(
+      sizeof(bt_le) + key.size());
+    auto out = enriched_key.begin();
+    out = std::copy_n(
+      reinterpret_cast<const char*>(&bt_le), sizeof(bt_le), out);
+    std::copy_n(key.begin(), key.size(), out);
+
+    return compaction_key(std::move(enriched_key));
+}
+
 struct compacted_index {
     static constexpr const size_t max_entry_size = size_t(
       std::numeric_limits<uint16_t>::max());
@@ -48,12 +63,17 @@ struct compacted_index {
         self_compaction = 1U << 1U,
     };
     struct footer {
+        // initial version of footer
+        static constexpr int8_t base_version = 0;
+        // introduced a key being a tuple of batch_type and the key content
+        static constexpr int8_t key_prefixed_with_batch_type = 1;
+
         uint32_t size{0};
         uint32_t keys{0};
         footer_flags flags{0};
         uint32_t crc{0}; // crc32
         // version *must* be the last value
-        int8_t version{0};
+        int8_t version{key_prefixed_with_batch_type};
 
         friend std::ostream&
         operator<<(std::ostream& o, const compacted_index::footer& f) {
@@ -88,14 +108,15 @@ struct compacted_index {
                                           + sizeof(footer::version);
     // for the readers and friends
     struct entry {
-        entry(entry_type t, bytes k, model::offset o, int32_t d) noexcept
+        entry(
+          entry_type t, compaction_key k, model::offset o, int32_t d) noexcept
           : type(t)
           , key(std::move(k))
           , offset(o)
           , delta(d) {}
 
         entry_type type;
-        bytes key;
+        compaction_key key;
         model::offset offset;
         int32_t delta;
     };
diff --git a/src/v/storage/compacted_index_chunk_reader.cc b/src/v/storage/compacted_index_chunk_reader.cc
index 0ca818ebe99a6..b113e1c5c9242 100644
--- a/src/v/storage/compacted_index_chunk_reader.cc
+++ b/src/v/storage/compacted_index_chunk_reader.cc
@@ -212,7 +212,8 @@ compacted_index_chunk_reader::load_slice(model::timeout_clock::time_point t) {
                              auto type = reflection::adl<uint8_t>{}.from(p);
                              auto [offset, _1] = p.read_varlong();
                              auto [delta, _2] = p.read_varlong();
-                             auto key = p.read_bytes(p.bytes_left());
+                             auto bytes = p.read_bytes(p.bytes_left());
+                             auto key = compaction_key(std::move(bytes));
                              slice.push_back(compacted_index::entry(
                                compacted_index::entry_type(type),
                                std::move(key),
diff --git a/src/v/storage/compacted_index_writer.h b/src/v/storage/compacted_index_writer.h
index 75f54e594daef..73c6090be00e2 100644
--- a/src/v/storage/compacted_index_writer.h
+++ b/src/v/storage/compacted_index_writer.h
@@ -12,6 +12,7 @@
 #pragma once
 #include "bytes/bytes.h"
 #include "model/fundamental.h"
+#include "model/record_batch_types.h"
 #include "storage/compacted_index.h"
 #include "storage/types.h"
 
@@ -56,18 +57,21 @@ class compacted_index_writer {
         impl& operator=(const impl&) = delete;
 
         virtual ss::future<> index(
-          bytes_view, // convert from bytes which is the key-type in map
+          const compaction_key&, // convert from bytes which is the key-type in
+                                 // map
           model::offset base_offset,
           int32_t offset_delta)
           = 0;
 
         virtual ss::future<> index(
+          model::record_batch_type,
           const iobuf& key, // default format in record batch
           model::offset base_offset,
           int32_t offset_delta)
           = 0;
 
         virtual ss::future<> index(
+          model::record_batch_type,
           bytes&& key, // default format in record batch
           model::offset base_offset,
           int32_t offset_delta)
@@ -92,9 +96,13 @@ class compacted_index_writer {
     explicit compacted_index_writer(std::unique_ptr<impl> i)
       : _impl(std::move(i)) {}
 
-    ss::future<> index(bytes_view, model::offset, int32_t);
-    ss::future<> index(const iobuf& key, model::offset, int32_t);
-    ss::future<> index(bytes&&, model::offset, int32_t);
+    // accepts a compaction_key which is already prefixed with batch_type
+    ss::future<> index(const compaction_key& b, model::offset, int32_t);
+
+    ss::future<>
+    index(model::record_batch_type, const iobuf& key, model::offset, int32_t);
+    ss::future<>
+    index(model::record_batch_type, bytes&&, model::offset, int32_t);
 
     ss::future<> append(compacted_index::entry);
 
@@ -127,16 +135,22 @@ compacted_index_writer::release() && {
     return std::move(_impl);
 }
 inline ss::future<> compacted_index_writer::index(
-  const iobuf& b, model::offset base_offset, int32_t delta) {
-    return _impl->index(b, base_offset, delta);
+  model::record_batch_type batch_type,
+  const iobuf& b,
+  model::offset base_offset,
+  int32_t delta) {
+    return _impl->index(batch_type, b, base_offset, delta);
 }
 inline ss::future<> compacted_index_writer::index(
-  bytes_view b, model::offset base_offset, int32_t delta) {
+  const compaction_key& b, model::offset base_offset, int32_t delta) {
     return _impl->index(b, base_offset, delta);
 }
 inline ss::future<> compacted_index_writer::index(
-  bytes&& b, model::offset base_offset, int32_t delta) {
-    return _impl->index(std::move(b), base_offset, delta);
+  model::record_batch_type batch_type,
+  bytes&& b,
+  model::offset base_offset,
+  int32_t delta) {
+    return _impl->index(batch_type, std::move(b), base_offset, delta);
 }
 inline ss::future<> compacted_index_writer::truncate(model::offset o) {
     return _impl->truncate(o);
diff --git a/src/v/storage/compaction_reducers.cc b/src/v/storage/compaction_reducers.cc
index 991421c8abcfd..391fb21204083 100644
--- a/src/v/storage/compaction_reducers.cc
+++ b/src/v/storage/compaction_reducers.cc
@@ -97,8 +97,7 @@ index_filtered_copy_reducer::operator()(compacted_index::entry&& e) {
     const bool should_add = _bm.contains(_natural_index);
     ++_natural_index;
     if (should_add) {
-        bytes_view bv = e.key;
-        return _writer->index(bv, e.offset, e.delta)
+        return _writer->index(e.key, e.offset, e.delta)
           .then([k = std::move(e.key)] {
               return ss::make_ready_future<stop_t>(stop_t::no);
           });
@@ -294,8 +293,9 @@ index_rebuilder_reducer::operator()(model::record_batch&& b) {
 ss::future<> index_rebuilder_reducer::do_index(model::record_batch&& b) {
     return ss::do_with(std::move(b), [this](model::record_batch& b) {
         return model::for_each_record(
-          b, [this, o = b.base_offset()](model::record& r) {
-              return _w->index(r.key(), o, r.offset_delta());
+          b,
+          [this, bt = b.header().type, o = b.base_offset()](model::record& r) {
+              return _w->index(bt, r.key(), o, r.offset_delta());
           });
     });
 }
diff --git a/src/v/storage/segment.cc b/src/v/storage/segment.cc
index 549b8a7e9e13c..28cd1290bcaea 100644
--- a/src/v/storage/segment.cc
+++ b/src/v/storage/segment.cc
@@ -357,8 +357,10 @@ ss::future<> segment::do_compaction_index_batch(const model::record_batch& b) {
     vassert(!b.compressed(), "wrong method. Call compact_index_batch. {}", b);
     auto& w = compaction_index();
     return model::for_each_record(
-      b, [o = b.base_offset(), &w](const model::record& r) {
-          return w.index(r.key(), o, r.offset_delta());
+      b,
+      [o = b.base_offset(), batch_type = b.header().type, &w](
+        const model::record& r) {
+          return w.index(batch_type, r.key(), o, r.offset_delta());
       });
 }
 ss::future<> segment::compaction_index_batch(const model::record_batch& b) {
diff --git a/src/v/storage/spill_key_index.cc b/src/v/storage/spill_key_index.cc
index 5cd1b8536019a..0feaf59bfdbf6 100644
--- a/src/v/storage/spill_key_index.cc
+++ b/src/v/storage/spill_key_index.cc
@@ -12,6 +12,7 @@
 #include "bytes/bytes.h"
 #include "random/generators.h"
 #include "reflection/adl.h"
+#include "storage/compacted_index.h"
 #include "storage/compacted_index_writer.h"
 #include "storage/logger.h"
 #include "storage/segment_utils.h"
@@ -68,8 +69,8 @@ spill_key_index::~spill_key_index() {
       _midx.size());
 }
 
-ss::future<>
-spill_key_index::index(bytes_view v, model::offset base_offset, int32_t delta) {
+ss::future<> spill_key_index::index(
+  const compaction_key& v, model::offset base_offset, int32_t delta) {
     if (auto it = _midx.find(v); it != _midx.end()) {
         auto& pair = it->second;
         if (base_offset > pair.base_offset) {
@@ -79,10 +80,10 @@ spill_key_index::index(bytes_view v, model::offset base_offset, int32_t delta) {
         return ss::now();
     }
     // not found
-    return add_key(bytes(v), value_type{base_offset, delta});
+    return add_key(v, value_type{base_offset, delta});
 }
 
-ss::future<> spill_key_index::add_key(bytes b, value_type v) {
+ss::future<> spill_key_index::add_key(compaction_key b, value_type v) {
     auto f = ss::now();
     auto const key_size = b.size();
     auto const expected_size = idx_mem_usage() + _keys_mem_usage + key_size;
@@ -121,9 +122,13 @@ ss::future<> spill_key_index::add_key(bytes b, value_type v) {
     });
 }
 
-ss::future<>
-spill_key_index::index(bytes&& b, model::offset base_offset, int32_t delta) {
-    if (auto it = _midx.find(b); it != _midx.end()) {
+ss::future<> spill_key_index::index(
+  model::record_batch_type batch_type,
+  bytes&& b,
+  model::offset base_offset,
+  int32_t delta) {
+    auto key = prefix_with_batch_type(batch_type, b);
+    if (auto it = _midx.find(key); it != _midx.end()) {
         auto& pair = it->second;
         // must use both base+delta, since we only want to keep the latest
         // which might be inserted into the batch multiple times by client
@@ -136,11 +141,15 @@ spill_key_index::index(bytes&& b, model::offset base_offset, int32_t delta) {
         return ss::now();
     }
     // not found
-    return add_key(std::move(b), value_type{base_offset, delta});
+    return add_key(std::move(key), value_type{base_offset, delta});
 }
 ss::future<> spill_key_index::index(
-  const iobuf& key, model::offset base_offset, int32_t delta) {
+  model::record_batch_type batch_type,
+  const iobuf& key,
+  model::offset base_offset,
+  int32_t delta) {
     return index(
+      batch_type,
       iobuf_to_bytes(key), // makes a copy, but we need deterministic keys
       base_offset,
       delta);
diff --git a/src/v/storage/spill_key_index.h b/src/v/storage/spill_key_index.h
index d4939d04e4448..20f64a213a45c 100644
--- a/src/v/storage/spill_key_index.h
+++ b/src/v/storage/spill_key_index.h
@@ -14,6 +14,7 @@
 #include "hashing/crc32c.h"
 #include "hashing/xx.h"
 #include "model/fundamental.h"
+#include "model/record_batch_types.h"
 #include "storage/compacted_index.h"
 #include "storage/compacted_index_writer.h"
 #include "storage/segment_appender.h"
@@ -38,7 +39,7 @@ class spill_key_index final : public compacted_index_writer::impl {
     static constexpr size_t max_key_size = compacted_index::max_entry_size
                                            - (2 * vint::max_length);
     using underlying_t = absl::node_hash_map<
-      bytes,
+      compaction_key,
       value_type,
       bytes_hasher<uint64_t, xxhash_64>,
       bytes_type_eq>;
@@ -66,9 +67,11 @@ class spill_key_index final : public compacted_index_writer::impl {
 
     ss::future<> maybe_open();
     ss::future<> open();
-    ss::future<> index(const iobuf& key, model::offset, int32_t) final;
-    ss::future<> index(bytes_view, model::offset, int32_t) final;
-    ss::future<> index(bytes&&, model::offset, int32_t) final;
+    ss::future<> index(
+      model::record_batch_type, const iobuf& key, model::offset, int32_t) final;
+    ss::future<> index(const compaction_key& b, model::offset, int32_t) final;
+    ss::future<>
+    index(model::record_batch_type, bytes&&, model::offset, int32_t) final;
     ss::future<> truncate(model::offset) final;
     ss::future<> append(compacted_index::entry) final;
     ss::future<> close() final;
@@ -88,7 +91,7 @@ class spill_key_index final : public compacted_index_writer::impl {
         return debug::AllocatedByteSize(_midx);
     }
     ss::future<> drain_all_keys();
-    ss::future<> add_key(bytes b, value_type);
+    ss::future<> add_key(compaction_key, value_type);
     ss::future<> spill(compacted_index::entry_type, bytes_view, value_type);
 
     storage::debug_sanitize_files _debug;
diff --git a/src/v/storage/tests/compaction_idx_bench.cc b/src/v/storage/tests/compaction_idx_bench.cc
index 6afeae1cf6569..950613db78faf 100644
--- a/src/v/storage/tests/compaction_idx_bench.cc
+++ b/src/v/storage/tests/compaction_idx_bench.cc
@@ -30,7 +30,10 @@ PERF_TEST_F(reducer_bench, compaction_key_reducer_test) {
     auto key = random_generators::get_bytes(20);
 
     storage::compacted_index::entry entry(
-      storage::compacted_index::entry_type::key, std::move(key), o, 0);
+      storage::compacted_index::entry_type::key,
+      storage::compaction_key(std::move(key)),
+      o,
+      0);
 
     perf_tests::start_measuring_time();
     return reducer(std::move(entry)).discard_result().finally([] {
diff --git a/src/v/storage/tests/compaction_index_format_tests.cc b/src/v/storage/tests/compaction_index_format_tests.cc
index 16c931a662fc3..594c720e30362 100644
--- a/src/v/storage/tests/compaction_index_format_tests.cc
+++ b/src/v/storage/tests/compaction_index_format_tests.cc
@@ -24,13 +24,10 @@
 
 #include <boost/test/unit_test_suite.hpp>
 
-class
-
-  storage::compacted_index_writer
-  make_dummy_compacted_index(
-    tmpbuf_file::store_t& index_data,
-    size_t max_mem,
-    storage::storage_resources& resources) {
+storage::compacted_index_writer make_dummy_compacted_index(
+  tmpbuf_file::store_t& index_data,
+  size_t max_mem,
+  storage::storage_resources& resources) {
     auto f = ss::file(ss::make_shared(tmpbuf_file(index_data)));
     return storage::compacted_index_writer(
       std::make_unique<storage::internal::spill_key_index>(
@@ -41,16 +38,52 @@ struct compacted_topic_fixture {
     storage::storage_resources resources;
 };
 
+// Picks a batch type from the fixed list below via random_choice().
+model::record_batch_type random_batch_type() {
+    return random_generators::random_choice(
+      std::vector<model::record_batch_type>{
+        model::record_batch_type::raft_data,
+        model::record_batch_type::raft_configuration,
+        model::record_batch_type::controller,
+        model::record_batch_type::kvstore,
+        model::record_batch_type::checkpoint,
+        model::record_batch_type::topic_management_cmd,
+        model::record_batch_type::ghost_batch,
+        model::record_batch_type::id_allocator,
+        model::record_batch_type::tx_prepare,
+        model::record_batch_type::tx_fence,
+        model::record_batch_type::tm_update,
+        model::record_batch_type::user_management_cmd,
+        model::record_batch_type::acl_management_cmd,
+        model::record_batch_type::group_prepare_tx,
+        model::record_batch_type::group_commit_tx,
+        model::record_batch_type::group_abort_tx,
+        model::record_batch_type::node_management_cmd,
+        model::record_batch_type::data_policy_management_cmd,
+        model::record_batch_type::archival_metadata,
+        model::record_batch_type::cluster_config_cmd,
+        model::record_batch_type::feature_update,
+      });
+}
+// Drops the leading batch-type byte of an indexed key (empty stays empty).
+bytes extract_record_key(bytes prefixed_key) {
+    const size_t sz = prefixed_key.empty() ? 0 : prefixed_key.size() - 1;
+    auto read_key = ss::uninitialized_string<bytes>(sz);
+    std::copy_n(prefixed_key.end() - sz, sz, read_key.begin());
+    return read_key;
+}
+
 FIXTURE_TEST(format_verification, compacted_topic_fixture) {
     tmpbuf_file::store_t index_data;
     auto idx = make_dummy_compacted_index(index_data, 1_KiB, resources);
     const auto key = random_generators::get_bytes(1024);
-    idx.index(key, model::offset(42), 66).get();
+    auto bt = random_batch_type();
+    idx.index(bt, bytes(key), model::offset(42), 66).get();
     idx.close().get();
     info("{}", idx);
 
     iobuf data = std::move(index_data).release_iobuf();
-    BOOST_REQUIRE_EQUAL(data.size_bytes(), 1047);
+    BOOST_REQUIRE_EQUAL(data.size_bytes(), 1048);
     iobuf_parser p(data.share(0, data.size_bytes()));
     (void)p.consume_type<uint16_t>(); // SIZE
     (void)p.consume_type<uint8_t>();  // TYPE
@@ -58,30 +91,36 @@ FIXTURE_TEST(format_verification, compacted_topic_fixture) {
     BOOST_REQUIRE_EQUAL(model::offset(offset), model::offset(42));
     auto [delta, _2] = p.read_varlong();
     BOOST_REQUIRE_EQUAL(delta, 66);
-    const auto key_result = p.read_bytes(1024);
-    BOOST_REQUIRE_EQUAL(key, key_result);
+    const auto key_result = p.read_bytes(1025);
+
+    auto read_key = extract_record_key(key_result);
+    BOOST_REQUIRE_EQUAL(key, read_key);
     auto footer = reflection::adl<storage::compacted_index::footer>{}.from(p);
     info("{}", footer);
     BOOST_REQUIRE_EQUAL(footer.keys, 1);
     BOOST_REQUIRE_EQUAL(
       footer.size,
-      sizeof(uint16_t)
-        + 1 /*type*/ + 1 /*offset*/ + 2 /*delta*/ + 1024 /*key*/);
-    BOOST_REQUIRE_EQUAL(footer.version, 0);
+      sizeof(uint16_t) + 1 /*type*/ + 1 /*offset*/ + 2 /*delta*/
+        + 1 /*batch_type*/ + 1024 /*key*/);
+    BOOST_REQUIRE_EQUAL(
+      footer.version,
+      storage::compacted_index::footer::key_prefixed_with_batch_type);
     BOOST_REQUIRE(footer.crc != 0);
 }
 FIXTURE_TEST(format_verification_max_key, compacted_topic_fixture) {
     tmpbuf_file::store_t index_data;
     auto idx = make_dummy_compacted_index(index_data, 1_MiB, resources);
     const auto key = random_generators::get_bytes(1_MiB);
-    idx.index(key, model::offset(42), 66).get();
+    auto bt = random_batch_type();
+    idx.index(bt, bytes(key), model::offset(42), 66).get();
     idx.close().get();
     info("{}", idx);
 
     /**
      * Length of an entry is equal to
      *
-     * max_key_size + sizeof(uint8_t) + sizeof(uint16_t) + vint(42) + vint(66)
+     * max_key_size + sizeof(uint8_t) + sizeof(uint16_t) + vint(42) +
+     * vint(66)
      */
     iobuf data = std::move(index_data).release_iobuf();
 
@@ -104,7 +143,8 @@ FIXTURE_TEST(format_verification_roundtrip, compacted_topic_fixture) {
     tmpbuf_file::store_t index_data;
     auto idx = make_dummy_compacted_index(index_data, 1_MiB, resources);
     const auto key = random_generators::get_bytes(20);
-    idx.index(key, model::offset(42), 66).get();
+    auto bt = random_batch_type();
+    idx.index(bt, bytes(key), model::offset(42), 66).get();
     idx.close().get();
     info("{}", idx);
 
@@ -115,20 +155,23 @@ FIXTURE_TEST(format_verification_roundtrip, compacted_topic_fixture) {
       32_KiB);
     auto footer = rdr.load_footer().get0();
     BOOST_REQUIRE_EQUAL(footer.keys, 1);
-    BOOST_REQUIRE_EQUAL(footer.version, 0);
+    BOOST_REQUIRE_EQUAL(
+      footer.version,
+      storage::compacted_index::footer::key_prefixed_with_batch_type);
     BOOST_REQUIRE(footer.crc != 0);
     auto vec = compaction_index_reader_to_memory(std::move(rdr)).get0();
     BOOST_REQUIRE_EQUAL(vec.size(), 1);
     BOOST_REQUIRE_EQUAL(vec[0].offset, model::offset(42));
     BOOST_REQUIRE_EQUAL(vec[0].delta, 66);
-    BOOST_REQUIRE_EQUAL(vec[0].key, key);
+    BOOST_REQUIRE_EQUAL(extract_record_key(vec[0].key), key);
 }
 FIXTURE_TEST(
   format_verification_roundtrip_exceeds_capacity, compacted_topic_fixture) {
     tmpbuf_file::store_t index_data;
     auto idx = make_dummy_compacted_index(index_data, 1_MiB, resources);
     const auto key = random_generators::get_bytes(1_MiB);
-    idx.index(key, model::offset(42), 66).get();
+    auto bt = random_batch_type();
+    idx.index(bt, bytes(key), model::offset(42), 66).get();
     idx.close().get();
     info("{}", idx);
 
@@ -139,7 +182,9 @@ FIXTURE_TEST(
       32_KiB);
     auto footer = rdr.load_footer().get0();
     BOOST_REQUIRE_EQUAL(footer.keys, 1);
-    BOOST_REQUIRE_EQUAL(footer.version, 0);
+    BOOST_REQUIRE_EQUAL(
+      footer.version,
+      storage::compacted_index::footer::key_prefixed_with_batch_type);
     BOOST_REQUIRE(footer.crc != 0);
     auto vec = compaction_index_reader_to_memory(std::move(rdr)).get0();
     BOOST_REQUIRE_EQUAL(vec.size(), 1);
@@ -147,7 +192,8 @@ FIXTURE_TEST(
     BOOST_REQUIRE_EQUAL(vec[0].delta, 66);
     auto max_sz = storage::internal::spill_key_index::max_key_size;
     BOOST_REQUIRE_EQUAL(vec[0].key.size(), max_sz);
-    BOOST_REQUIRE_EQUAL(vec[0].key, bytes_view(key.data(), max_sz));
+    BOOST_REQUIRE_EQUAL(
+      extract_record_key(vec[0].key), bytes_view(key.data(), max_sz - 1));
 }
 
 FIXTURE_TEST(key_reducer_no_truncate_filter, compacted_topic_fixture) {
@@ -157,6 +203,7 @@ FIXTURE_TEST(key_reducer_no_truncate_filter, compacted_topic_fixture) {
 
     const auto key1 = random_generators::get_bytes(1_KiB);
     const auto key2 = random_generators::get_bytes(1_KiB);
+    auto bt = random_batch_type();
     for (auto i = 0; i < 100; ++i) {
         bytes_view put_key;
         if (i % 2) {
@@ -164,7 +211,7 @@ FIXTURE_TEST(key_reducer_no_truncate_filter, compacted_topic_fixture) {
         } else {
             put_key = key2;
         }
-        idx.index(put_key, model::offset(i), 0).get();
+        idx.index(bt, bytes(put_key), model::offset(i), 0).get();
     }
     idx.close().get();
     info("{}", idx);
@@ -197,6 +244,7 @@ FIXTURE_TEST(key_reducer_max_mem, compacted_topic_fixture) {
 
     const auto key1 = random_generators::get_bytes(1_KiB);
     const auto key2 = random_generators::get_bytes(1_KiB);
+    auto bt = random_batch_type();
     for (auto i = 0; i < 100; ++i) {
         bytes_view put_key;
         if (i % 2) {
@@ -204,7 +252,7 @@ FIXTURE_TEST(key_reducer_max_mem, compacted_topic_fixture) {
         } else {
             put_key = key2;
         }
-        idx.index(put_key, model::offset(i), 0).get();
+        idx.index(bt, bytes(put_key), model::offset(i), 0).get();
     }
     idx.close().get();
     info("{}", idx);
@@ -262,6 +310,7 @@ FIXTURE_TEST(index_filtered_copy_tests, compacted_topic_fixture) {
 
     const auto key1 = random_generators::get_bytes(128_KiB);
     const auto key2 = random_generators::get_bytes(1_KiB);
+    auto bt = random_batch_type();
     for (auto i = 0; i < 100; ++i) {
         bytes_view put_key;
         if (i % 2) {
@@ -269,7 +318,7 @@ FIXTURE_TEST(index_filtered_copy_tests, compacted_topic_fixture) {
         } else {
             put_key = key2;
         }
-        idx.index(put_key, model::offset(i), 0).get();
+        idx.index(bt, bytes(put_key), model::offset(i), 0).get();
     }
     idx.close().get();
     info("{}", idx);

From 098b71856ffa42691df0112660b7ddd966f01d02 Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Tue, 5 Jul 2022 19:25:42 +0200
Subject: [PATCH 148/201] s/tests: added test verifying compaction of different
 batch types

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/storage/tests/storage_e2e_test.cc | 105 ++++++++++++++++++++++++
 1 file changed, 105 insertions(+)

diff --git a/src/v/storage/tests/storage_e2e_test.cc b/src/v/storage/tests/storage_e2e_test.cc
index b7b9237449f19..71a540dcde785 100644
--- a/src/v/storage/tests/storage_e2e_test.cc
+++ b/src/v/storage/tests/storage_e2e_test.cc
@@ -2086,3 +2086,108 @@ FIXTURE_TEST(test_querying_term_last_offset, storage_test_fixture) {
 
     BOOST_REQUIRE(!log.get_term_last_offset(model::term_id(0)).has_value());
 }
+
+void write_batch(
+  storage::log log,
+  ss::sstring key,
+  int value,
+  model::record_batch_type batch_type) {
+    storage::record_batch_builder builder(batch_type, model::offset(0));
+    // test helper: the batch carries exactly one serde-encoded key/value
+    builder.add_raw_kv(serde::to_iobuf(std::move(key)), serde::to_iobuf(value));
+
+    auto batch = std::move(builder).build();
+    batch.set_term(model::term_id(0));
+    auto reader = model::make_memory_record_batch_reader({std::move(batch)});
+    storage::log_append_config cfg{
+      .should_fsync = storage::log_append_config::fsync::no,
+      .io_priority = ss::default_priority_class(),
+      .timeout = model::no_timeout,
+    };
+    // feed the single batch through the log appender and wait for the result
+    std::move(reader).for_each_ref(log.make_appender(cfg), cfg.timeout).get0();
+}
+
+absl::flat_hash_map<std::pair<model::record_batch_type, ss::sstring>, int>
+compact_in_memory(storage::log log) {
+    auto rdr = log
+                 .make_reader(storage::log_reader_config(
+                   model::offset(0),
+                   model::offset::max(),
+                   ss::default_priority_class()))
+                 .get();
+    // ret: (batch type, decoded record key) -> decoded int value
+    absl::flat_hash_map<std::pair<model::record_batch_type, ss::sstring>, int>
+      ret;
+    auto batches = model::consume_reader_to_memory(
+                     std::move(rdr), model::no_timeout)
+                     .get();
+    // later records overwrite earlier ones that share (batch type, key)
+    for (auto& b : batches) {
+        b.for_each_record([&ret, bt = b.header().type](model::record r) {
+            auto k = std::make_pair(
+              bt, serde::from_iobuf<ss::sstring>(r.key().copy()));
+            ret.insert_or_assign(k, serde::from_iobuf<int>(r.value().copy()));
+        });
+    }
+
+    return ret;
+}
+
+// Same-key records of different batch types must not compact each other away.
+FIXTURE_TEST(test_compacting_batches_of_different_types, storage_test_fixture) {
+    auto cfg = default_log_config(test_dir);
+    cfg.max_compacted_segment_size = config::mock_binding<size_t>(100_MiB);
+    cfg.stype = storage::log_config::storage_type::disk;
+    cfg.cache = storage::with_cache::no;
+    storage::ntp_config::default_overrides overrides;
+    overrides.cleanup_policy_bitflags
+      = model::cleanup_policy_bitflags::compaction;
+
+    ss::abort_source as;
+    storage::log_manager mgr = make_log_manager(cfg);
+    auto deferred = ss::defer([&mgr]() mutable { mgr.stop().get0(); });
+    auto ntp = model::ntp("default", "test", 0);
+    auto log = mgr
+                 .manage(storage::ntp_config(
+                   ntp,
+                   mgr.config().base_dir,
+                   std::make_unique<storage::ntp_config::default_overrides>(
+                     overrides)))
+                 .get0();
+
+    auto disk_log = get_disk_log(log);
+
+    // the same key under three batch types; each type then gets more writes
+    write_batch(log, "key_1", 1, model::record_batch_type::raft_data);
+    write_batch(log, "key_1", 10, model::record_batch_type::tm_update);
+    write_batch(log, "key_1", 100, model::record_batch_type::tx_fence);
+
+    write_batch(log, "key_1", 2, model::record_batch_type::raft_data);
+    write_batch(log, "key_1", 3, model::record_batch_type::raft_data);
+    write_batch(log, "key_1", 4, model::record_batch_type::raft_data);
+
+    write_batch(log, "key_1", 20, model::record_batch_type::tm_update);
+    write_batch(log, "key_1", 30, model::record_batch_type::tm_update);
+    write_batch(log, "key_1", 40, model::record_batch_type::tm_update);
+
+    write_batch(log, "key_1", 200, model::record_batch_type::tx_fence);
+    write_batch(log, "key_1", 300, model::record_batch_type::tx_fence);
+    write_batch(log, "key_1", 400, model::record_batch_type::tx_fence);
+
+    disk_log->force_roll(ss::default_priority_class()).get();
+    log.flush().get0();
+
+    BOOST_REQUIRE_EQUAL(disk_log->segment_count(), 2);
+
+    storage::compaction_config c_cfg(
+      model::timestamp::min(), std::nullopt, ss::default_priority_class(), as);
+    auto before_compaction = compact_in_memory(log);
+
+    BOOST_REQUIRE_EQUAL(before_compaction.size(), 3);
+    // compaction must keep the newest value of every (batch type, key) pair
+    log.compact(c_cfg).get0();
+    auto after_compaction = compact_in_memory(log);
+
+    BOOST_REQUIRE(before_compaction == after_compaction);
+}

From d73771bfac8600664613c7045cadcb20c790d069 Mon Sep 17 00:00:00 2001
From: Michal Maslanka <michal@redpanda.com>
Date: Tue, 5 Jul 2022 19:27:20 +0200
Subject: [PATCH 149/201] s/segment_utils: require index rebuild whenever it is
 in old version

Signed-off-by: Michal Maslanka <michal@redpanda.com>
---
 src/v/storage/segment_utils.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/v/storage/segment_utils.cc b/src/v/storage/segment_utils.cc
index c6248d8a0f60f..d7056f9eab77e 100644
--- a/src/v/storage/segment_utils.cc
+++ b/src/v/storage/segment_utils.cc
@@ -283,6 +283,13 @@ ss::future<compacted_index::recovery_state> do_detect_compaction_index_state(
                 if (bool(footer.flags & flags::self_compaction)) {
                     return compacted_index::recovery_state::already_compacted;
                 }
+                // if we deal with old version of index that is not yet
+                // compacted request a rebuild
+                if (
+                  footer.version
+                  < compacted_index::footer::key_prefixed_with_batch_type) {
+                    return compacted_index::recovery_state::index_needs_rebuild;
+                }
                 return compacted_index::recovery_state::index_recovered;
             })
             .finally([reader]() mutable { return reader.close(); });

From 3a93d25f9b5233120becf57d0fcc7fef5bd8744c Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@redpanda.com>
Date: Thu, 14 Jul 2022 18:28:19 +0100
Subject: [PATCH 150/201] ssx: add util for namespaced metric label creation

---
 src/v/ssx/metrics.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/v/ssx/metrics.h b/src/v/ssx/metrics.h
index 3858980c00284..2b492765b669c 100644
--- a/src/v/ssx/metrics.h
+++ b/src/v/ssx/metrics.h
@@ -34,4 +34,10 @@ inline ss::metrics::histogram report_default_histogram(const hdr_hist& hist) {
       num_buckets, first_value, log_base, scale);
 }
 
+const auto label_namespace = "redpanda";
+
+inline ss::metrics::label make_namespaced_label(const seastar::sstring& name) {
+    return ss::metrics::label(ssx::sformat("{}_{}", label_namespace, name));
+}
+
 } // namespace ssx::metrics

From 3d70d0370ab706a4ddba6da51ce96a918305ce4e Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@redpanda.com>
Date: Thu, 14 Jul 2022 18:29:34 +0100
Subject: [PATCH 151/201] cloud_storage: add namespace to new metrics labels

---
 src/v/cloud_storage/probe.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/v/cloud_storage/probe.cc b/src/v/cloud_storage/probe.cc
index af7a8f3a8badc..398d690ef7163 100644
--- a/src/v/cloud_storage/probe.cc
+++ b/src/v/cloud_storage/probe.cc
@@ -99,7 +99,7 @@ remote_probe::remote_probe(
     }
 
     if (!public_disabled) {
-        auto direction_label = sm::label("direction");
+        auto direction_label = ssx::metrics::make_namespaced_label("direction");
 
         _public_metrics.add_group(
           prometheus_sanitize::metrics_name("cloud_storage"),

From 8a74dbce1bde5734aaa9a6b4197cb463e8d3a3cb Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@redpanda.com>
Date: Thu, 14 Jul 2022 18:31:11 +0100
Subject: [PATCH 152/201] cluster: add namespaces to partition probe labels

---
 src/v/cluster/partition_probe.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/v/cluster/partition_probe.cc b/src/v/cluster/partition_probe.cc
index 01b21336a273f..e4f42ae68a3f0 100644
--- a/src/v/cluster/partition_probe.cc
+++ b/src/v/cluster/partition_probe.cc
@@ -143,10 +143,10 @@ void replicated_partition_probe::setup_public_metrics(const model::ntp& ntp) {
         return;
     }
 
-    auto request_label = sm::label("request");
-    auto ns_label = sm::label("namespace");
-    auto topic_label = sm::label("topic");
-    auto partition_label = sm::label("partition");
+    auto request_label = ssx::metrics::make_namespaced_label("request");
+    auto ns_label = ssx::metrics::make_namespaced_label("namespace");
+    auto topic_label = ssx::metrics::make_namespaced_label("topic");
+    auto partition_label = ssx::metrics::make_namespaced_label("partition");
 
     const std::vector<sm::label_instance> labels = {
       ns_label(ntp.ns()),

From fbc41a2e90fc75aa48ac20fed16e3d82d711d655 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@redpanda.com>
Date: Thu, 14 Jul 2022 18:31:46 +0100
Subject: [PATCH 153/201] kafka: add namespaces to group probe labels

---
 src/v/kafka/group_probe.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/v/kafka/group_probe.h b/src/v/kafka/group_probe.h
index 371e4b31b4bea..26535deb0f3f0 100644
--- a/src/v/kafka/group_probe.h
+++ b/src/v/kafka/group_probe.h
@@ -62,9 +62,9 @@ class group_offset_probe {
             return;
         }
 
-        auto group_label = sm::label("group");
-        auto topic_label = sm::label("topic");
-        auto partition_label = sm::label("partition");
+        auto group_label = ssx::metrics::make_namespaced_label("group");
+        auto topic_label = ssx::metrics::make_namespaced_label("topic");
+        auto partition_label = ssx::metrics::make_namespaced_label("partition");
         std::vector<sm::label_instance> labels{
           group_label(group_id()),
           topic_label(tp.topic()),
@@ -110,7 +110,7 @@ class group_probe {
             return;
         }
 
-        auto group_label = sm::label("group");
+        auto group_label = ssx::metrics::make_namespaced_label("group");
 
         std::vector<sm::label_instance> labels{group_label(group_id())};
 

From 008ee5a2f98093c8c53da7c44134f359486f9a30 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@redpanda.com>
Date: Thu, 14 Jul 2022 18:32:25 +0100
Subject: [PATCH 154/201] kafka: add namespaces to latency probe labels

---
 src/v/kafka/latency_probe.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/v/kafka/latency_probe.h b/src/v/kafka/latency_probe.h
index f38b3fb34b6ed..3946475e6cfbc 100644
--- a/src/v/kafka/latency_probe.h
+++ b/src/v/kafka/latency_probe.h
@@ -60,7 +60,7 @@ class latency_probe {
             sm::make_histogram(
               "request_latency_seconds",
               sm::description("Internal latency of kafka produce requests"),
-              {sm::label("request")("produce")},
+              {ssx::metrics::make_namespaced_label("request")("produce")},
               [this] {
                   return ssx::metrics::report_default_histogram(
                     _produce_latency);
@@ -69,7 +69,7 @@ class latency_probe {
             sm::make_histogram(
               "request_latency_seconds",
               sm::description("Internal latency of kafka consume requests"),
-              {sm::label("request")("consume")},
+              {ssx::metrics::make_namespaced_label("request")("consume")},
               [this] {
                   return ssx::metrics::report_default_histogram(_fetch_latency);
               })

From ab0212c78713eaf6da51ef7d79d468c7ce4edf26 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@redpanda.com>
Date: Thu, 14 Jul 2022 18:32:47 +0100
Subject: [PATCH 155/201] pandaproxy: add namespace to public metrics labels

This commit adds a prefix to the labels used for metrics published
on the 'public_metrics' endpoint. It also splits the registration
of metrics into two functions.
---
 src/v/pandaproxy/probe.cc | 136 +++++++++++++++++++++-----------------
 src/v/pandaproxy/probe.h  |   6 ++
 src/v/ssx/metrics.h       |   2 +-
 3 files changed, 84 insertions(+), 60 deletions(-)

diff --git a/src/v/pandaproxy/probe.cc b/src/v/pandaproxy/probe.cc
index 51b432b54f21b..7b9fdd5c74ee5 100644
--- a/src/v/pandaproxy/probe.cc
+++ b/src/v/pandaproxy/probe.cc
@@ -22,77 +22,95 @@ namespace pandaproxy {
 probe::probe(
   ss::httpd::path_description& path_desc, const ss::sstring& group_name)
   : _request_metrics()
+  , _path(path_desc)
+  , _group_name(group_name)
   , _metrics()
   , _public_metrics(ssx::metrics::public_metrics_handle) {
+    setup_metrics();
+    setup_public_metrics();
+}
+
+void probe::setup_metrics() {
     namespace sm = ss::metrics;
 
+    if (config::shard_local_cfg().disable_metrics()) {
+        return;
+    }
+
     auto operation_label = sm::label("operation");
     std::vector<sm::label_instance> labels{
-      operation_label(path_desc.operations.nickname)};
+      operation_label(_path.operations.nickname)};
 
     auto aggregate_labels = std::vector<sm::label>{
       sm::shard_label, operation_label};
 
-    if (!config::shard_local_cfg().disable_metrics()) {
-        auto internal_aggregate_labels
-          = config::shard_local_cfg().aggregate_metrics()
-              ? aggregate_labels
-              : std::vector<sm::label>{};
-
-        _metrics.add_group(
-          "pandaproxy",
-          {sm::make_histogram(
-             "request_latency",
-             sm::description("Request latency"),
-             labels,
-             [this] {
-                 return _request_metrics.hist().seastar_histogram_logform();
-             })
-             .aggregate(internal_aggregate_labels)});
-    }
+    auto internal_aggregate_labels
+      = config::shard_local_cfg().aggregate_metrics()
+          ? aggregate_labels
+          : std::vector<sm::label>{};
 
-    if (!config::shard_local_cfg().disable_public_metrics()) {
-        auto status_label = sm::label("status");
-        _public_metrics.add_group(
-          group_name,
-          {sm::make_histogram(
-             "request_latency_seconds",
-             sm::description(
-               ssx::sformat("Internal latency of request for {}", group_name)),
-             labels,
-             [this] {
-                 return ssx::metrics::report_default_histogram(
-                   _request_metrics.hist());
-             })
-             .aggregate(aggregate_labels),
-
-           sm::make_counter(
-             "request_errors_total",
-             [this] { return _request_metrics._5xx_count; },
-             sm::description(
-               ssx::sformat("Total number of {} server errors", group_name)),
-             {operation_label(path_desc.operations.nickname),
-              status_label("5xx")})
-             .aggregate(aggregate_labels),
-
-           sm::make_counter(
-             "request_errors_total",
-             [this] { return _request_metrics._4xx_count; },
-             sm::description(
-               ssx::sformat("Total number of {} client errors", group_name)),
-             {operation_label(path_desc.operations.nickname),
-              status_label("4xx")})
-             .aggregate(aggregate_labels),
-
-           sm::make_counter(
-             "request_errors_total",
-             [this] { return _request_metrics._3xx_count; },
-             sm::description(ssx::sformat(
-               "Total number of {} redirection errors", group_name)),
-             {operation_label(path_desc.operations.nickname),
-              status_label("3xx")})
-             .aggregate(aggregate_labels)});
+    _metrics.add_group(
+      "pandaproxy",
+      {sm::make_histogram(
+         "request_latency",
+         sm::description("Request latency"),
+         labels,
+         [this] { return _request_metrics.hist().seastar_histogram_logform(); })
+         .aggregate(internal_aggregate_labels)});
+}
+
+void probe::setup_public_metrics() {
+    namespace sm = ss::metrics;
+
+    if (config::shard_local_cfg().disable_public_metrics()) {
+        return;
     }
+
+    auto operation_label = ssx::metrics::make_namespaced_label("operation");
+    auto status_label = ssx::metrics::make_namespaced_label("status");
+
+    std::vector<sm::label_instance> labels{
+      operation_label(_path.operations.nickname)};
+
+    auto aggregate_labels = std::vector<sm::label>{
+      sm::shard_label, operation_label};
+
+    _public_metrics.add_group(
+      _group_name,
+      {sm::make_histogram(
+         "request_latency_seconds",
+         sm::description(
+           ssx::sformat("Internal latency of request for {}", _group_name)),
+         labels,
+         [this] {
+             return ssx::metrics::report_default_histogram(
+               _request_metrics.hist());
+         })
+         .aggregate(aggregate_labels),
+
+       sm::make_counter(
+         "request_errors_total",
+         [this] { return _request_metrics._5xx_count; },
+         sm::description(
+           ssx::sformat("Total number of {} server errors", _group_name)),
+         {operation_label(_path.operations.nickname), status_label("5xx")})
+         .aggregate(aggregate_labels),
+
+       sm::make_counter(
+         "request_errors_total",
+         [this] { return _request_metrics._4xx_count; },
+         sm::description(
+           ssx::sformat("Total number of {} client errors", _group_name)),
+         {operation_label(_path.operations.nickname), status_label("4xx")})
+         .aggregate(aggregate_labels),
+
+       sm::make_counter(
+         "request_errors_total",
+         [this] { return _request_metrics._3xx_count; },
+         sm::description(
+           ssx::sformat("Total number of {} redirection errors", _group_name)),
+         {operation_label(_path.operations.nickname), status_label("3xx")})
+         .aggregate(aggregate_labels)});
 }
 
 } // namespace pandaproxy
diff --git a/src/v/pandaproxy/probe.h b/src/v/pandaproxy/probe.h
index 2fd5ffa2f27a6..f5f3455dab24e 100644
--- a/src/v/pandaproxy/probe.h
+++ b/src/v/pandaproxy/probe.h
@@ -63,8 +63,14 @@ class probe {
       ss::httpd::path_description& path_desc, const ss::sstring& group_name);
     auto auto_measure() { return _request_metrics.auto_measure(); }
 
+private:
+    void setup_metrics();
+    void setup_public_metrics();
+
 private:
     http_status_metric _request_metrics;
+    const ss::httpd::path_description& _path;
+    const ss::sstring& _group_name;
     ss::metrics::metric_groups _metrics;
     ss::metrics::metric_groups _public_metrics;
 };
diff --git a/src/v/ssx/metrics.h b/src/v/ssx/metrics.h
index 2b492765b669c..4f3afd9ae8db0 100644
--- a/src/v/ssx/metrics.h
+++ b/src/v/ssx/metrics.h
@@ -34,7 +34,7 @@ inline ss::metrics::histogram report_default_histogram(const hdr_hist& hist) {
       num_buckets, first_value, log_base, scale);
 }
 
-const auto label_namespace = "redpanda";
+constexpr auto label_namespace = "redpanda";
 
 inline ss::metrics::label make_namespaced_label(const seastar::sstring& name) {
     return ss::metrics::label(ssx::sformat("{}_{}", label_namespace, name));

From 974c38233380410149dd2ec801fec087029fcd9e Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@redpanda.com>
Date: Fri, 15 Jul 2022 10:51:21 +0100
Subject: [PATCH 156/201] net: add namespaces to rpc labels

---
 src/v/net/probes.cc | 2 +-
 src/v/net/server.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/v/net/probes.cc b/src/v/net/probes.cc
index 41aef57e63478..79d905015deba 100644
--- a/src/v/net/probes.cc
+++ b/src/v/net/probes.cc
@@ -119,7 +119,7 @@ void server_probe::setup_public_metrics(
         proto.remove_suffix(4);
     }
 
-    auto server_label = sm::label("server");
+    auto server_label = ssx::metrics::make_namespaced_label("server");
 
     mgs.add_group(
       "rpc",
diff --git a/src/v/net/server.cc b/src/v/net/server.cc
index 7cce70e67fb48..682b25ae8e562 100644
--- a/src/v/net/server.cc
+++ b/src/v/net/server.cc
@@ -337,7 +337,7 @@ void server::setup_public_metrics() {
         server_name.remove_suffix(4);
     }
 
-    auto server_label = sm::label("server");
+    auto server_label = ssx::metrics::make_namespaced_label("server");
 
     _public_metrics.add_group(
       prometheus_sanitize::metrics_name("rpc:request"),

From 4c3db3d7a118305b80b9d2bc3c152da61ff91f16 Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Tue, 21 Jun 2022 10:13:50 +0100
Subject: [PATCH 157/201] auth: Introduce broker_authn_endpoint

This is equivalent to a broker_endpoint, but contains a field to
describe the authentication required for the listener.

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/config/CMakeLists.txt           |   1 +
 src/v/config/broker_authn_endpoint.cc | 104 ++++++++++++++++++++++++++
 src/v/config/broker_authn_endpoint.h  |  85 +++++++++++++++++++++
 3 files changed, 190 insertions(+)
 create mode 100644 src/v/config/broker_authn_endpoint.cc
 create mode 100644 src/v/config/broker_authn_endpoint.h

diff --git a/src/v/config/CMakeLists.txt b/src/v/config/CMakeLists.txt
index 350ee35e36f9c..7579ca7d37ff8 100644
--- a/src/v/config/CMakeLists.txt
+++ b/src/v/config/CMakeLists.txt
@@ -1,6 +1,7 @@
 v_cc_library(
   NAME config
   SRCS
+    broker_authn_endpoint.cc
     configuration.cc
     node_config.cc
     base_property.cc
diff --git a/src/v/config/broker_authn_endpoint.cc b/src/v/config/broker_authn_endpoint.cc
new file mode 100644
index 0000000000000..d3bcb055c0c8d
--- /dev/null
+++ b/src/v/config/broker_authn_endpoint.cc
@@ -0,0 +1,104 @@
+// Copyright 2022 Redpanda Data, Inc.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.md
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0
+
+#include "config/broker_authn_endpoint.h"
+
+#include "kafka/client/exceptions.h"
+#include "model/metadata.h"
+#include "utils/string_switch.h"
+
+namespace config {
+
+std::string_view to_string_view(broker_authn_method m) {
+    switch (m) {
+    case broker_authn_method::none:
+        return "none";
+    case broker_authn_method::sasl:
+        return "sasl";
+    case broker_authn_method::mtls_identity:
+        return "mtls_identity";
+    }
+}
+
+template<>
+std::optional<broker_authn_method>
+from_string_view<broker_authn_method>(std::string_view sv) {
+    return string_switch<broker_authn_method>(sv)
+      .match("none", broker_authn_method::none)
+      .match("sasl", broker_authn_method::sasl)
+      .match("mtls_identity", broker_authn_method::mtls_identity)
+      .default_match(broker_authn_method::none);
+}
+
+std::ostream& operator<<(std::ostream& os, const broker_authn_endpoint& ep) {
+    fmt::print(os, "{{{}:{}:{}}}", ep.name, ep.address, ep.authn_method);
+    return os;
+}
+
+} // namespace config
+
+namespace YAML {
+
+Node convert<config::broker_authn_endpoint>::encode(const type& rhs) {
+    Node node;
+    node["name"] = rhs.name;
+    node["address"] = rhs.address.host();
+    node["port"] = rhs.address.port();
+    if (rhs.authn_method) {
+        node["authentication_method"] = ss::sstring(
+          to_string_view(*rhs.authn_method));
+    }
+    return node;
+}
+
+bool convert<config::broker_authn_endpoint>::decode(
+  const Node& node, type& rhs) {
+    for (auto s : {"address", "port"}) {
+        if (!node[s]) {
+            return false;
+        }
+    }
+    ss::sstring name;
+    if (node["name"]) {
+        name = node["name"].as<ss::sstring>();
+    }
+    auto address = node["address"].as<ss::sstring>();
+    auto port = node["port"].as<uint16_t>();
+    auto addr = net::unresolved_address(std::move(address), port);
+    std::optional<config::broker_authn_method> method{};
+    if (auto n = node["authentication_method"]; bool(n)) {
+        method = config::from_string_view<config::broker_authn_method>(
+          n.as<ss::sstring>());
+    }
+    rhs = config::broker_authn_endpoint{
+      .name = std::move(name),
+      .address = std::move(addr),
+      .authn_method = method};
+    return true;
+}
+
+} // namespace YAML
+
+void json::rjson_serialize(
+  json::Writer<json::StringBuffer>& w,
+  const config::broker_authn_endpoint& ep) {
+    w.StartObject();
+    w.Key("name");
+    w.String(ep.name);
+    w.Key("address");
+    w.String(ep.address.host());
+    w.Key("port");
+    w.Uint(ep.address.port());
+    if (ep.authn_method) {
+        w.Key("authentication_method");
+        auto method = to_string_view(*ep.authn_method);
+        w.String(method.begin(), method.length());
+    }
+    w.EndObject();
+}
diff --git a/src/v/config/broker_authn_endpoint.h b/src/v/config/broker_authn_endpoint.h
new file mode 100644
index 0000000000000..ac1ef2953b5b3
--- /dev/null
+++ b/src/v/config/broker_authn_endpoint.h
@@ -0,0 +1,85 @@
+// Copyright 2021 Redpanda Data, Inc.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.md
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0
+
+#pragma once
+
+#include "config/convert.h"
+#include "config/property.h"
+#include "json/_include_first.h"
+#include "json/stringbuffer.h"
+#include "json/writer.h"
+#include "net/unresolved_address.h"
+
+#include <seastar/core/sstring.hh>
+
+#include <yaml-cpp/node/node.h>
+
+#include <iosfwd>
+#include <optional>
+#include <string>
+
+namespace config {
+
+template<typename E>
+std::enable_if_t<std::is_enum_v<E>, std::optional<E>>
+  from_string_view(std::string_view);
+
+enum class broker_authn_method {
+    none = 0,
+    sasl,
+    mtls_identity,
+};
+
+std::string_view to_string_view(broker_authn_method m);
+
+template<>
+std::optional<broker_authn_method>
+from_string_view<broker_authn_method>(std::string_view sv);
+
+struct broker_authn_endpoint {
+    ss::sstring name;
+    net::unresolved_address address;
+    std::optional<broker_authn_method> authn_method;
+
+    friend bool
+    operator==(const broker_authn_endpoint&, const broker_authn_endpoint&)
+      = default;
+
+    friend std::ostream&
+    operator<<(std::ostream& os, const broker_authn_endpoint& ep);
+};
+
+namespace detail {
+
+template<>
+consteval std::string_view property_type_name<broker_authn_endpoint>() {
+    return "config::broker_auth_endpoint";
+}
+
+} // namespace detail
+
+} // namespace config
+
+namespace YAML {
+
+template<>
+struct convert<config::broker_authn_endpoint> {
+    using type = config::broker_authn_endpoint;
+    static Node encode(const type& rhs);
+    static bool decode(const Node& node, type& rhs);
+};
+
+} // namespace YAML
+
+namespace json {
+
+void rjson_serialize(
+  json::Writer<json::StringBuffer>& w, const config::broker_authn_endpoint& ep);
+
+}

From ec192c861a8dfce63b4ad4299a272b11d26bcd83 Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Wed, 29 Jun 2022 13:34:50 +0100
Subject: [PATCH 158/201] auth: Extract get_authn_method

This will simplify future refactorings for auth per endpoint.

This intermediate step allows configuring AuthN without AuthZ;
this allows, for example, a sasl handshake, but the principal
is ignored.

The feature gate is no longer required; it was used to guard a
not-intended-for-public-consumption tech preview.

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/kafka/server/protocol.cc | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/src/v/kafka/server/protocol.cc b/src/v/kafka/server/protocol.cc
index f8167d884537f..d5a7ba1c004e5 100644
--- a/src/v/kafka/server/protocol.cc
+++ b/src/v/kafka/server/protocol.cc
@@ -10,6 +10,7 @@
 #include "protocol.h"
 
 #include "cluster/topics_frontend.h"
+#include "config/broker_authn_endpoint.h"
 #include "config/configuration.h"
 #include "kafka/server/connection_context.h"
 #include "kafka/server/coordinator_ntp_mapper.h"
@@ -17,6 +18,7 @@
 #include "kafka/server/logger.h"
 #include "kafka/server/request_context.h"
 #include "kafka/server/response.h"
+#include "net/connection.h"
 #include "security/mtls.h"
 #include "security/scram_algorithm.h"
 #include "utils/utf8.h"
@@ -92,31 +94,35 @@ coordinator_ntp_mapper& protocol::coordinator_mapper() {
     return _group_router.local().coordinator_mapper().local();
 }
 
+config::broker_authn_method get_authn_method(const net::connection& conn) {
+    const auto& config = config::shard_local_cfg();
+    if (config.enable_sasl()) {
+        return config::broker_authn_method::sasl;
+    }
+    if (conn.get_principal_mapping().has_value()) {
+        return config::broker_authn_method::mtls_identity;
+    }
+    return config::broker_authn_method::none;
+}
+
 ss::future<> protocol::apply(net::server::resources rs) {
+    const auto authn_method = get_authn_method(*rs.conn);
+
     /*
      * if sasl authentication is not enabled then initialize the sasl state to
      * complete. this will cause auth to be skipped during request processing.
-     *
-     * TODO: temporarily acl authorization is enabled/disabled based on sasl
-     * being enabled/disabled. it may be useful to configure them separately,
-     * but this will come when identity management is introduced.
      */
     security::sasl_server sasl(
-      config::shard_local_cfg().enable_sasl()
+      authn_method == config::broker_authn_method::sasl
         ? security::sasl_server::sasl_state::initial
         : security::sasl_server::sasl_state::complete);
 
-    const auto enable_mtls_authentication
-      = rs.conn->get_principal_mapping().has_value()
-        && feature_table().local().is_active(
-          cluster::feature::mtls_authentication);
-
     auto ctx = ss::make_lw_shared<connection_context>(
       *this,
       std::move(rs),
       std::move(sasl),
-      config::shard_local_cfg().enable_sasl(),
-      enable_mtls_authentication);
+      authn_method != config::broker_authn_method::none,
+      authn_method == config::broker_authn_method::mtls_identity);
 
     return ss::do_until(
              [ctx] { return ctx->is_finished_parsing(); },

From b16cfff99ba03fd571daff006768f682b8f809fd Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Tue, 21 Jun 2022 10:15:37 +0100
Subject: [PATCH 159/201] auth: Switch kafka_api to broker_authn_endpoint

`kafka_authn_endpoint_format` makes `broker_authn_endpoint` a
drop-in replacement for `broker_endpoint` for describe_configs.

A future commit will wire this up.

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/config/node_config.cc                   |  4 +++-
 src/v/config/node_config.h                    | 20 ++++++++++++++++---
 .../kafka/server/handlers/describe_configs.cc | 20 ++++++++++++++++++-
 src/v/redpanda/tests/fixture.h                |  8 +++++---
 4 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/src/v/config/node_config.cc b/src/v/config/node_config.cc
index ba673ebbdd335..0a8a468432896 100644
--- a/src/v/config/node_config.cc
+++ b/src/v/config/node_config.cc
@@ -63,7 +63,9 @@ node_config::node_config() noexcept
       "kafka_api",
       "Address and port of an interface to listen for Kafka API requests",
       {.visibility = visibility::user},
-      {model::broker_endpoint(net::unresolved_address("127.0.0.1", 9092))})
+      {config::broker_authn_endpoint{
+        .address = net::unresolved_address("127.0.0.1", 9092),
+        .authn_method = std::nullopt}})
   , kafka_api_tls(
       *this,
       "kafka_api_tls",
diff --git a/src/v/config/node_config.h b/src/v/config/node_config.h
index 6d802fa90f02b..c85eaf2fc36b1 100644
--- a/src/v/config/node_config.h
+++ b/src/v/config/node_config.h
@@ -9,6 +9,7 @@
 
 #pragma once
 
+#include "config/broker_authn_endpoint.h"
 #include "config/broker_endpoint.h"
 #include "config/convert.h"
 #include "config/data_directory_path.h"
@@ -16,6 +17,9 @@
 #include "config/seed_server.h"
 #include "config_store.h"
 
+#include <algorithm>
+#include <iterator>
+
 namespace config {
 
 struct node_config final : public config_store {
@@ -31,7 +35,7 @@ struct node_config final : public config_store {
     property<tls_config> rpc_server_tls;
 
     // Kafka RPC listener
-    one_or_many_property<model::broker_endpoint> kafka_api;
+    one_or_many_property<config::broker_authn_endpoint> kafka_api;
     one_or_many_property<endpoint_tls_config> kafka_api_tls;
 
     // Admin API listener
@@ -55,9 +59,19 @@ struct node_config final : public config_store {
         return data_directory().path / "pid.lock";
     }
 
-    const std::vector<model::broker_endpoint>& advertised_kafka_api() const {
+    std::vector<model::broker_endpoint> advertised_kafka_api() const {
         if (_advertised_kafka_api().empty()) {
-            return kafka_api();
+            std::vector<model::broker_endpoint> eps;
+            auto api = kafka_api();
+            eps.reserve(api.size());
+            std::transform(
+              std::make_move_iterator(api.begin()),
+              std::make_move_iterator(api.end()),
+              std::back_inserter(eps),
+              [](auto ep) {
+                  return model::broker_endpoint{ep.name, ep.address};
+              });
+            return eps;
         }
         return _advertised_kafka_api();
     }
diff --git a/src/v/kafka/server/handlers/describe_configs.cc b/src/v/kafka/server/handlers/describe_configs.cc
index ff25d103c4456..f713130653989 100644
--- a/src/v/kafka/server/handlers/describe_configs.cc
+++ b/src/v/kafka/server/handlers/describe_configs.cc
@@ -265,6 +265,24 @@ kafka_endpoint_format(const std::vector<model::broker_endpoint>& endpoints) {
     return ssx::sformat("{}", fmt::join(uris, ","));
 }
 
+static ss::sstring kafka_authn_endpoint_format(
+  const std::vector<config::broker_authn_endpoint>& endpoints) {
+    std::vector<ss::sstring> uris;
+    uris.reserve(endpoints.size());
+    std::transform(
+      endpoints.cbegin(),
+      endpoints.cend(),
+      std::back_inserter(uris),
+      [](const config::broker_authn_endpoint& ep) {
+          return ssx::sformat(
+            "{}://{}:{}",
+            (ep.name.empty() ? "plain" : ep.name),
+            ep.address.host(),
+            ep.address.port());
+      });
+    return ssx::sformat("{}", fmt::join(uris, ","));
+}
+
 static void report_broker_config(
   const describe_configs_resource& resource,
   describe_configs_result& result,
@@ -299,7 +317,7 @@ static void report_broker_config(
       "listeners",
       config::node().kafka_api,
       include_synonyms,
-      &kafka_endpoint_format);
+      &kafka_authn_endpoint_format);
 
     add_broker_config_if_requested(
       resource,
diff --git a/src/v/redpanda/tests/fixture.h b/src/v/redpanda/tests/fixture.h
index b5986f36ecaf7..c90622550b264 100644
--- a/src/v/redpanda/tests/fixture.h
+++ b/src/v/redpanda/tests/fixture.h
@@ -19,6 +19,7 @@
 #include "cluster/shard_table.h"
 #include "cluster/topics_frontend.h"
 #include "cluster/types.h"
+#include "config/broker_authn_endpoint.h"
 #include "config/node_config.h"
 #include "coproc/api.h"
 #include "kafka/client/transport.h"
@@ -183,9 +184,10 @@ class redpanda_thread_fixture {
             node_config.get("rpc_server")
               .set_value(net::unresolved_address("127.0.0.1", rpc_port));
             node_config.get("kafka_api")
-              .set_value(
-                std::vector<model::broker_endpoint>{model::broker_endpoint(
-                  net::unresolved_address("127.0.0.1", kafka_port))});
+              .set_value(std::vector<config::broker_authn_endpoint>{
+                config::broker_authn_endpoint{
+                  .address = net::unresolved_address(
+                    "127.0.0.1", kafka_port)}});
             node_config.get("data_directory")
               .set_value(config::data_directory_path{.path = base_path});
             node_config.get("coproc_supervisor_server")

From 17a5e26a3d6af3c7d4a27126f9b451b12102f24f Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Tue, 21 Jun 2022 13:54:01 +0100
Subject: [PATCH 160/201] auth: Introduce kafka_enable_authorization

This overrides `enable_sasl` as a flag for enabling authorization.

A future commit will wire this up; existing behaviour is
preserved if the flag is not set.

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/config/configuration.cc           | 14 +++++++++-
 src/v/config/configuration.h            |  1 +
 src/v/kafka/server/connection_context.h | 12 ++++-----
 src/v/kafka/server/protocol.cc          | 34 +++++++++++++++++++++++--
 4 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/src/v/config/configuration.cc b/src/v/config/configuration.cc
index c386ffffdaf7a..745908ba01368 100644
--- a/src/v/config/configuration.cc
+++ b/src/v/config/configuration.cc
@@ -18,6 +18,7 @@
 #include "units.h"
 
 #include <cstdint>
+#include <optional>
 
 namespace config {
 using namespace std::chrono_literals;
@@ -780,9 +781,20 @@ configuration::configuration()
   , enable_sasl(
       *this,
       "enable_sasl",
-      "Enable SASL authentication for Kafka connections.",
+      "Enable SASL authentication for Kafka connections, authorization is "
+      "required. see also `kafka_enable_authorization`",
       {.needs_restart = needs_restart::no, .visibility = visibility::user},
       false)
+  , kafka_enable_authorization(
+      *this,
+      "kafka_enable_authorization",
+      "Enable authorization for Kafka connections. Values:"
+      "- `nil`: Ignored. Authorization is enabled with `enable_sasl: true`"
+      "; `true`: authorization is required"
+      "; `false`: authorization is disabled"
+      ". See also: `enable_sasl` and `kafka_api[].authentication_method`",
+      {.needs_restart = needs_restart::no, .visibility = visibility::user},
+      std::nullopt)
   , controller_backend_housekeeping_interval_ms(
       *this,
       "controller_backend_housekeeping_interval_ms",
diff --git a/src/v/config/configuration.h b/src/v/config/configuration.h
index 10fa20a4e7d00..b01b285a2c716 100644
--- a/src/v/config/configuration.h
+++ b/src/v/config/configuration.h
@@ -171,6 +171,7 @@ struct configuration final : public config_store {
     property<int16_t> id_allocator_log_capacity;
     property<int16_t> id_allocator_batch_size;
     property<bool> enable_sasl;
+    property<std::optional<bool>> kafka_enable_authorization;
     property<std::chrono::milliseconds>
       controller_backend_housekeeping_interval_ms;
     property<std::chrono::milliseconds> node_management_operation_timeout_ms;
diff --git a/src/v/kafka/server/connection_context.h b/src/v/kafka/server/connection_context.h
index 4276f8f9da652..fb1adc73de22e 100644
--- a/src/v/kafka/server/connection_context.h
+++ b/src/v/kafka/server/connection_context.h
@@ -68,6 +68,10 @@ class connection_context final
     template<typename T>
     bool authorized(
       security::acl_operation operation, const T& name, authz_quiet quiet) {
+        // authorization disabled?
+        if (!_enable_authorizer) {
+            return true;
+        }
         // mtls configured?
         if (_use_mtls) {
             if (_mtls_principal.has_value()) {
@@ -76,12 +80,8 @@ class connection_context final
             }
             return false;
         }
-        // sasl configured?
-        if (!_enable_authorizer) {
-            return true;
-        }
-        auto user = sasl().principal();
-        return authorized_user(std::move(user), operation, name, quiet);
+        // use sasl
+        return authorized_user(sasl().principal(), operation, name, quiet);
     }
 
     template<typename T>
diff --git a/src/v/kafka/server/protocol.cc b/src/v/kafka/server/protocol.cc
index d5a7ba1c004e5..d36f36d1848f7 100644
--- a/src/v/kafka/server/protocol.cc
+++ b/src/v/kafka/server/protocol.cc
@@ -12,6 +12,7 @@
 #include "cluster/topics_frontend.h"
 #include "config/broker_authn_endpoint.h"
 #include "config/configuration.h"
+#include "config/node_config.h"
 #include "kafka/server/connection_context.h"
 #include "kafka/server/coordinator_ntp_mapper.h"
 #include "kafka/server/group_router.h"
@@ -95,10 +96,36 @@ coordinator_ntp_mapper& protocol::coordinator_mapper() {
 }
 
 config::broker_authn_method get_authn_method(const net::connection& conn) {
+    // If authn_method is set on the endpoint
+    //    Use it
+    // Else if kafka_enable_authorization is not set
+    //    Use sasl if enable_sasl
+    // Else if has mtls mapping rules
+    //    Use mtls_identity
+    // Else
+    //    Disable AuthN
+
+    std::optional<config::broker_authn_method> authn_method;
+    auto n = conn.name();
+    const auto& kafka_api = config::node().kafka_api.value();
+    auto ep_it = std::find_if(
+      kafka_api.begin(),
+      kafka_api.end(),
+      [&n](const config::broker_authn_endpoint& ep) { return ep.name == n; });
+    if (ep_it != kafka_api.end()) {
+        authn_method = ep_it->authn_method;
+    }
+    if (authn_method.has_value()) {
+        return *authn_method;
+    }
     const auto& config = config::shard_local_cfg();
-    if (config.enable_sasl()) {
+    // if kafka_enable_authorization is not set, use sasl iff enable_sasl
+    if (
+      !config.kafka_enable_authorization().has_value()
+      && config.enable_sasl()) {
         return config::broker_authn_method::sasl;
     }
+    // mtls_identity is currently predicated on having mapping rules
     if (conn.get_principal_mapping().has_value()) {
         return config::broker_authn_method::mtls_identity;
     }
@@ -106,6 +133,9 @@ config::broker_authn_method get_authn_method(const net::connection& conn) {
 }
 
 ss::future<> protocol::apply(net::server::resources rs) {
+    const bool authz_enabled
+      = config::shard_local_cfg().kafka_enable_authorization().value_or(
+        config::shard_local_cfg().enable_sasl());
     const auto authn_method = get_authn_method(*rs.conn);
 
     /*
@@ -121,7 +151,7 @@ ss::future<> protocol::apply(net::server::resources rs) {
       *this,
       std::move(rs),
       std::move(sasl),
-      authn_method != config::broker_authn_method::none,
+      authz_enabled,
       authn_method == config::broker_authn_method::mtls_identity);
 
     return ss::do_until(

From 4b4ff85baf0ec81381b213e456981f5465dae365 Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Wed, 22 Jun 2022 18:08:26 +0100
Subject: [PATCH 161/201] auth: Fix ducktape acls_test due to config changes

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 tests/rptest/services/redpanda.py             | 32 +++++++++++--
 tests/rptest/services/templates/redpanda.yaml |  6 +++
 tests/rptest/tests/acls_test.py               | 46 +++++++++++++------
 3 files changed, 66 insertions(+), 18 deletions(-)

diff --git a/tests/rptest/services/redpanda.py b/tests/rptest/services/redpanda.py
index e7109b4fe857b..5b6a0c5de89e3 100644
--- a/tests/rptest/services/redpanda.py
+++ b/tests/rptest/services/redpanda.py
@@ -357,10 +357,18 @@ class SecurityConfig:
 
     def __init__(self):
         self.enable_sasl = False
+        self.kafka_enable_authorization: Optional[bool] = None
+        self.endpoint_authn_method: Optional[str] = None
         self.tls_provider: Optional[TLSProvider] = None
 
-        # extract principal from mtls distinguished name
-        self.enable_mtls_identity = False
+    # sasl is required
+    def sasl_enabled(self):
+        return (self.kafka_enable_authorization is None
+                and self.enable_sasl) or self.endpoint_authn_method == "sasl"
+
+    # principal is extracted from mtls distinguished name
+    def mtls_identity_enabled(self):
+        return self.endpoint_authn_method == "mtls_identity"
 
 
 class RedpandaService(Service):
@@ -546,7 +554,13 @@ def _init_tls(self):
                 self, "redpanda.service.admin")
 
     def sasl_enabled(self):
-        return self._security.enable_sasl
+        return self._security.sasl_enabled()
+
+    def mtls_identity_enabled(self):
+        return self._security.mtls_identity_enabled()
+
+    def endpoint_authn_method(self):
+        return self._security.endpoint_authn_method
 
     @property
     def dedicated_nodes(self):
@@ -1294,7 +1308,8 @@ def write_node_conf_file(self, node, override_cfg_params=None):
                            enable_pp=self._enable_pp,
                            enable_sr=self._enable_sr,
                            superuser=self._superuser,
-                           sasl_enabled=self.sasl_enabled())
+                           sasl_enabled=self.sasl_enabled(),
+                           endpoint_authn_method=self.endpoint_authn_method())
 
         if override_cfg_params or self._extra_node_conf[node]:
             doc = yaml.full_load(conf)
@@ -1317,7 +1332,7 @@ def write_node_conf_file(self, node, override_cfg_params=None):
                 cert_file=RedpandaService.TLS_SERVER_CRT_FILE,
                 truststore_file=RedpandaService.TLS_CA_CRT_FILE,
             )
-            if self._security.enable_mtls_identity:
+            if self.mtls_identity_enabled():
                 tls_config.update(
                     dict(principal_mapping_rules=SecurityConfig.
                          PRINCIPAL_MAPPING_RULES, ))
@@ -1341,6 +1356,13 @@ def write_bootstrap_cluster_config(self):
         if self._security.enable_sasl:
             self.logger.debug("Enabling SASL in cluster configuration")
             conf.update(dict(enable_sasl=True))
+        if self._security.kafka_enable_authorization is not None:
+            self.logger.debug(
+                f"Setting kafka_enable_authorization: {self._security.kafka_enable_authorization} in cluster configuration"
+            )
+            conf.update(
+                dict(kafka_enable_authorization=self._security.
+                     kafka_enable_authorization))
 
         conf_yaml = yaml.dump(conf)
         for node in self.nodes:
diff --git a/tests/rptest/services/templates/redpanda.yaml b/tests/rptest/services/templates/redpanda.yaml
index 18ec6f1d3e13a..c68605ae3f710 100644
--- a/tests/rptest/services/templates/redpanda.yaml
+++ b/tests/rptest/services/templates/redpanda.yaml
@@ -21,9 +21,15 @@ redpanda:
     - name: dnslistener
       address: "{{node.account.hostname}}"
       port: 9092
+      {% if endpoint_authn_method %}
+      authentication_method: {{ endpoint_authn_method }}
+      {% endif %}
     - name: iplistener
       address: "{{node_ip}}"
       port: {{kafka_alternate_port}}
+      {% if endpoint_authn_method %}
+      authentication_method: {{ endpoint_authn_method }}
+      {% endif %}
   admin:
     - address: 127.0.0.1
       port: 9644
diff --git a/tests/rptest/tests/acls_test.py b/tests/rptest/tests/acls_test.py
index 4d839a9676e25..7f353fab4ce1e 100644
--- a/tests/rptest/tests/acls_test.py
+++ b/tests/rptest/tests/acls_test.py
@@ -46,10 +46,15 @@ def setUp(self):
         # it with custom security settings
         return
 
-    def prepare_cluster(self, use_tls, use_sasl):
+    def prepare_cluster(self,
+                        use_tls,
+                        use_sasl,
+                        enable_authz=None,
+                        authn_method=None):
         self.security = SecurityConfig()
         self.security.enable_sasl = use_sasl
-        self.security.enable_mtls_identity = use_tls and not use_sasl
+        self.security.kafka_enable_authorization = enable_authz
+        self.security.endpoint_authn_method = authn_method
 
         if use_tls:
             self.tls = tls.TLSCertManager(self.logger)
@@ -78,7 +83,7 @@ def prepare_cluster(self, use_tls, use_sasl):
 
         admin = Admin(self.redpanda)
 
-        if self.security.enable_mtls_identity:
+        if self.security.mtls_identity_enabled():
             feature_name = "mtls_authentication"
             admin.put_feature(feature_name, {"state": "active"})
 
@@ -92,11 +97,11 @@ def check_feature_active():
             wait_until(check_feature_active, timeout_sec=10, backoff_sec=1)
 
         # base case user is not a superuser and has no configured ACLs
-        if use_sasl:
+        if use_sasl or enable_authz:
             admin.create_user("base", self.password, self.algorithm)
 
         # only grant cluster describe permission to user cluster_describe
-        if use_sasl:
+        if use_sasl or enable_authz:
             admin.create_user("cluster_describe", self.password,
                               self.algorithm)
         client = self.get_super_client()
@@ -105,7 +110,7 @@ def check_feature_active():
         # there is not a convenient interface for waiting for acls to propogate
         # to all nodes so when we are using mtls only for identity we inject a
         # sleep here to try to avoid any acl propogation races.
-        if self.security.enable_mtls_identity:
+        if self.security.mtls_identity_enabled():
             time.sleep(5)
             return
 
@@ -120,7 +125,7 @@ def users_propogated():
         wait_until(users_propogated, timeout_sec=10, backoff_sec=1)
 
     def get_client(self, username):
-        if self.security.enable_mtls_identity:
+        if self.security.mtls_identity_enabled():
             if username == "base":
                 cert = self.base_user_cert
             elif username == "cluster_describe":
@@ -140,7 +145,7 @@ def get_client(self, username):
                        tls_cert=cert)
 
     def get_super_client(self):
-        if self.security.enable_mtls_identity:
+        if self.security.mtls_identity_enabled():
             return RpkTool(self.redpanda, tls_cert=self.admin_user_cert)
 
         username, password, _ = self.redpanda.SUPERUSER_CREDENTIALS
@@ -154,16 +159,31 @@ def get_super_client(self):
                        sasl_mechanism=self.algorithm,
                        tls_cert=cert)
 
+    # The old config style has use_sasl at the top level, which enables
+    # authorization. New config style has kafka_enable_authorization at the
+    # top-level, with authentication_method on the listener.
     @cluster(num_nodes=3)
+    # plaintext conn + sasl for authn (global sasl config)
     @parametrize(use_tls=False,
-                 use_sasl=True)  # plaintext conn + sasl for authn
-    @parametrize(use_tls=True, use_sasl=True)  # ssl/tls conn + sasl for authn
-    @parametrize(use_tls=True, use_sasl=False)  # ssl/tls conn + mtls for authn
-    def test_describe_acls(self, use_tls, use_sasl):
+                 use_sasl=True,
+                 enable_authz=None,
+                 authn_method=None)
+    # ssl/tls conn + sasl for authn (global sasl config)
+    @parametrize(use_tls=True,
+                 use_sasl=True,
+                 enable_authz=None,
+                 authn_method=None)
+    # ssl/tls conn + mtls for authn (listener mtls config)
+    @parametrize(use_tls=True,
+                 use_sasl=False,
+                 enable_authz=True,
+                 authn_method="mtls_identity")
+    def test_describe_acls(self, use_tls, use_sasl, enable_authz,
+                           authn_method):
         """
         security::acl_operation::describe, security::default_cluster_name
         """
-        self.prepare_cluster(use_tls, use_sasl)
+        self.prepare_cluster(use_tls, use_sasl, enable_authz, authn_method)
 
         # run a few times for good health
         for _ in range(5):

From 819205c338450c1b25bb0b321b14eca26369d76e Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Wed, 29 Jun 2022 09:53:42 +0100
Subject: [PATCH 162/201] auth: Add Ducktape auth tests for new endpoint
 configuration

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 tests/rptest/tests/acls_test.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/rptest/tests/acls_test.py b/tests/rptest/tests/acls_test.py
index 7f353fab4ce1e..47f1daf40ca76 100644
--- a/tests/rptest/tests/acls_test.py
+++ b/tests/rptest/tests/acls_test.py
@@ -173,6 +173,11 @@ def get_super_client(self):
                  use_sasl=True,
                  enable_authz=None,
                  authn_method=None)
+    # ssl/tls conn + sasl for authn (listener sasl config)
+    @parametrize(use_tls=True,
+                 use_sasl=False,
+                 enable_authz=True,
+                 authn_method="sasl")
     # ssl/tls conn + mtls for authn (listener mtls config)
     @parametrize(use_tls=True,
                  use_sasl=False,

From 63baa0122f0a1bb8d8a06f9b8b137bc5938d59ab Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Wed, 13 Jul 2022 01:31:46 +0100
Subject: [PATCH 163/201] auth: Add Ducktape auth test for disabling authz

Ensure that `kafka_enable_authorization: false` overrides
`enable_sasl: true` with regard to authz

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 tests/rptest/tests/acls_test.py | 36 +++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/tests/rptest/tests/acls_test.py b/tests/rptest/tests/acls_test.py
index 47f1daf40ca76..08d2194b2324e 100644
--- a/tests/rptest/tests/acls_test.py
+++ b/tests/rptest/tests/acls_test.py
@@ -125,7 +125,8 @@ def users_propogated():
         wait_until(users_propogated, timeout_sec=10, backoff_sec=1)
 
     def get_client(self, username):
-        if self.security.mtls_identity_enabled():
+        if self.security.mtls_identity_enabled(
+        ) or not self.security.sasl_enabled():
             if username == "base":
                 cert = self.base_user_cert
             elif username == "cluster_describe":
@@ -145,7 +146,8 @@ def get_client(self, username):
                        tls_cert=cert)
 
     def get_super_client(self):
-        if self.security.mtls_identity_enabled():
+        if self.security.mtls_identity_enabled(
+        ) or not self.security.sasl_enabled():
             return RpkTool(self.redpanda, tls_cert=self.admin_user_cert)
 
         username, password, _ = self.redpanda.SUPERUSER_CREDENTIALS
@@ -183,8 +185,30 @@ def get_super_client(self):
                  use_sasl=False,
                  enable_authz=True,
                  authn_method="mtls_identity")
-    def test_describe_acls(self, use_tls, use_sasl, enable_authz,
-                           authn_method):
+    # Disable authz
+    @parametrize(use_tls=True,
+                 use_sasl=True,
+                 enable_authz=False,
+                 authn_method=None,
+                 always_succeed=True)
+    # Disable authz
+    @parametrize(use_tls=True,
+                 use_sasl=True,
+                 enable_authz=False,
+                 authn_method="sasl",
+                 always_succeed=True)
+    # Disable authz
+    @parametrize(use_tls=True,
+                 use_sasl=True,
+                 enable_authz=False,
+                 authn_method="mtls_identity",
+                 always_succeed=True)
+    def test_describe_acls(self,
+                           use_tls,
+                           use_sasl,
+                           enable_authz,
+                           authn_method,
+                           always_succeed=False):
         """
         security::acl_operation::describe, security::default_cluster_name
         """
@@ -194,9 +218,9 @@ def test_describe_acls(self, use_tls, use_sasl, enable_authz,
         for _ in range(5):
             try:
                 self.get_client("base").acl_list()
-                assert False, "list acls should have failed"
+                assert always_succeed, "list acls should have failed"
             except ClusterAuthorizationError:
-                pass
+                assert not always_succeed
 
             self.get_client("cluster_describe").acl_list()
             self.get_super_client().acl_list()

From 9fa11d8fdbb140f221c09254bfc1168cb3c3ca43 Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Wed, 11 May 2022 12:35:56 +0100
Subject: [PATCH 164/201] auth: Allow config of principal_mapping_rules

The configuration will be moved in a later commit.

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 tests/rptest/services/redpanda.py | 9 ++++++---
 tests/rptest/tests/acls_test.py   | 7 ++++++-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/tests/rptest/services/redpanda.py b/tests/rptest/services/redpanda.py
index 5b6a0c5de89e3..e22875b80a9c0 100644
--- a/tests/rptest/services/redpanda.py
+++ b/tests/rptest/services/redpanda.py
@@ -353,7 +353,7 @@ class SecurityConfig:
     # the rules, so instead we use a fixed mapping and arrange for certs to use
     # a similar format. this will change when we get closer to GA and the
     # configuration becomes more general.
-    PRINCIPAL_MAPPING_RULES = "RULE:^O=Redpanda,CN=(.*?)$/$1/L, DEFAULT"
+    __DEFAULT_PRINCIPAL_MAPPING_RULES = "RULE:^O=Redpanda,CN=(.*?)$/$1/L, DEFAULT"
 
     def __init__(self):
         self.enable_sasl = False
@@ -361,6 +361,9 @@ def __init__(self):
         self.endpoint_authn_method: Optional[str] = None
         self.tls_provider: Optional[TLSProvider] = None
 
+        # The rules to extract principal from mtls
+        self.principal_mapping_rules = self.__DEFAULT_PRINCIPAL_MAPPING_RULES
+
     # sasl is required
     def sasl_enabled(self):
         return (self.kafka_enable_authorization is None
@@ -1334,8 +1337,8 @@ def write_node_conf_file(self, node, override_cfg_params=None):
             )
             if self.mtls_identity_enabled():
                 tls_config.update(
-                    dict(principal_mapping_rules=SecurityConfig.
-                         PRINCIPAL_MAPPING_RULES, ))
+                    dict(principal_mapping_rules=self._security.
+                         principal_mapping_rules))
             doc = yaml.full_load(conf)
             doc["redpanda"].update(dict(kafka_api_tls=tls_config))
             conf = yaml.dump(doc)
diff --git a/tests/rptest/tests/acls_test.py b/tests/rptest/tests/acls_test.py
index 08d2194b2324e..01fb6de039f24 100644
--- a/tests/rptest/tests/acls_test.py
+++ b/tests/rptest/tests/acls_test.py
@@ -50,7 +50,8 @@ def prepare_cluster(self,
                         use_tls,
                         use_sasl,
                         enable_authz=None,
-                        authn_method=None):
+                        authn_method=None,
+                        principal_mapping_rules=None):
         self.security = SecurityConfig()
         self.security.enable_sasl = use_sasl
         self.security.kafka_enable_authorization = enable_authz
@@ -78,6 +79,10 @@ def prepare_cluster(self,
 
             self.security.tls_provider = MTLSProvider(self.tls)
 
+        if self.security.mtls_identity_enabled(
+        ) and principal_mapping_rules is not None:
+            self.security.principal_mapping_rules = principal_mapping_rules
+
         self.redpanda.set_security_settings(self.security)
         self.redpanda.start()
 

From 3f480196d0caa568b4afce0ef46afdd3ed49f70d Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Wed, 11 May 2022 16:07:44 +0100
Subject: [PATCH 165/201] auth: Add tests for extracting principal from mTLS
 Subject

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 tests/rptest/tests/acls_test.py | 44 ++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/tests/rptest/tests/acls_test.py b/tests/rptest/tests/acls_test.py
index 01fb6de039f24..86eefb0cf9894 100644
--- a/tests/rptest/tests/acls_test.py
+++ b/tests/rptest/tests/acls_test.py
@@ -31,7 +31,9 @@ def create_broker_cert(self, redpanda, node):
         return self.tls.create_cert(node.name)
 
     def create_service_client_cert(self, _, name):
-        return self.tls.create_cert(socket.gethostname(), name=name)
+        return self.tls.create_cert(socket.gethostname(),
+                                    name=name,
+                                    common_name=name)
 
 
 class AccessControlListTest(RedpandaTest):
@@ -229,3 +231,43 @@ def test_describe_acls(self,
 
             self.get_client("cluster_describe").acl_list()
             self.get_super_client().acl_list()
+
+    # Test mtls identity
+    # Principals in use:
+    # * redpanda.service.admin: the default admin client
+    # * admin: used for acl bootstrap
+    # * cluster_describe: the principal under test
+    @cluster(num_nodes=3)
+    # DEFAULT: The whole SAN
+    @parametrize(rules="DEFAULT", fail=True)
+    #  Match admin, or O (Redpanda)
+    @parametrize(
+        rules=
+        "RULE:^O=Redpanda,CN=(redpanda.service.admin|admin)$/$1/, RULE:^O=([^,]+),CN=(.*?)$/$1/",
+        fail=True)
+    # Wrong Case
+    @parametrize(rules="RULE:^O=Redpanda,CN=(.*?)$/$1/U", fail=True)
+    # Match CN
+    @parametrize(rules="RULE:^O=Redpanda,CN=(.*?)$/$1/L", fail=False)
+    # Full Match
+    @parametrize(
+        rules=
+        "RULE:^O=Redpanda,CN=(cluster_describe|redpanda.service.admin|admin)$/$1/",
+        fail=False)
+    def test_mtls_principal(self, rules=None, fail=False):
+        """
+        security::acl_operation::describe, security::default_cluster_name
+        """
+        self.prepare_cluster(use_tls=True,
+                             use_sasl=False,
+                             enable_authz=True,
+                             authn_method="mtls_identity",
+                             principal_mapping_rules=rules)
+
+        # run a few times for good health
+        for _ in range(5):
+            try:
+                self.get_client("cluster_describe").acl_list()
+                assert not fail, "list acls should have failed"
+            except ClusterAuthorizationError:
+                assert fail, "list acls should have succeeded"

From 13a9383be8b9527e0a693036616ec37a520bb2bb Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Thu, 16 Jun 2022 20:21:30 +0100
Subject: [PATCH 166/201] auth: Add a mechanism to validate mapping rules

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/security/mtls.cc | 10 ++++++++++
 src/v/security/mtls.h  |  3 +++
 2 files changed, 13 insertions(+)

diff --git a/src/v/security/mtls.cc b/src/v/security/mtls.cc
index 5f5e7c465da5a..99f679a8ff7f9 100644
--- a/src/v/security/mtls.cc
+++ b/src/v/security/mtls.cc
@@ -147,6 +147,16 @@ std::optional<ss::sstring> rule::apply(std::string_view dn) const {
     return result;
 }
 
+std::optional<ss::sstring>
+validate_rules(const std::optional<ss::sstring>& r) noexcept {
+    try {
+        security::tls::detail::parse_rules(r);
+    } catch (const std::exception& e) {
+        return e.what();
+    }
+    return std::nullopt;
+}
+
 std::ostream& operator<<(std::ostream& os, const rule& r) {
     fmt::print(os, "{}", r);
     return os;
diff --git a/src/v/security/mtls.h b/src/v/security/mtls.h
index 492f7d30dd719..1c8d51a8c52ab 100644
--- a/src/v/security/mtls.h
+++ b/src/v/security/mtls.h
@@ -82,6 +82,9 @@ class principal_mapper {
     std::vector<rule> _rules;
 };
 
+std::optional<ss::sstring>
+validate_rules(const std::optional<ss::sstring>& r) noexcept;
+
 } // namespace security::tls
 
 template<>

From 00acfbfdad78b353ef3c39fc7f9267414cba6168 Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Tue, 21 Jun 2022 20:41:16 +0100
Subject: [PATCH 167/201] auth: Add kafka_mtls_principal_mapping_rules

A future commit will wire this up and replace the rules currently
in tls_config.

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/config/configuration.cc | 8 ++++++++
 src/v/config/configuration.h  | 1 +
 2 files changed, 9 insertions(+)

diff --git a/src/v/config/configuration.cc b/src/v/config/configuration.cc
index 745908ba01368..297e6abac797d 100644
--- a/src/v/config/configuration.cc
+++ b/src/v/config/configuration.cc
@@ -13,6 +13,7 @@
 #include "config/node_config.h"
 #include "config/validators.h"
 #include "model/metadata.h"
+#include "security/mtls.h"
 #include "storage/chunk_cache.h"
 #include "storage/segment_appender.h"
 #include "units.h"
@@ -795,6 +796,13 @@ configuration::configuration()
       ". See also: `enable_sasl` and `kafka_api[].authentication_method`",
       {.needs_restart = needs_restart::no, .visibility = visibility::user},
       std::nullopt)
+  , kafka_mtls_principal_mapping_rules(
+      *this,
+      "kafka_mtls_principal_mapping_rules",
+      "Principal Mapping Rules for mTLS Authentication on the Kafka API",
+      {.needs_restart = needs_restart::no, .visibility = visibility::user},
+      std::nullopt,
+      security::tls::validate_rules)
   , controller_backend_housekeeping_interval_ms(
       *this,
       "controller_backend_housekeeping_interval_ms",
diff --git a/src/v/config/configuration.h b/src/v/config/configuration.h
index b01b285a2c716..b9f6f4c046a1e 100644
--- a/src/v/config/configuration.h
+++ b/src/v/config/configuration.h
@@ -172,6 +172,7 @@ struct configuration final : public config_store {
     property<int16_t> id_allocator_batch_size;
     property<bool> enable_sasl;
     property<std::optional<bool>> kafka_enable_authorization;
+    property<std::optional<ss::sstring>> kafka_mtls_principal_mapping_rules;
     property<std::chrono::milliseconds>
       controller_backend_housekeeping_interval_ms;
     property<std::chrono::milliseconds> node_management_operation_timeout_ms;

From 79b2f8d5cd927bbcfb0d7d1a3e56d9d7b001647d Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Thu, 30 Jun 2022 16:40:55 +0100
Subject: [PATCH 168/201] security: Add missing header guard

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/security/errc.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/v/security/errc.h b/src/v/security/errc.h
index af7ef3e1645d3..0373cfd90de40 100644
--- a/src/v/security/errc.h
+++ b/src/v/security/errc.h
@@ -8,6 +8,8 @@
  * the Business Source License, use of this software will be governed
  * by the Apache License, Version 2.0
  */
+#pragma once
+
 #include "outcome.h"
 
 namespace security {

From 035d307d2ab50ba6de9bc5774cdc71ce827fef86 Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Mon, 11 Jul 2022 12:02:58 +0100
Subject: [PATCH 169/201] auth: Refactor mtls_principal_mapper

* Move most of the usage from connection_context to protocol
  * There's now an instance per protocol, rather than per connection
  * Getting the principal is now done at connection startup
* Move mapping rules from tls_config
  * Rely on mapping rules from top-level config
  * Remove from tls_config
  * Mapping rules are now runtime via cluster config

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/config/rjson_serialization.cc           |  5 --
 src/v/config/tls_config.h                     | 49 +++------------
 src/v/kafka/server/connection_context.cc      | 55 +----------------
 src/v/kafka/server/connection_context.h       | 18 +++---
 src/v/kafka/server/protocol.cc                | 59 ++++++++++++++-----
 src/v/kafka/server/protocol.h                 |  2 +
 .../kafka/server/tests/request_parser_test.cc |  2 +-
 src/v/redpanda/application.cc                 | 13 +---
 src/v/redpanda/tests/fixture.h                |  2 +-
 src/v/rpc/test/rpc_gen_cycling_test.cc        |  9 +--
 src/v/security/mtls.cc                        | 16 +++++
 src/v/security/mtls.h                         | 33 +++++------
 src/v/security/tests/mtls_test.cc             | 29 ++++++---
 tests/rptest/services/redpanda.py             |  7 +--
 tests/rptest/tests/acls_test.py               | 10 +++-
 15 files changed, 129 insertions(+), 180 deletions(-)

diff --git a/src/v/config/rjson_serialization.cc b/src/v/config/rjson_serialization.cc
index 59fee261d2d46..f1c2f6a012d3e 100644
--- a/src/v/config/rjson_serialization.cc
+++ b/src/v/config/rjson_serialization.cc
@@ -60,11 +60,6 @@ void rjson_serialize_impl(
         w.Key("truststore_file");
         w.String((*(v.get_truststore_file())).c_str());
     }
-
-    if (v.get_principal_mapping_rules()) {
-        w.Key("principal_mapping_rules");
-        w.String(*v.get_principal_mapping_rules());
-    }
 }
 
 void rjson_serialize(
diff --git a/src/v/config/tls_config.h b/src/v/config/tls_config.h
index 3389a327f9182..fb461da7d1e90 100644
--- a/src/v/config/tls_config.h
+++ b/src/v/config/tls_config.h
@@ -23,7 +23,6 @@
 #include <seastar/net/tls.hh>
 
 #include <boost/filesystem.hpp>
-#include <security/mtls.h>
 #include <yaml-cpp/yaml.h>
 
 #include <optional>
@@ -57,13 +56,11 @@ class tls_config {
       bool enabled,
       std::optional<key_cert> key_cert,
       std::optional<ss::sstring> truststore,
-      bool require_client_auth,
-      std::optional<ss::sstring> principal_mapping_rules)
+      bool require_client_auth)
       : _enabled(enabled)
       , _key_cert(std::move(key_cert))
       , _truststore_file(std::move(truststore))
-      , _require_client_auth(require_client_auth)
-      , _principal_mapping_rules(std::move(principal_mapping_rules)) {}
+      , _require_client_auth(require_client_auth) {}
 
     bool is_enabled() const { return _enabled; }
 
@@ -77,10 +74,6 @@ class tls_config {
 
     bool get_require_client_auth() const { return _require_client_auth; }
 
-    const std::optional<ss::sstring>& get_principal_mapping_rules() const {
-        return _principal_mapping_rules;
-    }
-
     ss::future<std::optional<ss::tls::credentials_builder>>
     get_credentials_builder() const& {
         if (_enabled) {
@@ -125,19 +118,6 @@ class tls_config {
             return "Trust store is required when client authentication is "
                    "enabled";
         }
-        if (c.get_principal_mapping_rules()) {
-            if (!c.get_require_client_auth()) {
-                return "Client authentication is required when principal "
-                       "mapping rules are set";
-            }
-            // Validate regex of the mapping rules
-            try {
-                security::tls::detail::parse_rules(
-                  c.get_principal_mapping_rules());
-            } catch (const std::runtime_error& e) {
-                return e.what();
-            }
-        }
 
         return std::nullopt;
     }
@@ -150,12 +130,9 @@ class tls_config {
           << "enabled: " << c.is_enabled() << " "
           << "key/cert files: " << c.get_key_cert_files() << " "
           << "ca file: " << c.get_truststore_file() << " "
-          << "client_auth_required: " << c.get_require_client_auth();
-        if (c.get_principal_mapping_rules()) {
-            o << " principal_mapping_rules: "
-              << c.get_principal_mapping_rules();
-        }
-        return o << " }";
+          << "client_auth_required: " << c.get_require_client_auth() << ""
+          << " }";
+        return o;
     }
 
 private:
@@ -163,7 +140,6 @@ class tls_config {
     std::optional<key_cert> _key_cert;
     std::optional<ss::sstring> _truststore_file;
     bool _require_client_auth{false};
-    std::optional<ss::sstring> _principal_mapping_rules;
 };
 
 } // namespace config
@@ -203,11 +179,6 @@ struct convert<config::tls_config> {
             node["truststore_file"] = *rhs.get_truststore_file();
         }
 
-        if (rhs.get_principal_mapping_rules()) {
-            node["principal_mapping_rules"]
-              = *rhs.get_principal_mapping_rules();
-        }
-
         return node;
     }
 
@@ -228,8 +199,7 @@ struct convert<config::tls_config> {
         }
         auto enabled = node["enabled"] && node["enabled"].as<bool>();
         if (!enabled) {
-            rhs = config::tls_config(
-              false, std::nullopt, std::nullopt, false, std::nullopt);
+            rhs = config::tls_config(false, std::nullopt, std::nullopt, false);
         } else {
             auto key_cert
               = node["key_file"]
@@ -237,17 +207,12 @@ struct convert<config::tls_config> {
                     to_absolute(node["key_file"].as<ss::sstring>()),
                     to_absolute(node["cert_file"].as<ss::sstring>())})
                   : std::nullopt;
-            auto principal_mapping_rules
-              = node["principal_mapping_rules"]
-                  ? node["principal_mapping_rules"].as<ss::sstring>()
-                  : std::optional<ss::sstring>();
             rhs = config::tls_config(
               enabled,
               key_cert,
               to_absolute(read_optional(node, "truststore_file")),
               node["require_client_auth"]
-                && node["require_client_auth"].as<bool>(),
-              principal_mapping_rules);
+                && node["require_client_auth"].as<bool>());
         }
         return true;
     }
diff --git a/src/v/kafka/server/connection_context.cc b/src/v/kafka/server/connection_context.cc
index 4847f601013bd..1752617b8ff8e 100644
--- a/src/v/kafka/server/connection_context.cc
+++ b/src/v/kafka/server/connection_context.cc
@@ -68,10 +68,7 @@ ss::future<> connection_context::process_one_request() {
                       _rs.probe().header_corrupted();
                       return ss::make_ready_future<>();
                   }
-                  return handle_mtls_auth()
-                    .then([this, h = std::move(h.value()), s]() mutable {
-                        return dispatch_method_once(std::move(h), s);
-                    })
+                  return dispatch_method_once(std::move(h.value()), s)
                     .handle_exception_type([this](const std::bad_alloc&) {
                         // In general, dispatch_method_once does not throw,
                         // but bad_allocs are an exception.  Log it cleanly
@@ -87,56 +84,6 @@ ss::future<> connection_context::process_one_request() {
       });
 }
 
-/*
- * handle mtls authentication. this should only happen once when the connection
- * is setup. even though this is called in the normal request handling path,
- * this property should hold becuase:
- *
- * 1. is a noop if a mtls principal has been extracted
- * 2. all code paths that don't set the principal throw and drop the connection
- *
- * NOTE: handle_mtls_auth is called after reading header off the wire. this is
- * odd because we would expect that tls negotation etc... all happens before we
- * here to the application layer. however, it appears that the way seastar works
- * that we need to read some data off the wire to drive this process within the
- * internal connection handling.
- */
-ss::future<> connection_context::handle_mtls_auth() {
-    if (!_use_mtls || _mtls_principal.has_value()) {
-        return ss::now();
-    }
-    return ss::with_timeout(
-             model::timeout_clock::now() + 5s,
-             _rs.conn->get_distinguished_name())
-      .then([this](std::optional<ss::session_dn> dn) {
-          if (!dn.has_value()) {
-              throw security::exception(
-                security::errc::invalid_credentials,
-                "failed to fetch distinguished name");
-          }
-          /*
-           * for now it probably is fine to store the mapping per connection.
-           * but it seems like we could also share this across all connections
-           * with the same tls configuration.
-           */
-          _mtls_principal = _rs.conn->get_principal_mapping()->apply(
-            dn->subject);
-          if (!_mtls_principal) {
-              throw security::exception(
-                security::errc::invalid_credentials,
-                fmt::format(
-                  "failed to extract principal from distinguished name: {}",
-                  dn->subject));
-          }
-
-          vlog(
-            _authlog.debug,
-            "got principal: {}, from distinguished name: {}",
-            *_mtls_principal,
-            dn->subject);
-      });
-}
-
 /*
  * The SASL authentication flow for a client using version 0 of SASL handshake
  * doesn't use an envelope request for tokens. This method intercepts the
diff --git a/src/v/kafka/server/connection_context.h b/src/v/kafka/server/connection_context.h
index fb1adc73de22e..61a9d286296e8 100644
--- a/src/v/kafka/server/connection_context.h
+++ b/src/v/kafka/server/connection_context.h
@@ -14,6 +14,7 @@
 #include "net/server.h"
 #include "seastarx.h"
 #include "security/acl.h"
+#include "security/mtls.h"
 #include "security/sasl_authentication.h"
 #include "utils/hdr_hist.h"
 #include "utils/named_type.h"
@@ -45,7 +46,7 @@ class connection_context final
       net::server::resources&& r,
       security::sasl_server sasl,
       bool enable_authorizer,
-      bool use_mtls) noexcept
+      std::optional<security::tls::mtls_state> mtls_state) noexcept
       : _proto(p)
       , _rs(std::move(r))
       , _sasl(std::move(sasl))
@@ -53,7 +54,7 @@ class connection_context final
       , _client_addr(_rs.conn ? _rs.conn->addr.addr() : ss::net::inet_address{})
       , _enable_authorizer(enable_authorizer)
       , _authlog(_client_addr, client_port())
-      , _use_mtls(use_mtls) {}
+      , _mtls_state(std::move(mtls_state)) {}
 
     ~connection_context() noexcept = default;
     connection_context(const connection_context&) = delete;
@@ -73,12 +74,9 @@ class connection_context final
             return true;
         }
         // mtls configured?
-        if (_use_mtls) {
-            if (_mtls_principal.has_value()) {
-                return authorized_user(
-                  _mtls_principal.value(), operation, name, quiet);
-            }
-            return false;
+        if (_mtls_state) {
+            return authorized_user(
+              _mtls_state->principal(), operation, name, quiet);
         }
         // use sasl
         return authorized_user(sasl().principal(), operation, name, quiet);
@@ -164,7 +162,6 @@ class connection_context final
     ss::future<session_resources>
     throttle_request(const request_header&, size_t sz);
 
-    ss::future<> handle_mtls_auth();
     ss::future<> dispatch_method_once(request_header, size_t sz);
     ss::future<> process_next_response();
     ss::future<> do_process(request_context);
@@ -234,8 +231,7 @@ class connection_context final
     const ss::net::inet_address _client_addr;
     const bool _enable_authorizer;
     ctx_log _authlog;
-    bool _use_mtls{false};
-    std::optional<ss::sstring> _mtls_principal;
+    std::optional<security::tls::mtls_state> _mtls_state;
 };
 
 } // namespace kafka
diff --git a/src/v/kafka/server/protocol.cc b/src/v/kafka/server/protocol.cc
index d36f36d1848f7..2ed5678734380 100644
--- a/src/v/kafka/server/protocol.cc
+++ b/src/v/kafka/server/protocol.cc
@@ -20,6 +20,8 @@
 #include "kafka/server/request_context.h"
 #include "kafka/server/response.h"
 #include "net/connection.h"
+#include "security/errc.h"
+#include "security/exceptions.h"
 #include "security/mtls.h"
 #include "security/scram_algorithm.h"
 #include "utils/utf8.h"
@@ -83,7 +85,9 @@ protocol::protocol(
   , _controller_api(controller_api)
   , _tx_gateway_frontend(tx_gateway_frontend)
   , _coproc_partition_manager(coproc_partition_manager)
-  , _data_policy_table(data_policy_table) {
+  , _data_policy_table(data_policy_table)
+  , _mtls_principal_mapper(
+      config::shard_local_cfg().kafka_mtls_principal_mapping_rules.bind()) {
     if (qdc_config) {
         _qdc_mon.emplace(*qdc_config);
     }
@@ -125,13 +129,38 @@ config::broker_authn_method get_authn_method(const net::connection& conn) {
       && config.enable_sasl()) {
         return config::broker_authn_method::sasl;
     }
-    // mtls_identity is currently predicated on having mapping rules
-    if (conn.get_principal_mapping().has_value()) {
-        return config::broker_authn_method::mtls_identity;
-    }
     return config::broker_authn_method::none;
 }
 
+ss::future<security::tls::mtls_state> get_mtls_principal_state(
+  const security::tls::principal_mapper& pm, net::connection& conn) {
+    using namespace std::chrono_literals;
+    return ss::with_timeout(
+             model::timeout_clock::now() + 5s, conn.get_distinguished_name())
+      .then([&pm](std::optional<ss::session_dn> dn) {
+          ss::sstring anonymous_principal;
+          if (!dn.has_value()) {
+              vlog(klog.info, "failed to fetch distinguished name");
+              return security::tls::mtls_state{anonymous_principal};
+          }
+          auto principal = pm.apply(dn->subject);
+          if (!principal) {
+              vlog(
+                klog.info,
+                "failed to extract principal from distinguished name: {}",
+                dn->subject);
+              return security::tls::mtls_state{anonymous_principal};
+          }
+
+          vlog(
+            klog.debug,
+            "got principal: {}, from distinguished name: {}",
+            *principal,
+            dn->subject);
+          return security::tls::mtls_state{*principal};
+      });
+}
+
 ss::future<> protocol::apply(net::server::resources rs) {
     const bool authz_enabled
       = config::shard_local_cfg().kafka_enable_authorization().value_or(
@@ -147,16 +176,18 @@ ss::future<> protocol::apply(net::server::resources rs) {
         ? security::sasl_server::sasl_state::initial
         : security::sasl_server::sasl_state::complete);
 
+    std::optional<security::tls::mtls_state> mtls_state;
+    if (authn_method == config::broker_authn_method::mtls_identity) {
+        mtls_state = co_await get_mtls_principal_state(
+          _mtls_principal_mapper, *rs.conn);
+    }
+
     auto ctx = ss::make_lw_shared<connection_context>(
-      *this,
-      std::move(rs),
-      std::move(sasl),
-      authz_enabled,
-      authn_method == config::broker_authn_method::mtls_identity);
-
-    return ss::do_until(
-             [ctx] { return ctx->is_finished_parsing(); },
-             [ctx] { return ctx->process_one_request(); })
+      *this, std::move(rs), std::move(sasl), authz_enabled, mtls_state);
+
+    co_return co_await ss::do_until(
+      [ctx] { return ctx->is_finished_parsing(); },
+      [ctx] { return ctx->process_one_request(); })
       .handle_exception([ctx](std::exception_ptr eptr) {
           auto disconnected = net::is_disconnect_exception(eptr);
           if (config::shard_local_cfg().enable_sasl()) {
diff --git a/src/v/kafka/server/protocol.h b/src/v/kafka/server/protocol.h
index 84ba9aced12df..b65b2f3011782 100644
--- a/src/v/kafka/server/protocol.h
+++ b/src/v/kafka/server/protocol.h
@@ -21,6 +21,7 @@
 #include "net/server.h"
 #include "security/authorizer.h"
 #include "security/credential_store.h"
+#include "security/mtls.h"
 #include "utils/ema.h"
 #include "v8_engine/data_policy_table.h"
 
@@ -159,6 +160,7 @@ class protocol final : public net::server::protocol {
     ss::sharded<v8_engine::data_policy_table>& _data_policy_table;
     std::optional<qdc_monitor> _qdc_mon;
     kafka::fetch_metadata_cache _fetch_metadata_cache;
+    security::tls::principal_mapper _mtls_principal_mapper;
 
     latency_probe _probe;
 };
diff --git a/src/v/kafka/server/tests/request_parser_test.cc b/src/v/kafka/server/tests/request_parser_test.cc
index 27f5b90a32924..afefbcfef7e6a 100644
--- a/src/v/kafka/server/tests/request_parser_test.cc
+++ b/src/v/kafka/server/tests/request_parser_test.cc
@@ -83,7 +83,7 @@ get_request_context(kafka::protocol& proto, ss::input_stream<char>&& input) {
                               net::server::resources(nullptr, nullptr),
                               std::move(sasl),
                               false,
-                              false);
+                              std::nullopt);
 
                           return kafka::request_context(
                             conn,
diff --git a/src/v/redpanda/application.cc b/src/v/redpanda/application.cc
index 9f4ed50d1d193..bc5f02d47261f 100644
--- a/src/v/redpanda/application.cc
+++ b/src/v/redpanda/application.cc
@@ -1086,7 +1086,6 @@ void application::wire_up_redpanda_services() {
               auto& tls_config = config::node().kafka_api_tls.value();
               for (const auto& ep : config::node().kafka_api()) {
                   ss::shared_ptr<ss::tls::server_credentials> credentails;
-                  std::optional<security::tls::principal_mapper> tls_pm;
                   // find credentials for this endpoint
                   auto it = find_if(
                     tls_config.begin(),
@@ -1115,20 +1114,10 @@ void application::wire_up_redpanda_services() {
                                   })
                                 .get0()
                             : nullptr;
-
-                      auto tls_pm_rules
-                        = it->config.get_principal_mapping_rules();
-                      if (tls_pm_rules) {
-                          tls_pm = security::tls::principal_mapper(
-                            tls_pm_rules);
-                      }
                   }
 
                   c.addrs.emplace_back(
-                    ep.name,
-                    net::resolve_dns(ep.address).get0(),
-                    credentails,
-                    std::move(tls_pm));
+                    ep.name, net::resolve_dns(ep.address).get0(), credentails);
               }
 
               c.disable_metrics = net::metrics_disabled(
diff --git a/src/v/redpanda/tests/fixture.h b/src/v/redpanda/tests/fixture.h
index c90622550b264..99be33f5c414a 100644
--- a/src/v/redpanda/tests/fixture.h
+++ b/src/v/redpanda/tests/fixture.h
@@ -414,7 +414,7 @@ class redpanda_thread_fixture {
           net::server::resources(nullptr, nullptr),
           std::move(sasl),
           false,
-          false);
+          std::nullopt);
 
         kafka::request_header header;
         auto encoder_context = kafka::request_context(
diff --git a/src/v/rpc/test/rpc_gen_cycling_test.cc b/src/v/rpc/test/rpc_gen_cycling_test.cc
index 63f50e9542b2b..0b46c3391a83b 100644
--- a/src/v/rpc/test/rpc_gen_cycling_test.cc
+++ b/src/v/rpc/test/rpc_gen_cycling_test.cc
@@ -133,8 +133,7 @@ FIXTURE_TEST(echo_round_trip_tls, rpc_integration_fixture) {
                            true,
                            config::key_cert{"redpanda.key", "redpanda.crt"},
                            "root_certificate_authority.chain_cert",
-                           false,
-                           std::nullopt)
+                           false)
                            .get_credentials_builder()
                            .get0();
 
@@ -210,8 +209,7 @@ FIXTURE_TEST(rpcgen_reload_credentials_integration, rpc_integration_fixture) {
                                   config::key_cert{
                                     client_key.native(), client_crt.native()},
                                   client_ca.native(),
-                                  true,
-                                  std::nullopt)
+                                  true)
                                   .get_credentials_builder()
                                   .get0();
     // server credentials
@@ -224,8 +222,7 @@ FIXTURE_TEST(rpcgen_reload_credentials_integration, rpc_integration_fixture) {
                                   config::key_cert{
                                     server_key.native(), server_crt.native()},
                                   server_ca.native(),
-                                  true,
-                                  std::nullopt)
+                                  true)
                                   .get_credentials_builder()
                                   .get0();
 
diff --git a/src/v/security/mtls.cc b/src/v/security/mtls.cc
index 99f679a8ff7f9..4849d45f913ac 100644
--- a/src/v/security/mtls.cc
+++ b/src/v/security/mtls.cc
@@ -167,6 +167,22 @@ std::ostream& operator<<(std::ostream& os, const principal_mapper& p) {
     return os;
 }
 
+principal_mapper::principal_mapper(
+  config::binding<std::optional<ss::sstring>> cb)
+  : _binding(std::move(cb))
+  , _rules{detail::parse_rules(_binding())} {
+    _binding.watch([this]() { _rules = detail::parse_rules(_binding()); });
+}
+
+std::optional<ss::sstring> principal_mapper::apply(std::string_view sv) const {
+    for (const auto& r : _rules) {
+        if (auto p = r.apply(sv); p.has_value()) {
+            return {std::move(p).value()};
+        }
+    }
+    return std::nullopt;
+}
+
 } // namespace security::tls
 
 // explicit instantiations so as to avoid bringing in <fmt/ranges.h> in the
diff --git a/src/v/security/mtls.h b/src/v/security/mtls.h
index 1c8d51a8c52ab..1ef4d66d01343 100644
--- a/src/v/security/mtls.h
+++ b/src/v/security/mtls.h
@@ -11,6 +11,7 @@
 
 #pragma once
 
+#include "config/property.h"
 #include "seastarx.h"
 
 #include <seastar/core/sstring.hh>
@@ -18,7 +19,6 @@
 
 #include <fmt/core.h>
 
-#include <iosfwd>
 #include <optional>
 #include <regex>
 #include <string_view>
@@ -53,25 +53,10 @@ class rule {
     make_upper _to_upper{false};
 };
 
-namespace detail {
-
-std::vector<rule> parse_rules(std::optional<std::string_view> unparsed_rules);
-
-} // namespace detail
-
 class principal_mapper {
 public:
-    explicit principal_mapper(std::optional<std::string_view> sv)
-      : _rules{detail::parse_rules(sv)} {}
-
-    std::optional<ss::sstring> apply(std::string_view sv) const {
-        for (const auto& r : _rules) {
-            if (auto p = r.apply(sv); p.has_value()) {
-                return {std::move(p).value()};
-            }
-        }
-        return std::nullopt;
-    }
+    explicit principal_mapper(config::binding<std::optional<ss::sstring>> cb);
+    std::optional<ss::sstring> apply(std::string_view sv) const;
 
 private:
     friend struct fmt::formatter<principal_mapper>;
@@ -79,9 +64,21 @@ class principal_mapper {
     friend std::ostream&
     operator<<(std::ostream& os, const principal_mapper& p);
 
+    config::binding<std::optional<ss::sstring>> _binding;
     std::vector<rule> _rules;
 };
 
+class mtls_state {
+public:
+    explicit mtls_state(ss::sstring principal)
+      : _principal{std::move(principal)} {}
+
+    const ss::sstring& principal() { return _principal; }
+
+private:
+    ss::sstring _principal;
+};
+
 std::optional<ss::sstring>
 validate_rules(const std::optional<ss::sstring>& r) noexcept;
 
diff --git a/src/v/security/tests/mtls_test.cc b/src/v/security/tests/mtls_test.cc
index c69c544edd6a7..773cfdfca72d7 100644
--- a/src/v/security/tests/mtls_test.cc
+++ b/src/v/security/tests/mtls_test.cc
@@ -6,6 +6,7 @@
 // As of the Change Date specified in that file, in accordance with
 // the Business Source License, use of this software will be governed
 // by the Apache License, Version 2.0
+#include "config/property.h"
 #include "random/generators.h"
 #include "security/mtls.h"
 #include "utils/base64.h"
@@ -44,7 +45,8 @@ std::array<std::string_view, 8> mtls_valid_rules{
   "RULE:^CN=([^,DEFAULT,]+)(,.*|$)/$1/"};
 
 BOOST_DATA_TEST_CASE(test_mtls_valid_rules, bdata::make(mtls_valid_rules), c) {
-    BOOST_REQUIRE_NO_THROW(principal_mapper{c});
+    BOOST_REQUIRE_NO_THROW(
+      principal_mapper{config::mock_binding(std::optional<ss::sstring>{c})});
 }
 
 std::array<std::string_view, 10> mtls_invalid_rules{
@@ -61,7 +63,9 @@ std::array<std::string_view, 10> mtls_invalid_rules{
 
 BOOST_DATA_TEST_CASE(
   test_mtls_invalid_rules, bdata::make(mtls_invalid_rules), c) {
-    BOOST_REQUIRE_THROW(principal_mapper{c}, std::runtime_error);
+    BOOST_REQUIRE_THROW(
+      principal_mapper{config::mock_binding(std::optional<ss::sstring>{c})},
+      std::runtime_error);
 }
 
 struct record {
@@ -87,11 +91,12 @@ static std::array<record, 5> mtls_principal_mapper_data{
 BOOST_DATA_TEST_CASE(
   test_mtls_principal_mapper, bdata::make(mtls_principal_mapper_data), c) {
     security::tls::principal_mapper mapper{
-      "RULE:^CN=(.*?),OU=ServiceUsers.*$/$1/L, "
-      "RULE:^CN=(.*?),OU=(.*?),O=(.*?),L=(.*?),ST=(.*?),C=(.*?)$/$1@$2/L, "
-      "RULE:^cn=(.*?),ou=(.*?),dc=(.*?),dc=(.*?)$/$1@$2/U, "
-      "RULE:^.*[Cc][Nn]=([a-zA-Z0-9.]*).*$/$1/U, "
-      "DEFAULT"};
+      config::mock_binding(std::optional<ss::sstring>{
+        "RULE:^CN=(.*?),OU=ServiceUsers.*$/$1/L, "
+        "RULE:^CN=(.*?),OU=(.*?),O=(.*?),L=(.*?),ST=(.*?),C=(.*?)$/$1@$2/L, "
+        "RULE:^cn=(.*?),ou=(.*?),dc=(.*?),dc=(.*?)$/$1@$2/U, "
+        "RULE:^.*[Cc][Nn]=([a-zA-Z0-9.]*).*$/$1/U, "
+        "DEFAULT"})};
     BOOST_REQUIRE_EQUAL(c.expected, *mapper.apply(c.input));
 }
 
@@ -121,13 +126,19 @@ static std::array<record, 17> mtls_rule_splitting_data{
 };
 BOOST_DATA_TEST_CASE(
   test_mtls_rule_splitting, bdata::make(mtls_rule_splitting_data), c) {
-    BOOST_CHECK_EQUAL(c.expected, fmt::format("{}", principal_mapper(c.input)));
+    BOOST_CHECK_EQUAL(
+      c.expected,
+      fmt::format(
+        "{}",
+        principal_mapper(
+          config::mock_binding(std::optional<ss::sstring>{c.input}))));
 }
 
 BOOST_AUTO_TEST_CASE(test_mtls_comma_with_whitespace) {
     BOOST_CHECK_EQUAL(
       "Tkac\\, Adam",
-      principal_mapper("RULE:^CN=((\\\\, *|\\w)+)(,.*|$)/$1/,DEFAULT")
+      principal_mapper(config::mock_binding(std::optional<ss::sstring>{
+                         "RULE:^CN=((\\\\, *|\\w)+)(,.*|$)/$1/,DEFAULT"}))
         .apply("CN=Tkac\\, Adam,OU=ITZ,DC=geodis,DC=cz")
         .value_or(""));
 }
diff --git a/tests/rptest/services/redpanda.py b/tests/rptest/services/redpanda.py
index e22875b80a9c0..d28c9df94bec0 100644
--- a/tests/rptest/services/redpanda.py
+++ b/tests/rptest/services/redpanda.py
@@ -539,6 +539,9 @@ def set_resource_settings(self, rs):
     def set_extra_rp_conf(self, conf):
         self._extra_rp_conf = conf
 
+    def add_extra_rp_conf(self, conf):
+        self._extra_rp_conf = {**self._extra_rp_conf, **conf}
+
     def set_extra_node_conf(self, node, conf):
         assert node in self.nodes
         self._extra_node_conf[node] = conf
@@ -1335,10 +1338,6 @@ def write_node_conf_file(self, node, override_cfg_params=None):
                 cert_file=RedpandaService.TLS_SERVER_CRT_FILE,
                 truststore_file=RedpandaService.TLS_CA_CRT_FILE,
             )
-            if self.mtls_identity_enabled():
-                tls_config.update(
-                    dict(principal_mapping_rules=self._security.
-                         principal_mapping_rules))
             doc = yaml.full_load(conf)
             doc["redpanda"].update(dict(kafka_api_tls=tls_config))
             conf = yaml.dump(doc)
diff --git a/tests/rptest/tests/acls_test.py b/tests/rptest/tests/acls_test.py
index 86eefb0cf9894..23b9494fd63f7 100644
--- a/tests/rptest/tests/acls_test.py
+++ b/tests/rptest/tests/acls_test.py
@@ -81,9 +81,13 @@ def prepare_cluster(self,
 
             self.security.tls_provider = MTLSProvider(self.tls)
 
-        if self.security.mtls_identity_enabled(
-        ) and principal_mapping_rules is not None:
-            self.security.principal_mapping_rules = principal_mapping_rules
+        if self.security.mtls_identity_enabled():
+            if principal_mapping_rules is not None:
+                self.security.principal_mapping_rules = principal_mapping_rules
+            self.redpanda.add_extra_rp_conf({
+                'kafka_mtls_principal_mapping_rules':
+                self.security.principal_mapping_rules
+            })
 
         self.redpanda.set_security_settings(self.security)
         self.redpanda.start()

From eb977812064db4eb1138b2899b2f5bb0a9e3975a Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Wed, 6 Jul 2022 13:48:13 +0100
Subject: [PATCH 170/201] auth: Exclude auth settings from config fuzzing

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 tests/rptest/tests/cluster_config_test.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/rptest/tests/cluster_config_test.py b/tests/rptest/tests/cluster_config_test.py
index feef159957568..56bd89c0331b0 100644
--- a/tests/rptest/tests/cluster_config_test.py
+++ b/tests/rptest/tests/cluster_config_test.py
@@ -484,7 +484,10 @@ def test_valid_settings(self):
 
         # Don't change these settings, they prevent the test from subsequently
         # using the cluster
-        exclude_settings = {'enable_sasl'}
+        exclude_settings = {
+            'enable_sasl', 'kafka_enable_authorization',
+            'kafka_mtls_principal_mapping_rules'
+        }
 
         # Don't enable coproc: it generates log errors if its companion service isn't running
         exclude_settings.add('enable_coproc')

From 38650a7b96707ce4a13d9879d459b08bb0a59bba Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Wed, 6 Jul 2022 12:11:32 +0100
Subject: [PATCH 171/201] auth: Convert principal_mapping to a vector

`kafka_mtls_principal_mapping_rules` now requires a list:

```yaml
redpanda:
  kafka_mtls_principal_mapping_rules:
  - RULE:^CN=(.*)/$1/,DEFAULT
```

Or, instead of a comma separator:

```yaml
redpanda:
  kafka_mtls_principal_mapping_rules:
  - RULE:^CN=(.*)/$1/
  - DEFAULT
```

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/config/configuration.h      |  3 ++-
 src/v/security/mtls.cc            | 12 ++++++---
 src/v/security/mtls.h             |  7 +++---
 src/v/security/tests/mtls_test.cc | 42 ++++++++++++++++++++-----------
 tests/rptest/tests/acls_test.py   |  7 +++++-
 5 files changed, 47 insertions(+), 24 deletions(-)

diff --git a/src/v/config/configuration.h b/src/v/config/configuration.h
index b9f6f4c046a1e..c5bc0548aa001 100644
--- a/src/v/config/configuration.h
+++ b/src/v/config/configuration.h
@@ -172,7 +172,8 @@ struct configuration final : public config_store {
     property<int16_t> id_allocator_batch_size;
     property<bool> enable_sasl;
     property<std::optional<bool>> kafka_enable_authorization;
-    property<std::optional<ss::sstring>> kafka_mtls_principal_mapping_rules;
+    property<std::optional<std::vector<ss::sstring>>>
+      kafka_mtls_principal_mapping_rules;
     property<std::chrono::milliseconds>
       controller_backend_housekeeping_interval_ms;
     property<std::chrono::milliseconds> node_management_operation_timeout_ms;
diff --git a/src/v/security/mtls.cc b/src/v/security/mtls.cc
index 4849d45f913ac..5ecc7191150a5 100644
--- a/src/v/security/mtls.cc
+++ b/src/v/security/mtls.cc
@@ -68,11 +68,15 @@ constexpr std::optional<std::string_view> make_sv(const std::csub_match& sm) {
              : std::optional<std::string_view>{std::nullopt};
 }
 
-std::vector<rule> parse_rules(std::optional<std::string_view> unparsed_rules) {
+std::vector<rule>
+parse_rules(std::optional<std::vector<ss::sstring>> unparsed_rules) {
     static const std::regex rule_splitter = make_regex(rule_pattern_splitter);
     static const std::regex rule_parser = make_regex(rule_pattern);
 
-    std::string_view rules{trim(unparsed_rules.value_or("DEFAULT"))};
+    std::string rules
+      = unparsed_rules.has_value() ? fmt::format(
+          "{}", fmt::join(unparsed_rules->begin(), unparsed_rules->end(), ","))
+                                   : "DEFAULT";
 
     std::vector<rule> result;
     std::cmatch rules_match;
@@ -148,7 +152,7 @@ std::optional<ss::sstring> rule::apply(std::string_view dn) const {
 }
 
 std::optional<ss::sstring>
-validate_rules(const std::optional<ss::sstring>& r) noexcept {
+validate_rules(const std::optional<std::vector<ss::sstring>>& r) noexcept {
     try {
         security::tls::detail::parse_rules(r);
     } catch (const std::exception& e) {
@@ -168,7 +172,7 @@ std::ostream& operator<<(std::ostream& os, const principal_mapper& p) {
 }
 
 principal_mapper::principal_mapper(
-  config::binding<std::optional<ss::sstring>> cb)
+  config::binding<std::optional<std::vector<ss::sstring>>> cb)
   : _binding(std::move(cb))
   , _rules{detail::parse_rules(_binding())} {
     _binding.watch([this]() { _rules = detail::parse_rules(_binding()); });
diff --git a/src/v/security/mtls.h b/src/v/security/mtls.h
index 1ef4d66d01343..56be6921aea98 100644
--- a/src/v/security/mtls.h
+++ b/src/v/security/mtls.h
@@ -55,7 +55,8 @@ class rule {
 
 class principal_mapper {
 public:
-    explicit principal_mapper(config::binding<std::optional<ss::sstring>> cb);
+    explicit principal_mapper(
+      config::binding<std::optional<std::vector<ss::sstring>>> cb);
     std::optional<ss::sstring> apply(std::string_view sv) const;
 
 private:
@@ -64,7 +65,7 @@ class principal_mapper {
     friend std::ostream&
     operator<<(std::ostream& os, const principal_mapper& p);
 
-    config::binding<std::optional<ss::sstring>> _binding;
+    config::binding<std::optional<std::vector<ss::sstring>>> _binding;
     std::vector<rule> _rules;
 };
 
@@ -80,7 +81,7 @@ class mtls_state {
 };
 
 std::optional<ss::sstring>
-validate_rules(const std::optional<ss::sstring>& r) noexcept;
+validate_rules(const std::optional<std::vector<ss::sstring>>& r) noexcept;
 
 } // namespace security::tls
 
diff --git a/src/v/security/tests/mtls_test.cc b/src/v/security/tests/mtls_test.cc
index 773cfdfca72d7..36c646487cce5 100644
--- a/src/v/security/tests/mtls_test.cc
+++ b/src/v/security/tests/mtls_test.cc
@@ -34,7 +34,7 @@ namespace security::tls {
 
 namespace bdata = boost::unit_test::data;
 
-std::array<std::string_view, 8> mtls_valid_rules{
+std::array<ss::sstring, 8> mtls_valid_rules{
   "DEFAULT",
   "RULE:^CN=(.*?),OU=ServiceUsers.*$/$1/",
   "RULE:^CN=(.*?),OU=ServiceUsers.*$/$1/L, DEFAULT",
@@ -45,11 +45,11 @@ std::array<std::string_view, 8> mtls_valid_rules{
   "RULE:^CN=([^,DEFAULT,]+)(,.*|$)/$1/"};
 
 BOOST_DATA_TEST_CASE(test_mtls_valid_rules, bdata::make(mtls_valid_rules), c) {
-    BOOST_REQUIRE_NO_THROW(
-      principal_mapper{config::mock_binding(std::optional<ss::sstring>{c})});
+    BOOST_REQUIRE_NO_THROW(principal_mapper{
+      config::mock_binding(std::optional<std::vector<ss::sstring>>{{c}})});
 }
 
-std::array<std::string_view, 10> mtls_invalid_rules{
+std::array<ss::sstring, 10> mtls_invalid_rules{
   "default",
   "DEFAUL",
   "DEFAULT/L",
@@ -64,7 +64,8 @@ std::array<std::string_view, 10> mtls_invalid_rules{
 BOOST_DATA_TEST_CASE(
   test_mtls_invalid_rules, bdata::make(mtls_invalid_rules), c) {
     BOOST_REQUIRE_THROW(
-      principal_mapper{config::mock_binding(std::optional<ss::sstring>{c})},
+      principal_mapper{
+        config::mock_binding(std::optional<std::vector<ss::sstring>>{{c}})},
       std::runtime_error);
 }
 
@@ -91,12 +92,12 @@ static std::array<record, 5> mtls_principal_mapper_data{
 BOOST_DATA_TEST_CASE(
   test_mtls_principal_mapper, bdata::make(mtls_principal_mapper_data), c) {
     security::tls::principal_mapper mapper{
-      config::mock_binding(std::optional<ss::sstring>{
-        "RULE:^CN=(.*?),OU=ServiceUsers.*$/$1/L, "
-        "RULE:^CN=(.*?),OU=(.*?),O=(.*?),L=(.*?),ST=(.*?),C=(.*?)$/$1@$2/L, "
-        "RULE:^cn=(.*?),ou=(.*?),dc=(.*?),dc=(.*?)$/$1@$2/U, "
-        "RULE:^.*[Cc][Nn]=([a-zA-Z0-9.]*).*$/$1/U, "
-        "DEFAULT"})};
+      config::mock_binding(std::optional<std::vector<ss::sstring>>{
+        {"RULE:^CN=(.*?),OU=ServiceUsers.*$/$1/L, "
+         "RULE:^CN=(.*?),OU=(.*?),O=(.*?),L=(.*?),ST=(.*?),C=(.*?)$/$1@$2/L, "
+         "RULE:^cn=(.*?),ou=(.*?),dc=(.*?),dc=(.*?)$/$1@$2/U, "
+         "RULE:^.*[Cc][Nn]=([a-zA-Z0-9.]*).*$/$1/U, "
+         "DEFAULT"}})};
     BOOST_REQUIRE_EQUAL(c.expected, *mapper.apply(c.input));
 }
 
@@ -130,17 +131,28 @@ BOOST_DATA_TEST_CASE(
       c.expected,
       fmt::format(
         "{}",
-        principal_mapper(
-          config::mock_binding(std::optional<ss::sstring>{c.input}))));
+        principal_mapper(config::mock_binding(
+          std::optional<std::vector<ss::sstring>>{{ss::sstring{c.input}}}))));
 }
 
 BOOST_AUTO_TEST_CASE(test_mtls_comma_with_whitespace) {
     BOOST_CHECK_EQUAL(
       "Tkac\\, Adam",
-      principal_mapper(config::mock_binding(std::optional<ss::sstring>{
-                         "RULE:^CN=((\\\\, *|\\w)+)(,.*|$)/$1/,DEFAULT"}))
+      principal_mapper(
+        config::mock_binding(std::optional<std::vector<ss::sstring>>{
+          {"RULE:^CN=((\\\\, *|\\w)+)(,.*|$)/$1/,DEFAULT"}}))
         .apply("CN=Tkac\\, Adam,OU=ITZ,DC=geodis,DC=cz")
         .value_or(""));
 }
 
+BOOST_AUTO_TEST_CASE(test_mtls_parsing_with_multiline) {
+    BOOST_CHECK_EQUAL(
+      "test_cn",
+      principal_mapper(
+        config::mock_binding(std::optional<std::vector<ss::sstring>>{
+          {{"RULE:^OU=(.*)/$1/"}, {"RULE:^CN=(.*)/$1/"}}}))
+        .apply("CN=test_cn")
+        .value_or(""));
+}
+
 } // namespace security::tls
diff --git a/tests/rptest/tests/acls_test.py b/tests/rptest/tests/acls_test.py
index 23b9494fd63f7..52fe67170c7e6 100644
--- a/tests/rptest/tests/acls_test.py
+++ b/tests/rptest/tests/acls_test.py
@@ -86,7 +86,7 @@ def prepare_cluster(self,
                 self.security.principal_mapping_rules = principal_mapping_rules
             self.redpanda.add_extra_rp_conf({
                 'kafka_mtls_principal_mapping_rules':
-                self.security.principal_mapping_rules
+                [self.security.principal_mapping_rules]
             })
 
         self.redpanda.set_security_settings(self.security)
@@ -258,6 +258,11 @@ def test_describe_acls(self,
         rules=
         "RULE:^O=Redpanda,CN=(cluster_describe|redpanda.service.admin|admin)$/$1/",
         fail=False)
+    # Match admin or empty
+    @parametrize(
+        rules=
+        "RULE:^O=Redpanda,CN=(admin|redpanda.service.admin)$/$1/, RULE:^O=Redpanda,CN=()$/$1/L",
+        fail=True)
     def test_mtls_principal(self, rules=None, fail=False):
         """
         security::acl_operation::describe, security::default_cluster_name

From 92385511fe4094780ad90034cd206e2e3105066a Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Wed, 6 Jul 2022 16:07:59 +0100
Subject: [PATCH 172/201] auth: Allow newline as a rule separator

`kafka_mtls_principal_mapping_rules` now accepts a newline:

```yaml
redpanda:
  kafka_mtls_principal_mapping_rules:
  - |
    RULE:^CN=(.*)/$1/
    DEFAULT
```

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/security/mtls.cc            |  2 +-
 src/v/security/tests/mtls_test.cc | 13 ++++++++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/v/security/mtls.cc b/src/v/security/mtls.cc
index 5ecc7191150a5..708fe612d4741 100644
--- a/src/v/security/mtls.cc
+++ b/src/v/security/mtls.cc
@@ -26,7 +26,7 @@ namespace detail {
 static constexpr const char* const rule_pattern{
   R"((DEFAULT)|RULE:((\\.|[^\\/])*)\/((\\.|[^\\/])*)\/([LU]?).*?|(.*?))"};
 static constexpr const char* const rule_pattern_splitter{
-  R"(\s*((DEFAULT)|RULE:((\\.|[^\\/])*)\/((\\.|[^\\/])*)\/([LU]?).*?|(.*?))\s*(,\s*|$))"};
+  R"(\s*((DEFAULT)|RULE:((\\.|[^\\/])*)\/((\\.|[^\\/])*)\/([LU]?).*?|(.*?))\s*([,\n]\s*|$))"};
 
 std::regex make_regex(std::string_view sv) {
     return std::regex{
diff --git a/src/v/security/tests/mtls_test.cc b/src/v/security/tests/mtls_test.cc
index 36c646487cce5..da3d2214fd90a 100644
--- a/src/v/security/tests/mtls_test.cc
+++ b/src/v/security/tests/mtls_test.cc
@@ -101,7 +101,7 @@ BOOST_DATA_TEST_CASE(
     BOOST_REQUIRE_EQUAL(c.expected, *mapper.apply(c.input));
 }
 
-static std::array<record, 17> mtls_rule_splitting_data{
+static std::array<record, 18> mtls_rule_splitting_data{
   record{"[]", ""},
   {"[DEFAULT]", "DEFAULT"},
   {"[RULE:/]", "RULE://"},
@@ -124,6 +124,7 @@ static std::array<record, 17> mtls_rule_splitting_data{
    "DEFAULT, /DEFAULT, DEFAULT]",
    "RULE:,RULE:,/,RULE:,\\//U,RULE:,/RULE:,/,RULE:,RULE:,/L,RULE:,/L,RULE:, "
    "DEFAULT, /DEFAULT/,DEFAULT"},
+  {"[RULE:/, DEFAULT]", "RULE://\nDEFAULT"},
 };
 BOOST_DATA_TEST_CASE(
   test_mtls_rule_splitting, bdata::make(mtls_rule_splitting_data), c) {
@@ -155,4 +156,14 @@ BOOST_AUTO_TEST_CASE(test_mtls_parsing_with_multiline) {
         .value_or(""));
 }
 
+BOOST_AUTO_TEST_CASE(test_mtls_parsing_with_newline) {
+    BOOST_CHECK_EQUAL(
+      "test_cn",
+      principal_mapper(
+        config::mock_binding(std::optional<std::vector<ss::sstring>>{
+          {"RULE:^OU=(.*)/$1/\nRULE:^CN=(.*)/$1/"}}))
+        .apply("CN=test_cn")
+        .value_or(""));
+}
+
 } // namespace security::tls

From e1f129c6bd4073aa8be767a6c89711334b99b8dd Mon Sep 17 00:00:00 2001
From: Evgeny Lazin <evgeny@vectorized.io>
Date: Fri, 1 Jul 2022 11:05:49 -0400
Subject: [PATCH 173/201] cloud_storage: Add tx_range_manifest

The manifest contains a list of aborted transaction ranges. It's used
by the SI when there is no local data available (no rm_stm
snapshot).
---
 src/v/cloud_storage/CMakeLists.txt       |   1 +
 src/v/cloud_storage/base_manifest.h      |   1 +
 src/v/cloud_storage/probe.h              |  20 ++++
 src/v/cloud_storage/remote.cc            |   6 ++
 src/v/cloud_storage/tx_range_manifest.cc | 121 +++++++++++++++++++++++
 src/v/cloud_storage/tx_range_manifest.h  |  73 ++++++++++++++
 src/v/cloud_storage/types.h              |   6 ++
 src/v/cluster/rm_stm.h                   |   2 +
 8 files changed, 230 insertions(+)
 create mode 100644 src/v/cloud_storage/tx_range_manifest.cc
 create mode 100644 src/v/cloud_storage/tx_range_manifest.h

diff --git a/src/v/cloud_storage/CMakeLists.txt b/src/v/cloud_storage/CMakeLists.txt
index 205c3f7e67383..d3dbfa1325571 100644
--- a/src/v/cloud_storage/CMakeLists.txt
+++ b/src/v/cloud_storage/CMakeLists.txt
@@ -17,6 +17,7 @@ v_cc_library(
     remote_segment.cc
     remote_partition.cc
     remote_segment_index.cc
+    tx_range_manifest.cc
   DEPS
     Seastar::seastar
     v::bytes
diff --git a/src/v/cloud_storage/base_manifest.h b/src/v/cloud_storage/base_manifest.h
index e3169cb6c8973..96d37fec16cb4 100644
--- a/src/v/cloud_storage/base_manifest.h
+++ b/src/v/cloud_storage/base_manifest.h
@@ -27,6 +27,7 @@ struct serialized_json_stream {
 enum class manifest_type {
     topic,
     partition,
+    tx_range,
 };
 
 class base_manifest {
diff --git a/src/v/cloud_storage/probe.h b/src/v/cloud_storage/probe.h
index cc59faf668fd8..7acf0e5dafb2f 100644
--- a/src/v/cloud_storage/probe.h
+++ b/src/v/cloud_storage/probe.h
@@ -59,6 +59,22 @@ class remote_probe {
         return _cnt_partition_manifest_downloads;
     }
 
+    /// Register manifest (re)upload
+    void txrange_manifest_upload() { _cnt_tx_manifest_uploads++; }
+
+    /// Get manifest (re)upload
+    uint64_t get_txrange_manifest_uploads() const {
+        return _cnt_tx_manifest_uploads;
+    }
+
+    /// Register manifest download
+    void txrange_manifest_download() { _cnt_tx_manifest_downloads++; }
+
+    /// Get manifest download
+    uint64_t get_txrange_manifest_downloads() const {
+        return _cnt_tx_manifest_downloads;
+    }
+
     /// Register backof invocation during manifest upload
     void manifest_upload_backoff() { _cnt_manifest_upload_backoff++; }
 
@@ -166,6 +182,10 @@ class remote_probe {
     uint64_t _cnt_bytes_sent{0};
     /// Number of bytes being successfully received from S3
     uint64_t _cnt_bytes_received{0};
+    /// Number of tx-range manifest uploads
+    uint64_t _cnt_tx_manifest_uploads{0};
+    /// Number of tx-range manifest downloads
+    uint64_t _cnt_tx_manifest_downloads{0};
 
     ss::metrics::metric_groups _metrics;
     ss::metrics::metric_groups _public_metrics;
diff --git a/src/v/cloud_storage/remote.cc b/src/v/cloud_storage/remote.cc
index 29af6e1790cee..3f3c27c4f37c3 100644
--- a/src/v/cloud_storage/remote.cc
+++ b/src/v/cloud_storage/remote.cc
@@ -216,6 +216,9 @@ ss::future<download_result> remote::download_manifest(
             case manifest_type::topic:
                 _probe.topic_manifest_download();
                 break;
+            case manifest_type::tx_range:
+                _probe.txrange_manifest_download();
+                break;
             }
             co_return download_result::success;
         } catch (...) {
@@ -298,6 +301,9 @@ ss::future<upload_result> remote::upload_manifest(
             case manifest_type::topic:
                 _probe.topic_manifest_upload();
                 break;
+            case manifest_type::tx_range:
+                _probe.txrange_manifest_upload();
+                break;
             }
             _probe.register_upload_size(size);
             co_return upload_result::success;
diff --git a/src/v/cloud_storage/tx_range_manifest.cc b/src/v/cloud_storage/tx_range_manifest.cc
new file mode 100644
index 0000000000000..89eb6e948db7d
--- /dev/null
+++ b/src/v/cloud_storage/tx_range_manifest.cc
@@ -0,0 +1,121 @@
+#include "cloud_storage/tx_range_manifest.h"
+
+#include "bytes/iobuf.h"
+#include "bytes/iobuf_istreambuf.h"
+#include "bytes/iobuf_ostreambuf.h"
+#include "cloud_storage/partition_manifest.h"
+#include "cloud_storage/types.h"
+#include "cluster/rm_stm.h"
+#include "json/istreamwrapper.h"
+#include "model/record.h"
+#include "utils/fragmented_vector.h"
+
+#include <rapidjson/document.h>
+#include <rapidjson/istreamwrapper.h>
+#include <rapidjson/ostreamwrapper.h>
+#include <rapidjson/rapidjson.h>
+#include <rapidjson/writer.h>
+
+namespace cloud_storage {
+
+remote_manifest_path generate_remote_tx_path(const remote_segment_path& path) {
+    return remote_manifest_path(path().native() + ".tx"); // name + ".tx"
+}
+
+tx_range_manifest::tx_range_manifest(
+  remote_segment_path spath,
+  const std::vector<cluster::rm_stm::tx_range>& range)
+  : _path(std::move(spath)) {
+    for (const auto& tx : range) { // copy every range into _ranges
+        _ranges.push_back(tx);
+    }
+    _ranges.shrink_to_fit(); // presumably trims unused capacity
+}
+
+tx_range_manifest::tx_range_manifest(remote_segment_path spath)
+  : _path(std::move(spath)) {}
+
+ss::future<> tx_range_manifest::update(ss::input_stream<char> is) {
+    // Buffer the whole stream in memory, then parse it as a json document.
+    iobuf buffered;
+    auto sink = make_iobuf_ref_output_stream(buffered);
+    co_await ss::copy(is, sink);
+    iobuf_istreambuf source_buf(buffered);
+    std::istream json_stream(&source_buf);
+    rapidjson::IStreamWrapper json_wrapper(json_stream);
+    rapidjson::Document parsed;
+    parsed.ParseStream(json_wrapper);
+    update(parsed);
+}
+
+void tx_range_manifest::update(const rapidjson::Document& doc) {
+    // A document that failed to parse has no usable members: fail early.
+    if (doc.HasParseError() || !doc.IsObject()) {
+        throw std::runtime_error("Can't parse tx manifest, malformed json");
+    }
+    _ranges = fragmented_vector<cluster::rm_stm::tx_range>();
+    auto version = doc["version"].GetInt();
+    auto compat_version = doc["compat_version"].GetInt();
+    if (compat_version
+        > static_cast<int>(tx_range_manifest_version::current_version)) {
+        throw std::runtime_error(fmt::format(
+          "Can't deserialize tx manifest, supported version {}, manifest "
+          "version {}, compatible version {}",
+          static_cast<int32_t>(tx_range_manifest_version::current_version),
+          version,
+          compat_version));
+    }
+    if (doc.HasMember("ranges")) {
+        for (const auto& tx : doc["ranges"].GetArray()) {
+            auto epoch = static_cast<int16_t>(tx["pid.epoch"].GetInt());
+            model::producer_identity pid(tx["pid.id"].GetInt64(), epoch);
+            model::offset first{tx["first"].GetInt64()};
+            model::offset last{tx["last"].GetInt64()};
+            _ranges.push_back(cluster::rm_stm::tx_range{pid, first, last});
+        }
+    }
+    _ranges.shrink_to_fit();
+}
+
+serialized_json_stream tx_range_manifest::serialize() const {
+    iobuf json_buf;
+    iobuf_ostreambuf sink(json_buf);
+    std::ostream out(&sink);
+    serialize(out);
+    const size_t total_bytes = json_buf.size_bytes(); // read before the move
+    return serialized_json_stream{
+      .stream = make_iobuf_input_stream(std::move(json_buf)),
+      .size_bytes = total_bytes};
+}
+
+remote_manifest_path tx_range_manifest::get_manifest_path() const {
+    return generate_remote_tx_path(_path);
+}
+
+void tx_range_manifest::serialize(std::ostream& out) const {
+    // Layout: {"version", "compat_version", "ranges": [{...}, ...]}
+    rapidjson::OStreamWrapper sink(out);
+    rapidjson::Writer<rapidjson::OStreamWrapper> json(sink);
+    json.StartObject();
+    json.Key("version");
+    json.Int(static_cast<int>(tx_range_manifest_version::current_version));
+    json.Key("compat_version");
+    json.Int(static_cast<int>(tx_range_manifest_version::compat_version));
+    json.Key("ranges");
+    json.StartArray();
+    for (const auto& range : _ranges) {
+        json.StartObject();
+        json.Key("pid.id");
+        json.Int64(range.pid.id);
+        json.Key("pid.epoch");
+        json.Int(range.pid.epoch);
+        json.Key("first");
+        json.Int64(range.first());
+        json.Key("last");
+        json.Int64(range.last());
+        json.EndObject();
+    }
+    json.EndArray();
+    json.EndObject();
+}
+} // namespace cloud_storage
diff --git a/src/v/cloud_storage/tx_range_manifest.h b/src/v/cloud_storage/tx_range_manifest.h
new file mode 100644
index 0000000000000..43e85393ad43c
--- /dev/null
+++ b/src/v/cloud_storage/tx_range_manifest.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2022 Redpanda Data, Inc.
+ *
+ * Licensed as a Redpanda Enterprise file under the Redpanda Community
+ * License (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://github.com/redpanda-data/redpanda/blob/master/licenses/rcl.md
+ */
+
+#pragma once
+
+#include "cloud_storage/base_manifest.h"
+#include "cluster/rm_stm.h"
+#include "cluster/types.h"
+#include "json/document.h"
+#include "model/fundamental.h"
+#include "model/metadata.h"
+
+#include <rapidjson/document.h>
+
+#include <optional>
+
+namespace cloud_storage {
+
+/// Transactional metadata path in S3
+remote_manifest_path generate_remote_tx_path(const remote_segment_path& path);
+
+class tx_range_manifest final : public base_manifest {
+public:
+    /// Create manifest for the segment path with the given tx ranges
+    explicit tx_range_manifest(
+      remote_segment_path spath,
+      const std::vector<cluster::rm_stm::tx_range>& range);
+
+    /// Create empty manifest that is supposed to be updated later
+    explicit tx_range_manifest(remote_segment_path spath);
+
+    friend bool
+    operator==(const tx_range_manifest& lhs, const tx_range_manifest& rhs) {
+        return lhs._path == rhs._path && lhs._ranges == rhs._ranges;
+    }
+
+    /// Update manifest file from input_stream (remote set)
+    ss::future<> update(ss::input_stream<char> is) override;
+    /// Update manifest from an already parsed json document
+    void update(const rapidjson::Document& is);
+
+    /// Serialize manifest object
+    /// \return asynchronous input_stream with the serialized json
+    serialized_json_stream serialize() const override;
+
+    /// Manifest object name in S3
+    remote_manifest_path get_manifest_path() const override;
+
+    /// Serialize manifest object
+    /// \param out output stream that should be used to output the json
+    void serialize(std::ostream& out) const;
+
+    manifest_type get_manifest_type() const override {
+        return manifest_type::tx_range;
+    }
+
+    /// Move the tx ranges out of the manifest (rvalue manifests only)
+    fragmented_vector<cluster::rm_stm::tx_range>&& get_tx_range() && {
+        return std::move(_ranges);
+    }
+
+private:
+    remote_segment_path _path;
+    fragmented_vector<cluster::rm_stm::tx_range> _ranges;
+};
+} // namespace cloud_storage
diff --git a/src/v/cloud_storage/types.h b/src/v/cloud_storage/types.h
index a2d5dd2cf316e..b86b32ccbe80e 100644
--- a/src/v/cloud_storage/types.h
+++ b/src/v/cloud_storage/types.h
@@ -60,6 +60,12 @@ enum class manifest_version : int32_t {
     v1 = 1,
 };
 
+enum class tx_range_manifest_version : int32_t {
+    v1 = 1,
+    current_version = v1,
+    compat_version = v1,
+};
+
 static constexpr int32_t topic_manifest_version = 1;
 
 std::ostream& operator<<(std::ostream& o, const download_result& r);
diff --git a/src/v/cluster/rm_stm.h b/src/v/cluster/rm_stm.h
index 6e6c09c542e25..5ced34663a920 100644
--- a/src/v/cluster/rm_stm.h
+++ b/src/v/cluster/rm_stm.h
@@ -58,6 +58,8 @@ class rm_stm final : public persisted_stm {
         model::producer_identity pid;
         model::offset first;
         model::offset last;
+
+        auto operator<=>(const tx_range&) const = default;
     };
 
     struct abort_index {

From 9f16731fa2658c2824c3129a0baa463f689d32d7 Mon Sep 17 00:00:00 2001
From: Evgeny Lazin <evgeny@vectorized.io>
Date: Thu, 7 Jul 2022 09:21:14 -0400
Subject: [PATCH 174/201] Add tx-range test

---
 src/v/cloud_storage/tests/CMakeLists.txt      |  1 +
 .../tests/tx_range_manifest_test.cc           | 90 +++++++++++++++++++
 2 files changed, 91 insertions(+)
 create mode 100644 src/v/cloud_storage/tests/tx_range_manifest_test.cc

diff --git a/src/v/cloud_storage/tests/CMakeLists.txt b/src/v/cloud_storage/tests/CMakeLists.txt
index 1396bb7ec58fd..018041ac6f9ec 100644
--- a/src/v/cloud_storage/tests/CMakeLists.txt
+++ b/src/v/cloud_storage/tests/CMakeLists.txt
@@ -5,6 +5,7 @@ rp_test(
     directory_walker_test.cc
     partition_manifest_test.cc
     topic_manifest_test.cc
+    tx_range_manifest_test.cc
     s3_imposter.cc
     remote_test.cc
     offset_translation_layer_test.cc
diff --git a/src/v/cloud_storage/tests/tx_range_manifest_test.cc b/src/v/cloud_storage/tests/tx_range_manifest_test.cc
new file mode 100644
index 0000000000000..85b7006c8d5da
--- /dev/null
+++ b/src/v/cloud_storage/tests/tx_range_manifest_test.cc
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2022 Redpanda Data, Inc.
+ *
+ * Licensed as a Redpanda Enterprise file under the Redpanda Community
+ * License (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://github.com/redpanda-data/redpanda/blob/master/licenses/rcl.md
+ */
+
+#include "bytes/iobuf.h"
+#include "bytes/iobuf_parser.h"
+#include "cloud_storage/partition_manifest.h"
+#include "cloud_storage/tx_range_manifest.h"
+#include "cloud_storage/types.h"
+#include "cluster/types.h"
+#include "model/compression.h"
+#include "model/fundamental.h"
+#include "model/metadata.h"
+#include "model/record.h"
+#include "seastarx.h"
+
+#include <seastar/testing/test_case.hh>
+#include <seastar/testing/thread_test_case.hh>
+
+#include <boost/test/tools/old/interface.hpp>
+#include <boost/test/unit_test.hpp>
+
+#include <chrono>
+#include <string>
+#include <string_view>
+#include <system_error>
+
+using namespace cloud_storage;
+
+static remote_segment_path
+  segment_path("abcdef01/kafka/topic/0_1/0-1-v1.log.1");
+static remote_manifest_path
+  manifest_path("abcdef01/kafka/topic/0_1/0-1-v1.log.1.tx");
+
+using tx_range_t = cluster::rm_stm::tx_range;
+
+static std::vector<tx_range_t> ranges = {
+  tx_range_t{
+    .pid = model::producer_identity(1, 2),
+    .first = model::offset(3),
+    .last = model::offset(5),
+  },
+  tx_range_t{
+    .pid = model::producer_identity(2, 3),
+    .first = model::offset(4),
+    .last = model::offset(6),
+  }};
+
+SEASTAR_THREAD_TEST_CASE(manifest_type_tx) {
+    tx_range_manifest m(segment_path);
+    BOOST_REQUIRE(m.get_manifest_type() == manifest_type::tx_range);
+}
+
+SEASTAR_THREAD_TEST_CASE(create_tx_manifest) {
+    tx_range_manifest m(segment_path);
+    auto path = m.get_manifest_path();
+    BOOST_REQUIRE_EQUAL(path, manifest_path);
+}
+
+SEASTAR_THREAD_TEST_CASE(empty_serialization_roundtrip_test) {
+    // Serialize a manifest without ranges into an in-memory buffer ...
+    tx_range_manifest original(segment_path);
+    auto [json_in, json_size] = original.serialize();
+    iobuf captured;
+    auto sink = make_iobuf_ref_output_stream(captured);
+    ss::copy(json_in, sink).get();
+    // ... and check that parsing it back yields an equal manifest.
+    tx_range_manifest restored(segment_path);
+    restored.update(make_iobuf_input_stream(std::move(captured))).get();
+    BOOST_REQUIRE(original == restored);
+}
+
+SEASTAR_THREAD_TEST_CASE(serialization_roundtrip_test) {
+    // Serialize a manifest with two tx ranges into an in-memory buffer ...
+    tx_range_manifest original(segment_path, ranges);
+    auto [json_in, json_size] = original.serialize();
+    iobuf captured;
+    auto sink = make_iobuf_ref_output_stream(captured);
+    ss::copy(json_in, sink).get();
+    // ... and check that parsing it back yields an equal manifest.
+    tx_range_manifest restored(segment_path);
+    restored.update(make_iobuf_input_stream(std::move(captured))).get();
+    BOOST_REQUIRE(original == restored);
+}

From 36710be006961fd3e30b990d9a9bcc5ceb9fc9a5 Mon Sep 17 00:00:00 2001
From: Evgeny Lazin <evgeny@vectorized.io>
Date: Fri, 1 Jul 2022 11:58:27 -0400
Subject: [PATCH 175/201] archival: Upload tx-range manifests

Upload tx_range_manifest during segment upload. The segment and the
manifest are uploaded in parallel. The segment upload is successful only
if both the segment and the tx-range are uploaded.
---
 src/v/archival/ntp_archiver_service.cc | 47 +++++++++++++++++++++++++-
 src/v/archival/ntp_archiver_service.h  |  8 ++++-
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/src/v/archival/ntp_archiver_service.cc b/src/v/archival/ntp_archiver_service.cc
index bdb481ae53430..fa181a7199da5 100644
--- a/src/v/archival/ntp_archiver_service.cc
+++ b/src/v/archival/ntp_archiver_service.cc
@@ -14,6 +14,7 @@
 #include "archival/logger.h"
 #include "cloud_storage/partition_manifest.h"
 #include "cloud_storage/remote.h"
+#include "cloud_storage/tx_range_manifest.h"
 #include "cloud_storage/types.h"
 #include "cluster/partition_manager.h"
 #include "model/metadata.h"
@@ -327,6 +328,34 @@ ntp_archiver::upload_segment(upload_candidate candidate) {
       _bucket, path, candidate.content_length, reset_func, fib);
 }
 
+ss::future<cloud_storage::upload_result>
+ntp_archiver::upload_tx(upload_candidate candidate) {
+    gate_guard guard{_gate};
+    retry_chain_node fib(
+      _segment_upload_timeout, _cloud_storage_initial_backoff, &_rtcnode);
+    retry_chain_logger ctxlog(archival_log, fib, _ntp.path());
+
+    vlog(
+      ctxlog.debug, "Uploading segment's tx range {}", candidate.exposed_name);
+
+    // Collect aborted transactions within the candidate's offset range.
+    auto tx_range = co_await _partition->aborted_transactions(
+      candidate.starting_offset, candidate.final_offset);
+    if (tx_range.empty()) {
+        // Nothing to upload: the manifest is only written when the segment
+        // has aborted transactions. A reader that gets NoSuchKey for the
+        // manifest is expected to treat the tx range as empty.
+        co_return cloud_storage::upload_result::success;
+    }
+
+    auto path = cloud_storage::generate_remote_segment_path(
+      _ntp, _rev, candidate.exposed_name, _start_term);
+
+    cloud_storage::tx_range_manifest manifest(path, tx_range);
+
+    co_return co_await _remote.upload_manifest(_bucket, manifest, fib);
+}
+
 ss::future<ntp_archiver::scheduled_upload> ntp_archiver::schedule_single_upload(
   model::offset start_upload_offset, model::offset last_stable_offset) {
     std::optional<storage::log> log = _partition_manager.log(_ntp);
@@ -415,8 +444,24 @@ ss::future<ntp_archiver::scheduled_upload> ntp_archiver::schedule_single_upload(
     start_upload_offset = offset + model::offset(1);
     auto delta
       = base - _partition->get_offset_translator_state()->from_log_offset(base);
+    // The upload is successful only if both segment and tx_range are uploaded.
+    auto upl_fut
+      = ss::when_all(upload_segment(upload), upload_tx(upload))
+          .then([](auto tup) {
+              auto [fs, ftx] = std::move(tup);
+              auto rs = fs.get();
+              auto rtx = ftx.get();
+              if (
+                rs == cloud_storage::upload_result::success
+                && rtx == cloud_storage::upload_result::success) {
+                  return rs;
+              } else if (rs != cloud_storage::upload_result::success) {
+                  return rs;
+              }
+              return rtx;
+          });
     co_return scheduled_upload{
-      .result = upload_segment(upload),
+      .result = std::move(upl_fut),
       .inclusive_last_offset = offset,
       .meta = cloud_storage::partition_manifest::segment_meta{
         .is_compacted = upload.source->is_compacted_segment(),
diff --git a/src/v/archival/ntp_archiver_service.h b/src/v/archival/ntp_archiver_service.h
index ca45e7c5868a0..c36a12c3fae6b 100644
--- a/src/v/archival/ntp_archiver_service.h
+++ b/src/v/archival/ntp_archiver_service.h
@@ -163,10 +163,16 @@ class ntp_archiver {
 
     /// Upload individual segment to S3.
     ///
-    /// \return true on success and false otherwise
+    /// \return error code
     ss::future<cloud_storage::upload_result>
     upload_segment(upload_candidate candidate);
 
+    /// Upload segment's transactions metadata to S3.
+    ///
+    /// \return error code
+    ss::future<cloud_storage::upload_result>
+    upload_tx(upload_candidate candidate);
+
     /// Upload manifest to the pre-defined S3 location
     ss::future<cloud_storage::upload_result> upload_manifest();
 

From d51546ac731654b38188b9e01539ae410bf26271 Mon Sep 17 00:00:00 2001
From: Evgeny Lazin <evgeny@vectorized.io>
Date: Mon, 4 Jul 2022 16:24:03 -0400
Subject: [PATCH 176/201] cloud_storage: Download manifest with aborted
 transactions

Download tx manifest as part of the segment hydration in remote_segment.
Previously, the background hydration fiber in the remote_segment
downloaded the segment file and materialized the segment index. It
handled several cases: the segment might be already in the cache or not.
The index file might exist in cache or not.

This commit implements hydration of the aborted transactions metadata
(which is stored in tx manifest). This manifest has to be downloaded
alongside the segment. It's tiny compared to segment so normally it's
not a big deal. But the problem is that the number of cases that the
background fiber has to handle increases. Every file in cache can be
present or not present or be in progress. With two files we have 9
possible combinations.

The operation is now split into two steps:
1. Hydration (downloading) of data files in SI cache
2. Materialization of in-memory structures

The hydration step deals with complexity by eliminating impossible
combinations (in_progress state is not possible since we have only one
background fiber per remote_segment instance and only one remote_segment
instance per actual segment). It hydrates both tx manifest and segment
or only one of the files if another one is present.

The materialization step follows the hydration step. The segment file is
used to materialize the segment index and open a file handle. The tx
manifest file is used to build the list of aborted transactions (which
is stored in memory). If one of the materialization goals can't be met
(because the data file was evicted from cache) the background fiber will
just repeat the hydration step. When the materialization step is
completed the remote_segment is self contained and the files can be
freely evicted by cache.
---
 src/v/cloud_storage/remote_segment.cc | 257 ++++++++++++++++++++++----
 src/v/cloud_storage/remote_segment.h  |  21 ++-
 2 files changed, 244 insertions(+), 34 deletions(-)

diff --git a/src/v/cloud_storage/remote_segment.cc b/src/v/cloud_storage/remote_segment.cc
index a6380aa2eb703..7c8ca39d6d419 100644
--- a/src/v/cloud_storage/remote_segment.cc
+++ b/src/v/cloud_storage/remote_segment.cc
@@ -13,7 +13,9 @@
 #include "bytes/iobuf.h"
 #include "cloud_storage/cache_service.h"
 #include "cloud_storage/logger.h"
+#include "cloud_storage/partition_manifest.h"
 #include "cloud_storage/remote_segment_index.h"
+#include "cloud_storage/tx_range_manifest.h"
 #include "cloud_storage/types.h"
 #include "config/configuration.h"
 #include "model/fundamental.h"
@@ -32,10 +34,12 @@
 #include <seastar/core/loop.hh>
 #include <seastar/core/lowres_clock.hh>
 #include <seastar/core/queue.hh>
+#include <seastar/core/seastar.hh>
 #include <seastar/core/semaphore.hh>
 #include <seastar/core/temporary_buffer.hh>
 #include <seastar/core/timed_out_error.hh>
 #include <seastar/core/when_all.hh>
+#include <seastar/coroutine/all.hh>
 #include <seastar/util/defer.hh>
 #include <seastar/util/log.hh>
 
@@ -232,7 +236,7 @@ remote_segment::maybe_get_offsets(model::offset kafka_offset) {
     return pos;
 }
 
-ss::future<> remote_segment::do_hydrate() {
+ss::future<> remote_segment::do_hydrate_segment() {
     auto callback = [this](
                       uint64_t size_bytes,
                       ss::input_stream<char> s) -> ss::future<uint64_t> {
@@ -296,6 +300,106 @@ ss::future<> remote_segment::do_hydrate() {
     }
 }
 
+ss::future<> remote_segment::do_hydrate_txrange() {
+    ss::gate::holder guard(_gate);
+    retry_chain_node local_rtc(
+      cache_hydration_timeout, cache_hydration_backoff, &_rtc);
+    // Starts out empty; presumably filled in by download_manifest below.
+    tx_range_manifest manifest(_path);
+    // NOTE(review): no call here stores the manifest in _cache - confirm.
+    auto res = co_await _api.download_manifest(
+      _bucket, manifest.get_manifest_path(), manifest, local_rtc);
+    // notfound only logs; any other failure throws download_exception.
+    if (res == download_result::notfound) {
+        vlog(
+          _ctxlog.debug,
+          "tx_range {}, doesn't exist in the bucket",
+          manifest.get_manifest_path());
+    } else if (res != download_result::success) {
+        vlog(
+          _ctxlog.debug,
+          "Failed to hydrating a tx_range {}, {} waiter will be "
+          "invoked",
+          manifest.get_manifest_path(),
+          _wait_list.size());
+        throw download_exception(res, _path);
+    }
+    _tx_range = std::move(manifest).get_tx_range(); // also on notfound
+}
+
+ss::future<bool> remote_segment::do_materialize_segment() {
+    if (_data_file) {
+        co_return true;
+    }
+    auto maybe_file = co_await _cache.get(_path);
+    if (!maybe_file) {
+        // We could get here because the cache check returned
+        // 'cache_element_status::available' but right after
+        // that the file was evicted from cache. It's also
+        // possible (but very unlikely) that we got here after
+        // successful hydration which was immediately followed
+        // by eviction. In any case we should just re-hydrate
+        // the segment.
+        vlog(
+          _ctxlog.info,
+          "Segment {} was deleted from cache and need to be "
+          "re-hydrated, {} waiter are pending",
+          _path,
+          _wait_list.size());
+        co_return false;
+    }
+    _data_file = maybe_file->body;
+    if (!_index) {
+        // Materialize index state if it's not materialized yet.
+        // If do_hydrate_segment was called _index will be populated
+        // and this branch won't be triggered. If the segment was
+        // available on disk then this branch will read it and populate
+        // the _index.
+        co_await maybe_materialize_index();
+    }
+    co_return true;
+}
+
+ss::future<bool> remote_segment::do_materialize_txrange() {
+    if (_tx_range) { // already materialized, nothing to do
+        co_return true;
+    }
+    auto path = generate_remote_tx_path(_path);
+    if (auto cache_item = co_await _cache.get(path); cache_item.has_value()) {
+        // The cache item is expected to be present when this method is
+        // called; parse it and keep the ranges in memory.
+        vlog(_ctxlog.info, "Trying to materialize tx_range '{}'", path);
+        tx_range_manifest manifest(_path);
+        try {
+            ss::file_input_stream_options options{};
+            options.buffer_size
+              = config::shard_local_cfg().storage_read_buffer_size;
+            options.read_ahead
+              = config::shard_local_cfg().storage_read_readahead_count;
+            options.io_priority_class
+              = priority_manager::local().shadow_indexing_priority();
+            auto inp_stream = ss::make_file_input_stream(
+              cache_item->body, options);
+            co_await manifest.update(std::move(inp_stream));
+            _tx_range = std::move(manifest).get_tx_range();
+        } catch (...) {
+            vlog(
+              _ctxlog.warn,
+              "Failed to materialize tx_range '{}'. Error: {}",
+              path,
+              std::current_exception());
+        }
+        co_await cache_item->body.close(); // closed on success and failure
+    } else {
+        vlog(
+          _ctxlog.info,
+          "tx_range '{}' is not available in cache, retrying",
+          path);
+        co_return false;
+    }
+    co_return true; // NOTE(review): also reached when parsing failed above
+}
+
 ss::future<> remote_segment::maybe_materialize_index() {
     ss::gate::holder guard(_gate);
     auto path = _path().native() + ".index";
@@ -342,6 +446,59 @@ ss::future<> remote_segment::maybe_materialize_index() {
     }
 }
 
+// NOTE: Aborted transactions handled using tx_range manifests.
+// The manifests are uploaded alongside the segments with (.tx)
+// suffix added to the name. The hydration of tx_range manifest
+// is not optional. We can't use the segment without it. The following
+// cases are possible:
+// - Both segment and tx-range are not hydrated;
+// - The segment is hydrated but tx-range isn't
+// - The segment is not hydrated but tx-range is
+// - Both segment and tx-range are hydrated
+// This doesn't include various 'in_progress' combinations which are
+// disallowed.
+//
+// Also, both segment and tx-range can be materialized or not. In case
+// of the segment this means that we're holding an open file handle.
+// In case of tx-range this means that we parsed the json and populated
+// the _tx_range collection.
+//
+// In order to be able to deal with the complexity this code combines
+// the flags and tries to handle all combinations that make sense.
+enum class segment_txrange_status {
+    in_progress,             // segment or tx-range is in_progress
+    available,               // both segment and tx-range are available
+    not_available,           // neither segment nor tx-range is available
+    available_not_available, // segment available, tx-range not available
+    not_available_available, // segment not available, tx-range available
+};
+
+static segment_txrange_status
+combine_statuses(cache_element_status segment, cache_element_status tx_range) {
+    // Combine the cache states of the segment and its tx-range manifest.
+    // 'in_progress' on either side wins; otherwise the result encodes which
+    // of the two elements is available (segment first, tx-range second).
+    // Written as a flat chain so that every path returns a value; nested
+    // switches without a trailing return could fall through to the next
+    // case or run off the end of the function.
+    using status = cache_element_status;
+    if (segment == status::in_progress || tx_range == status::in_progress) {
+        return segment_txrange_status::in_progress;
+    }
+    const bool has_segment = segment == status::available;
+    const bool has_tx_range = tx_range == status::available;
+    if (has_segment && has_tx_range) {
+        return segment_txrange_status::available;
+    }
+    if (has_segment) {
+        return segment_txrange_status::available_not_available;
+    }
+    if (has_tx_range) {
+        return segment_txrange_status::not_available_available;
+    }
+    return segment_txrange_status::not_available;
+}
+
 ss::future<> remote_segment::run_hydrate_bg() {
     ss::gate::holder guard(_gate);
     try {
@@ -361,57 +518,62 @@ ss::future<> remote_segment::run_hydrate_bg() {
                 // and retrieve the file out of it or hydrate.
                 // If _data_file is initialized we can use it safely since the
                 // cache can't delete it until we close it.
-                auto status = co_await _cache.is_cached(_path);
+                auto tx_path = generate_remote_tx_path(_path);
+                auto segment_status = co_await _cache.is_cached(_path);
+                auto txrange_status = co_await _cache.is_cached(tx_path);
+                auto status = combine_statuses(segment_status, txrange_status);
                 switch (status) {
-                case cache_element_status::in_progress:
+                case segment_txrange_status::in_progress:
                     vassert(
                       false,
-                      "Hydration of segment {} is already in progress, {} "
-                      "waiters",
+                      "Hydration of segment or tx-manifest {} is already in "
+                      "progress, {} waiters",
                       _path,
                       _wait_list.size());
-                case cache_element_status::available:
+                case segment_txrange_status::available:
                     vlog(
                       _ctxlog.debug,
                       "Hydrated segment {} is already available, {} waiters "
-                      "will "
-                      "be invoked",
+                      "will be invoked",
                       _path,
                       _wait_list.size());
                     break;
-                case cache_element_status::not_available: {
-                    vlog(_ctxlog.info, "Hydrating segment {}", _path);
+                case segment_txrange_status::not_available:
+                    vlog(
+                      _ctxlog.info,
+                      "Hydrating segment and tx-manifest {}",
+                      _path);
                     try {
-                        co_await do_hydrate();
+                        co_await ss::coroutine::all(
+                          [this] { return do_hydrate_segment(); },
+                          [this] { return do_hydrate_txrange(); });
                     } catch (const download_exception&) {
                         err = std::current_exception();
                     }
-                } break;
+                    break;
+                case segment_txrange_status::not_available_available:
+                    vlog(_ctxlog.info, "Hydrating only segment {}", _path);
+                    try {
+                        co_await do_hydrate_segment();
+                    } catch (const download_exception&) {
+                        err = std::current_exception();
+                    }
+                    break;
+                case segment_txrange_status::available_not_available:
+                    vlog(_ctxlog.info, "Hydrating only tx-manifest {}", _path);
+                    try {
+                        co_await do_hydrate_txrange();
+                    } catch (const download_exception&) {
+                        err = std::current_exception();
+                    }
+                    break;
                 }
                 if (!err) {
-                    auto maybe_file = co_await _cache.get(_path);
-                    if (!maybe_file) {
-                        // We could got here because the cache check returned
-                        // 'cache_element_status::available' but right after
-                        // that the file was evicted from cache. It's also
-                        // possible (but very unlikely) that we got here after
-                        // successful hydration which was immediately followed
-                        // by eviction. In any case we should just re-hydrate
-                        // the segment. The 'wait' on cond-variable won't block
-                        // because the
-                        // '_wait_list' is not empty.
-                        vlog(
-                          _ctxlog.info,
-                          "Segment {} was deleted from cache and need to be "
-                          "re-hydrated, {} waiter are pending",
-                          _path,
-                          _wait_list.size());
+                    if (co_await do_materialize_segment() == false) {
                         continue;
                     }
-                    _data_file = maybe_file->body;
-                    if (!_index) {
-                        // materialize index state
-                        co_await maybe_materialize_index();
+                    if (co_await do_materialize_txrange() == false) {
+                        continue;
                     }
                 }
             }
@@ -454,6 +616,35 @@ ss::future<> remote_segment::hydrate() {
     });
 }
 
+ss::future<std::vector<cluster::rm_stm::tx_range>>
+remote_segment::aborted_transactions(model::offset from, model::offset to) {
+    co_await hydrate();
+    std::vector<cluster::rm_stm::tx_range> overlapping;
+    if (!_tx_range.has_value()) {
+        // No tx-range metadata was materialized for this segment. It is
+        // treated as a segment without record batches that belong to
+        // aborted transactions, so the result is empty.
+        vlog(_ctxlog.debug, "segment {} no tx-metadata available", _path);
+        co_return overlapping;
+    }
+    // Keep every range that intersects the inclusive interval [from, to].
+    for (const auto& range : _tx_range.value()) {
+        const bool before_interval = range.last < from;
+        const bool after_interval = range.first > to;
+        if (!before_interval && !after_interval) {
+            overlapping.push_back(range);
+        }
+    }
+    const auto num_found = overlapping.size();
+    vlog(
+      _ctxlog.debug,
+      "found {} aborted transactions for {}-{} offset range in this segment",
+      num_found,
+      from,
+      to);
+    co_return overlapping;
+}
+
 /// Batch consumer that connects to remote_segment_batch_reader.
 /// It also does offset translation based on incomplete data in
 /// manifests.
diff --git a/src/v/cloud_storage/remote_segment.h b/src/v/cloud_storage/remote_segment.h
index 3a2e8e2277a06..ecfeca01bec9c 100644
--- a/src/v/cloud_storage/remote_segment.h
+++ b/src/v/cloud_storage/remote_segment.h
@@ -17,6 +17,7 @@
 #include "cloud_storage/remote.h"
 #include "cloud_storage/remote_segment_index.h"
 #include "cloud_storage/types.h"
+#include "cluster/rm_stm.h"
 #include "model/fundamental.h"
 #include "model/record.h"
 #include "s3/client.h"
@@ -118,6 +119,13 @@ class remote_segment final {
 
     bool download_in_progress() const noexcept { return !_wait_list.empty(); }
 
+    /// Return aborted transactions metadata associated with the segment
+    ///
+    /// \param from start redpanda offset
+    /// \param to end redpanda offset
+    ss::future<std::vector<cluster::rm_stm::tx_range>>
+    aborted_transactions(model::offset from, model::offset to);
+
 private:
     /// get a file offset for the corresponding kafka offset
     /// if the index is available
@@ -133,7 +141,15 @@ class remote_segment final {
 
     /// Actually hydrate the segment. The method downloads the segment file
     /// to the cache dir and updates the segment index.
-    ss::future<> do_hydrate();
+    ss::future<> do_hydrate_segment();
+    /// Hydrate tx manifest. Method downloads the manifest file to the cache
+    /// dir.
+    ss::future<> do_hydrate_txrange();
+    /// Materialize segment. Segment has to be hydrated beforehand. The
+    /// 'materialization' process opens file handle and creates
+    /// compressed segment index in memory.
+    ss::future<bool> do_materialize_segment();
+    ss::future<bool> do_materialize_txrange();
 
     /// Load segment index from file (if available)
     ss::future<> maybe_materialize_index();
@@ -162,6 +178,9 @@ class remote_segment final {
 
     ss::file _data_file;
     std::optional<offset_index> _index;
+
+    using tx_range_vec = fragmented_vector<cluster::rm_stm::tx_range>;
+    std::optional<tx_range_vec> _tx_range;
 };
 
 class remote_segment_batch_consumer;

From 8aac5692d55e95aeed6dafa8312071226456899f Mon Sep 17 00:00:00 2001
From: Evgeny Lazin <evgeny@vectorized.io>
Date: Thu, 7 Jul 2022 09:36:38 -0400
Subject: [PATCH 177/201] cloud_storage: Propagate aborted transactions
 metadata

Add method that returns aborted txn's state in the remote_partition.
Also, make it available in the cluster partition.

The method hydrates segments in the requested range if needed. Normally
the hydration won't be needed because 'aborted_transactions' method is
called after the actual read from 'remote_partition' so the segments
will most likely be already hydrated and materialized.
---
 src/v/cloud_storage/remote_partition.cc | 49 +++++++++++++++++++++++++
 src/v/cloud_storage/remote_partition.h  | 11 ++++++
 src/v/cluster/partition.h               |  5 +++
 3 files changed, 65 insertions(+)

diff --git a/src/v/cloud_storage/remote_partition.cc b/src/v/cloud_storage/remote_partition.cc
index ef90f003ad621..b8e6a342a64ae 100644
--- a/src/v/cloud_storage/remote_partition.cc
+++ b/src/v/cloud_storage/remote_partition.cc
@@ -29,6 +29,7 @@
 
 #include <chrono>
 #include <exception>
+#include <iterator>
 #include <variant>
 
 using namespace std::chrono_literals;
@@ -498,6 +499,54 @@ remote_partition::get_term_last_offset(model::term_id term) const {
     return std::nullopt;
 }
 
+ss::future<std::vector<cluster::rm_stm::tx_range>>
+remote_partition::aborted_transactions(offset_range offsets) {
+    // Here we have to use kafka offsets to locate the segments and
+    // redpanda offsets to extract aborted transactions metadata because
+    // tx-manifests contain redpanda offsets.
+    std::vector<cluster::rm_stm::tx_range> result;
+    auto first_it = _segments.upper_bound(offsets.begin);
+    if (first_it != _segments.begin()) {
+        first_it = std::prev(first_it); // start one entry before upper_bound
+    }
+    for (auto it = first_it; it != _segments.end(); it++) {
+        if (it->first > offsets.end) { // entry key is past the kafka range
+            break;
+        }
+        auto& st = it->second;
+        auto tx = co_await ss::visit(
+          st,
+          [this, &st, offsets, offset_key = it->first](
+            offloaded_segment_state& off_state) {
+              auto tmp = off_state->materialize(*this, offset_key);
+              auto res = tmp->segment->aborted_transactions(
+                offsets.begin_rp, offsets.end_rp);
+              st = std::move(tmp); // off_state must not be used after this
+              return res;
+          },
+          [offsets](materialized_segment_ptr& m_state) {
+              return m_state->segment->aborted_transactions(
+                offsets.begin_rp, offsets.end_rp);
+          });
+        std::copy(tx.begin(), tx.end(), std::back_inserter(result));
+    }
+    // Adjacent segments might return the same transaction record.
+    // In this case we will have a duplicate. The duplicates will always
+    // be located next to each other in the sequence.
+    auto last = std::unique(result.begin(), result.end());
+    result.erase(last, result.end());
+    vlog(
+      _ctxlog.debug,
+      "found {} aborted transactions for {}-{} offset range ({}-{} before "
+      "offset translation)",
+      result.size(),
+      offsets.begin_rp,
+      offsets.end_rp,
+      offsets.begin,
+      offsets.end);
+    co_return result;
+}
+
 ss::future<> remote_partition::stop() {
     vlog(_ctxlog.debug, "remote partition stop {} segments", _segments.size());
     _stm_timer.cancel();
diff --git a/src/v/cloud_storage/remote_partition.h b/src/v/cloud_storage/remote_partition.h
index 929c9010c858f..3d14fce357291 100644
--- a/src/v/cloud_storage/remote_partition.h
+++ b/src/v/cloud_storage/remote_partition.h
@@ -133,6 +133,13 @@ class btree_map_stable_iterator
 
 } // namespace details
 
+struct offset_range {
+    model::offset begin;    // start of the range, kafka offset
+    model::offset end;      // end of the range, kafka offset
+    model::offset begin_rp; // `begin` as a redpanda (log) offset
+    model::offset end_rp;   // `end` as a redpanda (log) offset
+};
+
 /// Remote partition manintains list of remote segments
 /// and list of active readers. Only one reader can be
 /// maintained per segment. The idea here is that the
@@ -192,6 +199,10 @@ class remote_partition
     // returns term last kafka offset
     std::optional<model::offset> get_term_last_offset(model::term_id) const;
 
+    // Get list of aborted transactions that overlap with the offset range
+    ss::future<std::vector<cluster::rm_stm::tx_range>>
+    aborted_transactions(offset_range offsets);
+
 private:
     /// Create new remote_segment instances for all new
     /// items in the manifest.
diff --git a/src/v/cluster/partition.h b/src/v/cluster/partition.h
index 8263b97517610..cae61e69878d5 100644
--- a/src/v/cluster/partition.h
+++ b/src/v/cluster/partition.h
@@ -213,6 +213,11 @@ class partition {
         return _rm_stm->aborted_transactions(from, to);
     }
 
+    ss::future<std::vector<rm_stm::tx_range>>
+    aborted_transactions_cloud(cloud_storage::offset_range offsets) {
+        return _cloud_storage_partition->aborted_transactions(offsets);
+    }
+
     const ss::shared_ptr<cluster::archival_metadata_stm>&
     archival_meta_stm() const {
         return _archival_meta_stm;

From 8be2b135e9a96fc126dde59914726048884ebca2 Mon Sep 17 00:00:00 2001
From: Evgeny Lazin <evgeny@vectorized.io>
Date: Thu, 7 Jul 2022 09:42:51 -0400
Subject: [PATCH 178/201] kafka: Use new aborted txn metadata

Use aborted transactions metadata from the remote_partition when rm_stm
snapshot doesn't have the data.

The aborted transactions snapshot doesn't guarantee that the data will
always be available. In some cases (partition movement) the snapshot
will be re-created using local data. If some segments were removed by
retention they won't be covered by the snapshot anymore.

Previously, the code always used the snapshot to get information about
aborted transactions. After this change we will use new data source in
SI for that if the data came from SI and we will use rm_stm snapshot
otherwise.
---
 src/v/kafka/server/replicated_partition.cc | 96 +++++++++++++++++++---
 src/v/kafka/server/replicated_partition.h  | 10 +++
 2 files changed, 96 insertions(+), 10 deletions(-)

diff --git a/src/v/kafka/server/replicated_partition.cc b/src/v/kafka/server/replicated_partition.cc
index 6c2583f6b29fb..4bce1aa8766e4 100644
--- a/src/v/kafka/server/replicated_partition.cc
+++ b/src/v/kafka/server/replicated_partition.cc
@@ -22,6 +22,7 @@
 #include "storage/types.h"
 
 #include <seastar/core/coroutine.hh>
+#include <seastar/core/future.hh>
 
 #include <optional>
 
@@ -115,12 +116,9 @@ ss::future<storage::translating_reader> replicated_partition::make_reader(
 }
 
 ss::future<std::vector<cluster::rm_stm::tx_range>>
-replicated_partition::aborted_transactions(
-  model::offset base,
-  model::offset last,
+replicated_partition::aborted_transactions_local(
+  cloud_storage::offset_range offsets,
   ss::lw_shared_ptr<const storage::offset_translator_state> ot_state) {
-    vassert(ot_state, "ntp {}: offset translator state must be present", ntp());
-
     // Note: here we expect that local _partition contains aborted transaction
     // ids for both local and remote offset ranges. This is true as long as
     // rm_stm state has not been reset (for example when there is a partition
@@ -128,14 +126,13 @@ replicated_partition::aborted_transactions(
     // eviction point). See
     // https://github.com/redpanda-data/redpanda/issues/3001
 
-    auto base_rp = ot_state->to_log_offset(base);
-    auto last_rp = ot_state->to_log_offset(last);
-    auto source = co_await _partition->aborted_transactions(base_rp, last_rp);
+    auto source = co_await _partition->aborted_transactions(
+      offsets.begin_rp, offsets.end_rp);
 
     // We trim beginning of aborted ranges to `trim_at` because we don't have
     // offset translation info for earlier offsets.
     model::offset trim_at;
-    if (base_rp >= _partition->start_offset()) {
+    if (offsets.begin_rp >= _partition->start_offset()) {
         // Local fetch. Trim to start of the log - it is safe because clients
         // can't read earlier offsets.
         trim_at = _partition->start_offset();
@@ -144,7 +141,7 @@ replicated_partition::aborted_transactions(
         // incorrect because clients can still see earlier offsets but will work
         // if they won't use aborted ranges from this request to filter batches
         // belonging to earlier offsets.
-        trim_at = base_rp;
+        trim_at = offsets.begin_rp;
     }
 
     std::vector<cluster::rm_stm::tx_range> target;
@@ -159,6 +156,85 @@ replicated_partition::aborted_transactions(
     co_return target;
 }
 
+ss::future<std::vector<cluster::rm_stm::tx_range>>
+replicated_partition::aborted_transactions_remote(
+  cloud_storage::offset_range offsets,
+  ss::lw_shared_ptr<const storage::offset_translator_state> ot_state) {
+    auto source = co_await _partition->aborted_transactions_cloud(offsets);
+    std::vector<cluster::rm_stm::tx_range> target;
+    target.reserve(source.size());
+    for (const auto& range : source) { // map log offsets via from_log_offset
+        target.push_back(cluster::rm_stm::tx_range{
+          .pid = range.pid,
+          .first = ot_state->from_log_offset(
+            std::max(offsets.begin_rp, range.first)), // clamp to range start
+          .last = ot_state->from_log_offset(range.last)});
+    }
+    co_return target;
+}
+
+ss::future<std::vector<cluster::rm_stm::tx_range>>
+replicated_partition::aborted_transactions(
+  model::offset base,
+  model::offset last,
+  ss::lw_shared_ptr<const storage::offset_translator_state> ot_state) {
+    // We can extract information about aborted transactions from local raft log
+    // or from the S3 bucket. The decision is made using the following logic:
+    // - if the record batches were produced by shadow indexing (downloaded
+    //   from S3)
+    //   then we should use the same source for transactions metadata. It's
+    //   guaranteed that in this case we will find the corresponding manifest
+    //   (it's downloaded alongside the segment to SI cache). This also means
+    //   that we will have the manifests hydrated on disk (since we just
+    //   downloaded corresponding segments from S3 to produce batches).
+    // - if the source of data is local raft log then we should use the
+    //   aborted transactions
+    //   snapshot.
+    //
+    // Sometimes the snapshot will have data for the offset range even if the
+    // source is S3 bucket. In this case we won't be using this data because
+    // it's not guaranteed that it has the data for the entire offset range and
+    // we won't be able to tell the difference by looking at the results (for
+    // instance, the offset range is 0-100, but the snapshot has data starting
+    // from offset 50, it will return data for range 50-100 and we won't be able
+    // to tell if it didn't have data for 0-50 or there weren't any transactions
+    // in that range).
+    vassert(ot_state, "ntp {}: offset translator state must be present", ntp());
+    auto base_rp = ot_state->to_log_offset(base);
+    auto last_rp = ot_state->to_log_offset(last);
+    cloud_storage::offset_range offsets = {
+      .begin = base,
+      .end = last,
+      .begin_rp = base_rp,
+      .end_rp = last_rp,
+    };
+    if (_partition->is_read_replica_mode_enabled()) {
+        // Always use SI for read replicas
+        co_return co_await aborted_transactions_remote(offsets, ot_state);
+    }
+    if (
+      _partition->cloud_data_available()
+      && offsets.begin_rp < _partition->start_offset()) {
+        // The fetch request was satisfied using shadow indexing.
+        auto tx_remote = co_await aborted_transactions_remote(
+          offsets, ot_state);
+        if (!tx_remote.empty()) {
+            // NOTE: we don't have a way to upload tx-manifests to the cloud
+            // for segments which were uploaded by old redpanda version because
+            // we can't guarantee that the local snapshot still has the data.
+            // This means that 'aborted_transactions_remote' might return empty
+            // result in case if the segment was uploaded by previous version of
+            // redpanda. In this case we will try to fetch the aborted
+            // transactions metadata from local snapshot. This approach provides
+            // the same guarantees that we have in v22.1 for data produced by
+            // v22.1 and earlier. But for new data we will guarantee that the
+            // metadata is always available in S3.
+            co_return tx_remote;
+        }
+    }
+    co_return co_await aborted_transactions_local(offsets, ot_state);
+}
+
 ss::future<std::optional<storage::timequery_result>>
 replicated_partition::timequery(storage::timequery_config cfg) {
     return _partition->timequery(cfg).then(
diff --git a/src/v/kafka/server/replicated_partition.h b/src/v/kafka/server/replicated_partition.h
index 187a2f4e89ce5..e442021f1c2e3 100644
--- a/src/v/kafka/server/replicated_partition.h
+++ b/src/v/kafka/server/replicated_partition.h
@@ -125,6 +125,16 @@ class replicated_partition final : public kafka::partition_proxy::impl {
       model::offset, model::timeout_clock::time_point) final;
 
 private:
+    ss::future<std::vector<cluster::rm_stm::tx_range>>
+      aborted_transactions_local(
+        cloud_storage::offset_range,
+        ss::lw_shared_ptr<const storage::offset_translator_state>);
+
+    ss::future<std::vector<cluster::rm_stm::tx_range>>
+    aborted_transactions_remote(
+      cloud_storage::offset_range offsets,
+      ss::lw_shared_ptr<const storage::offset_translator_state> ot_state);
+
     ss::lw_shared_ptr<cluster::partition> _partition;
     ss::lw_shared_ptr<const storage::offset_translator_state> _translator;
 };

From 57a9a8ecb3deb56a8d7426b013f32a0d4fd0897b Mon Sep 17 00:00:00 2001
From: Ben Pope <ben@redpanda.com>
Date: Sat, 16 Jul 2022 14:28:25 +0100
Subject: [PATCH 179/201] net: Remove security::tls::principal_mapper

Revert 77853dbf5b74df1058090ef1621bc035d3c70d31

Signed-off-by: Ben Pope <ben@redpanda.com>
---
 src/v/net/connection.cc |  6 ++----
 src/v/net/connection.h  | 10 +---------
 src/v/net/server.cc     | 12 +-----------
 src/v/net/server.h      | 19 -------------------
 4 files changed, 4 insertions(+), 43 deletions(-)

diff --git a/src/v/net/connection.cc b/src/v/net/connection.cc
index 52bee6c65ac1e..8c617f1d5dca7 100644
--- a/src/v/net/connection.cc
+++ b/src/v/net/connection.cc
@@ -56,16 +56,14 @@ connection::connection(
   ss::connected_socket f,
   ss::socket_address a,
   server_probe& p,
-  std::optional<size_t> in_max_buffer_size,
-  std::optional<security::tls::principal_mapper> tls_pm)
+  std::optional<size_t> in_max_buffer_size)
   : addr(a)
   , _hook(hook)
   , _name(std::move(name))
   , _fd(std::move(f))
   , _in(_fd.input())
   , _out(_fd.output())
-  , _probe(p)
-  , _tls_pm(std::move(tls_pm)) {
+  , _probe(p) {
     if (in_max_buffer_size.has_value()) {
         auto in_config = ss::connected_socket_input_stream_config{};
         in_config.max_buffer_size = in_max_buffer_size.value();
diff --git a/src/v/net/connection.h b/src/v/net/connection.h
index 9fb6fae352c58..d5fbad17a7033 100644
--- a/src/v/net/connection.h
+++ b/src/v/net/connection.h
@@ -14,7 +14,6 @@
 #include "net/batched_output_stream.h"
 #include "net/server_probe.h"
 #include "seastarx.h"
-#include "security/mtls.h"
 
 #include <seastar/core/iostream.hh>
 #include <seastar/net/api.hh>
@@ -39,8 +38,7 @@ class connection : public boost::intrusive::list_base_hook<> {
       ss::connected_socket f,
       ss::socket_address a,
       server_probe& p,
-      std::optional<size_t> in_max_buffer_size,
-      std::optional<security::tls::principal_mapper> tls_pm);
+      std::optional<size_t> in_max_buffer_size);
     ~connection() noexcept;
     connection(const connection&) = delete;
     connection& operator=(const connection&) = delete;
@@ -64,11 +62,6 @@ class connection : public boost::intrusive::list_base_hook<> {
         return ss::tls::get_dn_information(_fd);
     }
 
-    const std::optional<security::tls::principal_mapper>&
-    get_principal_mapping() const {
-        return _tls_pm;
-    }
-
 private:
     boost::intrusive::list<connection>& _hook;
     ss::sstring _name;
@@ -76,7 +69,6 @@ class connection : public boost::intrusive::list_base_hook<> {
     ss::input_stream<char> _in;
     net::batched_output_stream _out;
     server_probe& _probe;
-    std::optional<security::tls::principal_mapper> _tls_pm;
 };
 
 } // namespace net
diff --git a/src/v/net/server.cc b/src/v/net/server.cc
index 682b25ae8e562..ca3f675c058cb 100644
--- a/src/v/net/server.cc
+++ b/src/v/net/server.cc
@@ -215,23 +215,13 @@ ss::future<> server::accept(listener& s) {
                   }
               }
 
-              std::optional<security::tls::principal_mapper> tls_pm;
-              auto se_it = std::find_if(
-                cfg.addrs.begin(), cfg.addrs.end(), [&name](const auto& a) {
-                    return a.name == name;
-                });
-              if (se_it != cfg.addrs.end()) {
-                  tls_pm = se_it->principal_mapper;
-              }
-
               auto conn = ss::make_lw_shared<net::connection>(
                 _connections,
                 name,
                 std::move(ar.connection),
                 ar.remote_address,
                 _probe,
-                cfg.stream_recv_buf,
-                tls_pm);
+                cfg.stream_recv_buf);
               vlog(
                 rpc::rpclog.trace,
                 "{} - Incoming connection from {} on \"{}\"",
diff --git a/src/v/net/server.h b/src/v/net/server.h
index 6c88cccadc421..487b870c92458 100644
--- a/src/v/net/server.h
+++ b/src/v/net/server.h
@@ -16,7 +16,6 @@
 #include "net/connection.h"
 #include "net/connection_rate.h"
 #include "net/types.h"
-#include "security/mtls.h"
 #include "utils/hdr_hist.h"
 
 #include <seastar/core/abort_source.hh>
@@ -43,7 +42,6 @@ struct server_endpoint {
     ss::sstring name;
     ss::socket_address addr;
     ss::shared_ptr<ss::tls::server_credentials> credentials;
-    std::optional<security::tls::principal_mapper> principal_mapper;
 
     server_endpoint(ss::sstring name, ss::socket_address addr)
       : name(std::move(name))
@@ -57,28 +55,11 @@ struct server_endpoint {
       , addr(addr)
       , credentials(std::move(creds)) {}
 
-    server_endpoint(
-      ss::sstring name,
-      ss::socket_address addr,
-      ss::shared_ptr<ss::tls::server_credentials> creds,
-      std::optional<security::tls::principal_mapper> principal_mapper)
-      : name(std::move(name))
-      , addr(addr)
-      , credentials(std::move(creds))
-      , principal_mapper(std::move(principal_mapper)) {}
-
     server_endpoint(
       ss::socket_address addr,
       ss::shared_ptr<ss::tls::server_credentials> creds)
       : server_endpoint("", addr, std::move(creds)) {}
 
-    server_endpoint(
-      ss::socket_address addr,
-      ss::shared_ptr<ss::tls::server_credentials> creds,
-      security::tls::principal_mapper principal_mapper)
-      : server_endpoint(
-        "", addr, std::move(creds), std::move(principal_mapper)) {}
-
     explicit server_endpoint(ss::socket_address addr)
       : server_endpoint("", addr) {}
 

From 7bd39299ed5719f99db229d2e8ab9711eb6e925d Mon Sep 17 00:00:00 2001
From: Noah Watkins <noahwatkins@gmail.com>
Date: Sat, 16 Jul 2022 12:46:32 -0700
Subject: [PATCH 180/201] cluster: add default ctor to partition_balancer types

Needed for serde support.

Signed-off-by: Noah Watkins <noahwatkins@gmail.com>
---
 src/v/cluster/partition_balancer_types.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/v/cluster/partition_balancer_types.h b/src/v/cluster/partition_balancer_types.h
index 7131f77bb1fab..f562ed813bd14 100644
--- a/src/v/cluster/partition_balancer_types.h
+++ b/src/v/cluster/partition_balancer_types.h
@@ -49,6 +49,7 @@ struct partition_balancer_violations
         model::node_id id;
         model::timestamp unavailable_since;
 
+        unavailable_node() noexcept = default;
         unavailable_node(model::node_id id, model::timestamp unavailable_since)
           : id(id)
           , unavailable_since(unavailable_since) {}
@@ -65,6 +66,7 @@ struct partition_balancer_violations
         model::node_id id;
         uint32_t disk_used_percent;
 
+        full_node() noexcept = default;
         full_node(model::node_id id, uint32_t disk_used_percent)
           : id(id)
           , disk_used_percent(disk_used_percent) {}

From 1f640f12fea7de02850f742cf5000c5948621e03 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@redpanda.com>
Date: Fri, 27 May 2022 16:52:17 -0700
Subject: [PATCH 181/201] cluster: remove unused service

Signed-off-by: Noah Watkins <noah@redpanda.com>
---
 src/v/cluster/controller.h | 1 -
 src/v/cluster/fwd.h        | 1 -
 2 files changed, 2 deletions(-)

diff --git a/src/v/cluster/controller.h b/src/v/cluster/controller.h
index b560b744ab789..509c9f89ff486 100644
--- a/src/v/cluster/controller.h
+++ b/src/v/cluster/controller.h
@@ -136,7 +136,6 @@ class controller {
     ss::sharded<topics_frontend> _tp_frontend;       // instance per core
     ss::sharded<controller_backend> _backend;        // instance per core
     ss::sharded<controller_stm> _stm;                // single instance
-    ss::sharded<controller_service> _service;        // instance per core
     ss::sharded<controller_api> _api;                // instance per core
     ss::sharded<members_frontend> _members_frontend; // instance per core
     ss::sharded<members_backend> _members_backend;   // single instance
diff --git a/src/v/cluster/fwd.h b/src/v/cluster/fwd.h
index c1c28a36e85c2..726f9c59df70f 100644
--- a/src/v/cluster/fwd.h
+++ b/src/v/cluster/fwd.h
@@ -15,7 +15,6 @@ namespace cluster {
 
 class controller;
 class controller_backend;
-class controller_service;
 class controller_stm_shard;
 class id_allocator_frontend;
 class rm_partition_frontend;

From cc3c5ef77b99b2a94a03e7a636e105bb59791309 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@redpanda.com>
Date: Thu, 26 May 2022 18:17:52 -0700
Subject: [PATCH 182/201] rpc: avoid unclean transport shutdown on test failure

A test failure skipped stopping the transport and caused
internal assertion failures:

    ERROR 2022-05-26 18:15:09,185 [shard 0] assert - Assert failure:
    (../../../src/v/rpc/transport.cc:272) '!is_valid()' connection
    '(server:{host: 127.0.0.1, port: 32147}, _correlations:0,
    _correlation_idx:1)' is still valid. must call stop() before destroying

Fixed with RAII ss::defer helper.

Signed-off-by: Noah Watkins <noah@redpanda.com>
---
 src/v/rpc/test/rpc_gen_cycling_test.cc | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/v/rpc/test/rpc_gen_cycling_test.cc b/src/v/rpc/test/rpc_gen_cycling_test.cc
index 0b46c3391a83b..50a8c66446d7a 100644
--- a/src/v/rpc/test/rpc_gen_cycling_test.cc
+++ b/src/v/rpc/test/rpc_gen_cycling_test.cc
@@ -396,6 +396,7 @@ FIXTURE_TEST(missing_method_test, rpc_integration_fixture) {
 
     rpc::transport t(client_config());
     t.connect(model::no_timeout).get();
+    auto stop = ss::defer([&t] { t.stop().get(); });
     auto client = echo::echo_client_protocol(t);
 
     const auto check_missing = [&] {
@@ -441,8 +442,6 @@ FIXTURE_TEST(missing_method_test, rpc_integration_fixture) {
     }
 
     ss::when_all_succeed(requests.begin(), requests.end()).get();
-
-    t.stop().get();
 }
 
 FIXTURE_TEST(corrupted_header_at_client_test, rpc_integration_fixture) {
@@ -538,6 +537,7 @@ FIXTURE_TEST(version_not_supported, rpc_integration_fixture) {
 
     rpc::transport t(client_config());
     t.connect(model::no_timeout).get();
+    auto stop = ss::defer([&t] { t.stop().get(); });
     auto client = echo::echo_client_protocol(t);
 
     const auto check_unsupported = [&] {
@@ -592,8 +592,6 @@ FIXTURE_TEST(version_not_supported, rpc_integration_fixture) {
     }
 
     ss::when_all_succeed(requests.begin(), requests.end()).get();
-
-    t.stop().get();
 }
 
 class erroneous_protocol_exception : public std::exception {};

From abc348ea3baeacccf052511c0ff20709a7b29b5d Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@redpanda.com>
Date: Thu, 26 May 2022 17:09:18 -0700
Subject: [PATCH 183/201] rpc: use max value for unsupported version

Using a maximum value rather than something like max_supported + 1 has
the same semantic meaning when used. However the problem with
max_supported + 1 is that it makes it harder to create a patch series
that incrementally adds complexity. The reason is that it is convenient
to add the names of the new versions before the patch series fully
supports the new version. The result is an intermediate state in the
patch series in which max_supported + 1 has the same value as another
name, causing issues for existing switch statements as an example
complication.

Signed-off-by: Noah Watkins <noah@redpanda.com>
---
 src/v/rpc/types.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/v/rpc/types.h b/src/v/rpc/types.h
index de447992f5e1c..b36de6daa7f0e 100644
--- a/src/v/rpc/types.h
+++ b/src/v/rpc/types.h
@@ -35,6 +35,7 @@
 #include <chrono>
 #include <cstdint>
 #include <iosfwd>
+#include <limits>
 #include <type_traits>
 #include <vector>
 
@@ -79,7 +80,7 @@ enum class transport_version : uint8_t {
      * unsupported is a convenience name used in tests to construct a message
      * with an unsupported version. the bits should not be considered reserved.
      */
-    unsupported = max_supported + 1,
+    unsupported = std::numeric_limits<uint8_t>::max()
 };
 
 /// \brief core struct for communications. sent with _each_ payload

From 6fae88473fcaaca989c2bdc376cf96aa6bdf515f Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@redpanda.com>
Date: Thu, 26 May 2022 17:56:24 -0700
Subject: [PATCH 184/201] rpc: print transport version as raw numeric value

Looking at logs seeing references to transport_version::unsupported is
not intuitive. Instead print the raw value. This has the side effect
that it makes it easier to print specific context when the version is
actually unsupported but doesn't map to a specific enum name.

Signed-off-by: Noah Watkins <noah@redpanda.com>
---
 src/v/rpc/types.cc | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/v/rpc/types.cc b/src/v/rpc/types.cc
index 503a0d5aa0a7c..d048de021b512 100644
--- a/src/v/rpc/types.cc
+++ b/src/v/rpc/types.cc
@@ -67,12 +67,11 @@ std::ostream& operator<<(std::ostream& o, const status& s) {
 }
 
 std::ostream& operator<<(std::ostream& o, transport_version v) {
-    switch (v) {
-    case transport_version::v0:
-        return o << "rpc::transport_version::v0";
-    case transport_version::unsupported:
-        return o << "rpc::transport_version::unsupported";
-    }
+    fmt::print(
+      o,
+      "rpc::transport_version::v{}",
+      static_cast<std::underlying_type_t<transport_version>>(v));
+    return o;
 }
 
 } // namespace rpc

From 7f2e7911fc91737cbb191bf850ddb8ad59fe8754 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@redpanda.com>
Date: Thu, 26 May 2022 16:34:45 -0700
Subject: [PATCH 185/201] rpc: introduce new transport versions

v0: the og version used by rpc simple protocol. at this version level clients
and servers (1) assume adl encoding, (2) ignore the version when handling a
request, and (3) always respond with version 0.

v1,2: starting with version v1, clients and servers no longer ignore the
version. v1 indicates adl encoding and v2 indicates serde encoding.

These new version enums will not become active (and max_supported updated)
until more of the infrastructure for dealing with versioning is introduced in
subsequent patches.

Signed-off-by: Noah Watkins <noah@redpanda.com>
---
 src/v/rpc/types.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/v/rpc/types.h b/src/v/rpc/types.h
index b36de6daa7f0e..669718ba553c4 100644
--- a/src/v/rpc/types.h
+++ b/src/v/rpc/types.h
@@ -73,7 +73,20 @@ enum class status : uint32_t {
 };
 
 enum class transport_version : uint8_t {
+    /*
+     * the first version used by rpc simple protocol. at this version level
+     * clients and servers (1) assume adl encoding, (2) ignore the version when
+     * handling a request, and (3) always respond with version 0.
+     */
     v0 = 0,
+
+    /*
+     * starting with version v1 clients and servers no longer ignore the
+     * version. v1 indicates adl encoding and v2 indicates serde encoding.
+     */
+    v1 = 1,
+    v2 = 2,
+
     max_supported = v0,
 
     /*

From 2d5757a166a4b62a9973d249eeb2ebbffde38287 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@redpanda.com>
Date: Thu, 26 May 2022 16:04:57 -0700
Subject: [PATCH 186/201] rpc: use variable client transport version

Adds a version field to rpc client transport and uses this version when
sending a message to the server. The new version tracked is initialized
to v0 and never changed in this patch, preserving the existing behavior.

The client transport version will be used to dynamically upgrade a
client from adl to serde encoding through client/server negotiation.

Signed-off-by: Noah Watkins <noah@redpanda.com>
---
 src/v/rpc/transport.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/v/rpc/transport.h b/src/v/rpc/transport.h
index 7a5304ee19d97..8f7ae9b6fbff4 100644
--- a/src/v/rpc/transport.h
+++ b/src/v/rpc/transport.h
@@ -102,6 +102,14 @@ class transport final : public net::base_transport {
     requests_queue_t _requests_queue;
     sequence_t _seq;
     sequence_t _last_seq;
+
+    /*
+     * version level used when dispatching requests. this value may change
+     * during the lifetime of the transport. for example the version may be
+     * upgraded if it is discovered that a server supports a newer version.
+     */
+    transport_version _version{transport_version::v0};
+
     friend std::ostream& operator<<(std::ostream&, const transport&);
 };
 
@@ -165,7 +173,7 @@ inline ss::future<result<client_context<Output>>>
 transport::send_typed(Input r, uint32_t method_id, rpc::client_opts opts) {
     using ret_t = result<client_context<Output>>;
     return send_typed_versioned<Input, Output>(
-             std::move(r), method_id, std::move(opts), transport_version::v0)
+             std::move(r), method_id, std::move(opts), _version)
       .then([](result<result_context<Output>> res) {
           if (!res) {
               return ss::make_ready_future<ret_t>(res.error());

From 9814025632b1e4b87aa36537ad646ed72202aebc Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@redpanda.com>
Date: Thu, 26 May 2022 16:17:07 -0700
Subject: [PATCH 187/201] rpc: add type trait to exempt types from adl/serde
 encoding

the transition from adl to serde encoding in rpc requires a period of time
where both encodings are supported for all message types. however, we do not
want to extend this requirement to brand new messages / services, nor to rpc
types used in coproc which will remain in legacy adl format for now.

we use the type system to enforce these rules and allow types to be opted out
on a case-by-case basis from adl (new messages) or serde (legacy like coproc).

the `rpc_adl_exempt` and `rpc_serde_exempt` type trait helpers can be used to
opt a type T out of adl or serde support. a type is marked exempt by
defining the type `T::rpc_(adl|serde)_exempt`. the typedef may be defined as
any type such as std::void_t<> or std::true_type.

Example:

    struct exempt_msg {
        using rpc_adl_exempt = std::true_type;
        ...
    };

then use the `is_rpc_adl_exempt` or `is_rpc_serde_exempt` concept to test.

Signed-off-by: Noah Watkins <noah@redpanda.com>
---
 src/v/rpc/parse_utils.h | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/src/v/rpc/parse_utils.h b/src/v/rpc/parse_utils.h
index 7bd0b8c7dcef9..0af66040b2c92 100644
--- a/src/v/rpc/parse_utils.h
+++ b/src/v/rpc/parse_utils.h
@@ -82,6 +82,39 @@ inline void validate_payload_and_header(const iobuf& io, const header& h) {
     }
 }
 
+/*
+ * the transition from adl to serde encoding in rpc requires a period of time
+ * where both encodings are supported for all message types. however, we do not
+ * want to extend this requirement to brand new messages / services, nor to rpc
+ * types used in coproc which will remain in legacy adl format for now.
+ *
+ * we use the type system to enforce these rules and allow types to be opted out
+ * on a case-by-case basis from adl (new messages) or serde (legacy like coproc).
+ *
+ * the `rpc_adl_exempt` and `rpc_serde_exempt` type trait helpers can be used to
+ * opt a type T out of adl or serde support. a type is marked exempt by
+ * defining the type `T::rpc_(adl|serde)_exempt`. the typedef may be defined as
+ * any type such as std::void_t<> or std::true_type.
+ *
+ * Example:
+ *
+ *     struct exempt_msg {
+ *         using rpc_adl_exempt = std::true_type;
+ *         ...
+ *     };
+ *
+ * then use the `is_rpc_adl_exempt` or `is_rpc_serde_exempt` concept to test.
+ */
+template<typename T>
+concept is_rpc_adl_exempt = requires {
+    typename T::rpc_adl_exempt;
+};
+
+template<typename T>
+concept is_rpc_serde_exempt = requires {
+    typename T::rpc_serde_exempt;
+};
+
 template<typename T>
 ss::future<T> parse_type(ss::input_stream<char>& in, const header& h) {
     return read_iobuf_exactly(in, h.payload_size).then([h](iobuf io) {

From a1d5504df107313f3a0bc1e356d6b00fa11658ff Mon Sep 17 00:00:00 2001
From: Noah Watkins <noahwatkins@gmail.com>
Date: Sat, 16 Jul 2022 13:03:36 -0700
Subject: [PATCH 188/201] rpc: add type annotations to rpc exempt messages

- coproc messages continue to use adl
- several tests can continue to use adl

Signed-off-by: Noah Watkins <noahwatkins@gmail.com>
---
 src/v/coproc/types.h           | 6 ++++++
 src/v/rpc/test/rpc_gen_types.h | 9 +++++++++
 src/v/rpc/test/test_types.h    | 1 +
 3 files changed, 16 insertions(+)

diff --git a/src/v/coproc/types.h b/src/v/coproc/types.h
index 3951d7a2b4e9b..6bf8a426d8f00 100644
--- a/src/v/coproc/types.h
+++ b/src/v/coproc/types.h
@@ -47,6 +47,7 @@ enum class topic_ingestion_policy : int8_t { earliest = 0, stored, latest };
 
 /// \brief type to use for registration/deregistration of a topic
 struct enable_copros_request {
+    using rpc_serde_exempt = std::true_type;
     struct data {
         script_id id;
         iobuf source_code;
@@ -57,6 +58,7 @@ struct enable_copros_request {
 /// \brief registration acks per copro, responses are organized in the
 /// same order as the list of topics in the 'topics' array
 struct enable_copros_reply {
+    using rpc_serde_exempt = std::true_type;
     using topic_policy = std::pair<model::topic, topic_ingestion_policy>;
     struct script_metadata {
         script_id id;
@@ -76,12 +78,14 @@ using state_size_t = named_type<int64_t, struct state_size_tag>;
 /// \brief deregistration request, remove all topics registered to a coprocessor
 /// with id 'script_id'.
 struct disable_copros_request {
+    using rpc_serde_exempt = std::true_type;
     std::vector<script_id> ids;
 };
 
 /// \brief deregistration acks per topic, responses are organized in the
 /// same order as the list of topics in the 'ids' array
 struct disable_copros_reply {
+    using rpc_serde_exempt = std::true_type;
     using ack = std::pair<script_id, disable_response_code>;
     std::vector<ack> acks;
 };
@@ -89,6 +93,7 @@ struct disable_copros_reply {
 /// \brief Request that co-processors with the given script ids, process batches
 /// from the reader whose source topic is the given ntp
 struct process_batch_request {
+    using rpc_serde_exempt = std::true_type;
     struct data {
         std::vector<script_id> ids;
         model::ntp ntp;
@@ -100,6 +105,7 @@ struct process_batch_request {
 /// \brief Response from the above request, acks from script ids that have
 /// processed the record and produce new batches on a new materialized ntp
 struct process_batch_reply {
+    using rpc_serde_exempt = std::true_type;
     struct data {
         script_id id;
         model::ntp source;
diff --git a/src/v/rpc/test/rpc_gen_types.h b/src/v/rpc/test/rpc_gen_types.h
index 7527d8e9c2f87..cfc8cab4f0dc8 100644
--- a/src/v/rpc/test/rpc_gen_types.h
+++ b/src/v/rpc/test/rpc_gen_types.h
@@ -11,6 +11,7 @@
 
 #pragma once
 
+#include "rpc/parse_utils.h"
 #include "seastarx.h"
 
 #include <seastar/core/sstring.hh>
@@ -19,33 +20,41 @@
 
 namespace cycling {
 struct ultimate_cf_slx {
+    using rpc_serde_exempt = std::true_type;
     int x = 42;
 };
 struct nairo_quintana {
+    using rpc_serde_exempt = std::true_type;
     int x = 43;
 };
 struct san_francisco {
+    using rpc_serde_exempt = std::true_type;
     int x = 44;
 };
 struct mount_tamalpais {
+    using rpc_serde_exempt = std::true_type;
     int x = 45;
 };
 } // namespace cycling
 
 namespace echo {
 struct echo_req {
+    using rpc_serde_exempt = std::true_type;
     ss::sstring str;
 };
 
 struct echo_resp {
+    using rpc_serde_exempt = std::true_type;
     ss::sstring str;
 };
 
 struct cnt_req {
+    using rpc_serde_exempt = std::true_type;
     uint64_t expected;
 };
 
 struct cnt_resp {
+    using rpc_serde_exempt = std::true_type;
     uint64_t expected;
     uint64_t current;
 };
diff --git a/src/v/rpc/test/test_types.h b/src/v/rpc/test/test_types.h
index a6e08af0ac290..2c4169bd03296 100644
--- a/src/v/rpc/test/test_types.h
+++ b/src/v/rpc/test/test_types.h
@@ -21,6 +21,7 @@
 #include <vector>
 
 struct pod {
+    using rpc_serde_exempt = std::true_type;
     int16_t x = 1;
     int32_t y = 2;
     int64_t z = 3;

From dc8123be64451f1a35994d0ecd0537b7de3d7605 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@redpanda.com>
Date: Thu, 26 May 2022 19:43:06 -0700
Subject: [PATCH 189/201] rpc: refactor rpc::parse_result

The refactor makes subsequent changes more readable by reducing
indentation for most of the new code being added.

The next patch will adjust formatting.

Signed-off-by: Noah Watkins <noah@redpanda.com>
---
 src/v/rpc/transport.h | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/v/rpc/transport.h b/src/v/rpc/transport.h
index 8f7ae9b6fbff4..45940f46f656e 100644
--- a/src/v/rpc/transport.h
+++ b/src/v/rpc/transport.h
@@ -136,9 +136,17 @@ ss::future<result<rpc::client_context<T>>> parse_result(
     using ret_t = result<rpc::client_context<T>>;
     // check status first
     auto st = static_cast<status>(sctx->get_header().meta);
+    if (st != status::success) {
+    /**
+     * signal that request body is parsed since it is empty when status
+     * indicates server error.
+     */
+    sctx->signal_body_parse();
+
+    return ss::make_ready_future<ret_t>(map_server_error(st));
+    }
 
     // success case
-    if (st == status::success) {
         return parse_type<T>(in, sctx->get_header())
           .then_wrapped([sctx = std::move(sctx)](ss::future<T> data_fut) {
               if (data_fut.failed()) {
@@ -155,15 +163,6 @@ ss::future<result<rpc::client_context<T>>> parse_result(
               return ret_t(rpc::client_context<T>(
                 sctx->get_header(), std::move(data_fut.get())));
           });
-    }
-
-    /**
-     * signal that request body is parsed since it is empty when status
-     * indicates server error.
-     */
-    sctx->signal_body_parse();
-
-    return ss::make_ready_future<ret_t>(map_server_error(st));
 }
 
 } // namespace internal

From 73f8c7e24da6e1ef86099b03dfed0cd8adfdc152 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@redpanda.com>
Date: Thu, 26 May 2022 19:43:39 -0700
Subject: [PATCH 190/201] chore: apply clang-format

Signed-off-by: Noah Watkins <noah@redpanda.com>
---
 src/v/rpc/transport.h | 44 +++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/v/rpc/transport.h b/src/v/rpc/transport.h
index 45940f46f656e..2b8d2220c00b5 100644
--- a/src/v/rpc/transport.h
+++ b/src/v/rpc/transport.h
@@ -137,32 +137,32 @@ ss::future<result<rpc::client_context<T>>> parse_result(
     // check status first
     auto st = static_cast<status>(sctx->get_header().meta);
     if (st != status::success) {
-    /**
-     * signal that request body is parsed since it is empty when status
-     * indicates server error.
-     */
-    sctx->signal_body_parse();
+        /**
+         * signal that request body is parsed since it is empty when status
+         * indicates server error.
+         */
+        sctx->signal_body_parse();
 
-    return ss::make_ready_future<ret_t>(map_server_error(st));
+        return ss::make_ready_future<ret_t>(map_server_error(st));
     }
 
     // success case
-        return parse_type<T>(in, sctx->get_header())
-          .then_wrapped([sctx = std::move(sctx)](ss::future<T> data_fut) {
-              if (data_fut.failed()) {
-                  const auto ex = data_fut.get_exception();
-                  sctx->body_parse_exception(ex);
-                  /**
-                   * we want to throw an exception when body parsing failed.
-                   * this will invalidate the connection since it may not be
-                   * valid any more.
-                   */
-                  std::rethrow_exception(ex);
-              }
-              sctx->signal_body_parse();
-              return ret_t(rpc::client_context<T>(
-                sctx->get_header(), std::move(data_fut.get())));
-          });
+    return parse_type<T>(in, sctx->get_header())
+      .then_wrapped([sctx = std::move(sctx)](ss::future<T> data_fut) {
+          if (data_fut.failed()) {
+              const auto ex = data_fut.get_exception();
+              sctx->body_parse_exception(ex);
+              /**
+               * we want to throw an exception when body parsing failed.
+               * this will invalidate the connection since it may not be
+               * valid any more.
+               */
+              std::rethrow_exception(ex);
+          }
+          sctx->signal_body_parse();
+          return ret_t(rpc::client_context<T>(
+            sctx->get_header(), std::move(data_fut.get())));
+      });
 }
 
 } // namespace internal

From b2ce580ef30abe0e7d4b07ee8b935569faa00763 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noahwatkins@gmail.com>
Date: Sat, 16 Jul 2022 13:13:39 -0700
Subject: [PATCH 191/201] rpc: transport request adaptive encoding selection

When a message is sent the active transport version determines if it is
sent with adl (v0,1) or serde encoding (v2). Serde-only messages require
a version >= v2. Messages with adl-only support are always sent at v0.
Support for adl-only messages is a temporary solution to allow changes
to the RPC transport and servers to be worked on prior to all types
being supported by serde. Once all types have serde support the
allowance at the type-system level should be removed.  Sending a message
does not upgrade the transport. That is done when handling the reply.

Signed-off-by: Noah Watkins <noahwatkins@gmail.com>
---
 src/v/rpc/parse_utils.h | 41 +++++++++++++++++++++++++++++++++++++++++
 src/v/rpc/transport.h   | 19 +++++++++++++++----
 2 files changed, 56 insertions(+), 4 deletions(-)

diff --git a/src/v/rpc/parse_utils.h b/src/v/rpc/parse_utils.h
index 0af66040b2c92..a82d97ba0c3ec 100644
--- a/src/v/rpc/parse_utils.h
+++ b/src/v/rpc/parse_utils.h
@@ -115,6 +115,47 @@ concept is_rpc_serde_exempt = requires {
     typename T::rpc_serde_exempt;
 };
 
+/*
+ * Encode a client request for the given transport version.
+ *
+ * Unless the message type T is explicitly exempt from adl<> support, type T
+ * must be supported by both adl<> and serde encoding frameworks. When the type
+ * is not exempt from adl<> support, serde is used when the version >= v2.
+ *
+ * The returned version indicates what level of encoding is used. This is always
+ * equal to the input version, except for serde-only messages which return v2.
+ * Callers are expected to further validate the runtime implications of this.
+ */
+template<typename T>
+ss::future<transport_version>
+encode_for_version(iobuf& out, T msg, transport_version version) {
+    static_assert(!is_rpc_adl_exempt<T> || !is_rpc_serde_exempt<T>);
+
+    if constexpr (is_rpc_serde_exempt<T>) {
+        return reflection::async_adl<T>{}.to(out, std::move(msg)).then([] {
+            return transport_version::v0;
+        });
+    } else if constexpr (is_rpc_adl_exempt<T>) {
+        return ss::do_with(std::move(msg), [&out](T& msg) {
+            return serde::write_async(out, std::move(msg)).then([] {
+                return transport_version::v2;
+            });
+        });
+    } else {
+        if (version < transport_version::v2) {
+            return reflection::async_adl<T>{}
+              .to(out, std::move(msg))
+              .then([version] { return version; });
+        } else {
+            return ss::do_with(std::move(msg), [&out, version](T& msg) {
+                return serde::write_async(out, std::move(msg)).then([version] {
+                    return version;
+                });
+            });
+        }
+    }
+}
+
 template<typename T>
 ss::future<T> parse_type(ss::input_stream<char>& in, const header& h) {
     return read_iobuf_exactly(in, h.payload_size).then([h](iobuf io) {
diff --git a/src/v/rpc/transport.h b/src/v/rpc/transport.h
index 2b8d2220c00b5..b4cbfc9892900 100644
--- a/src/v/rpc/transport.h
+++ b/src/v/rpc/transport.h
@@ -196,13 +196,24 @@ transport::send_typed_versioned(
     b->set_min_compression_bytes(opts.min_compression_bytes);
     auto raw_b = b.get();
     raw_b->set_service_method_id(method_id);
-    raw_b->set_version(version);
 
     auto& target_buffer = raw_b->buffer();
     auto seq = ++_seq;
-    return reflection::async_adl<Input>{}
-      .to(target_buffer, std::move(r))
-      .then([this, b = std::move(b), seq, opts = std::move(opts)]() mutable {
+    return encode_for_version(target_buffer, std::move(r), version)
+      .then([this, version, b = std::move(b), seq, opts = std::move(opts)](
+              transport_version effective_version) mutable {
+          /*
+           * enforce the rule that a transport configured as v0 behaves like
+           * a v0 client transport and sends v0 messages.
+           */
+          vassert(
+            version != transport_version::v0
+              || effective_version == transport_version::v0,
+            "Request type {} cannot be encoded at version {} (effective {}).",
+            typeid(Input).name(),
+            version,
+            effective_version);
+          b->set_version(effective_version);
           return do_send(seq, std::move(*b.get()), std::move(opts));
       })
       .then([this](result<std::unique_ptr<streaming_context>> sctx) mutable {

From 1a61a7e9873f4ecaeeaeea24d5098b18ef9fad01 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@redpanda.com>
Date: Fri, 27 May 2022 11:11:42 -0700
Subject: [PATCH 192/201] rpc: server handling of adl and serde encodings

This commit adds version awareness to the server. The RPC server will
now decode v0,1 messages with adl<> and v2 messages with serde. It
returns responses with the same version and encoding as the request.

Signed-off-by: Noah Watkins <noah@redpanda.com>
---
 src/v/rpc/parse_utils.h | 37 +++++++++++++++++++++++++++++++++++--
 src/v/rpc/service.h     | 23 +++++++++++++++++++----
 2 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/src/v/rpc/parse_utils.h b/src/v/rpc/parse_utils.h
index a82d97ba0c3ec..8ecb81bcde91d 100644
--- a/src/v/rpc/parse_utils.h
+++ b/src/v/rpc/parse_utils.h
@@ -156,6 +156,39 @@ encode_for_version(iobuf& out, T msg, transport_version version) {
     }
 }
 
+/*
+ * Decode a client request at the given transport version.
+ */
+template<typename T>
+ss::future<T>
+decode_for_version(iobuf_parser& parser, transport_version version) {
+    static_assert(!is_rpc_adl_exempt<T> || !is_rpc_serde_exempt<T>);
+
+    if constexpr (is_rpc_serde_exempt<T>) {
+        if (version != transport_version::v0) {
+            return ss::make_exception_future<T>(std::runtime_error(fmt::format(
+              "Unexpected adl-only message {} at {} != v0",
+              typeid(T).name(),
+              version)));
+        }
+        return reflection::async_adl<T>{}.from(parser);
+    } else if constexpr (is_rpc_adl_exempt<T>) {
+        if (version < transport_version::v2) {
+            return ss::make_exception_future<T>(std::runtime_error(fmt::format(
+              "Unexpected serde-only message {} at {} < v2",
+              typeid(T).name(),
+              version)));
+        }
+        return serde::read_async<T>(parser);
+    } else {
+        if (version < transport_version::v2) {
+            return reflection::async_adl<T>{}.from(parser);
+        } else {
+            return serde::read_async<T>(parser);
+        }
+    }
+}
+
 template<typename T>
 ss::future<T> parse_type(ss::input_stream<char>& in, const header& h) {
     return read_iobuf_exactly(in, h.payload_size).then([h](iobuf io) {
@@ -178,8 +211,8 @@ ss::future<T> parse_type(ss::input_stream<char>& in, const header& h) {
 
         auto p = std::make_unique<iobuf_parser>(std::move(io));
         auto raw = p.get();
-        return reflection::async_adl<T>{}.from(*raw).finally(
-          [p = std::move(p)] {});
+        return decode_for_version<T>(*raw, h.version)
+          .finally([p = std::move(p)] {});
     });
 }
 
diff --git a/src/v/rpc/service.h b/src/v/rpc/service.h
index f61bbb3a922b8..567df19f79456 100644
--- a/src/v/rpc/service.h
+++ b/src/v/rpc/service.h
@@ -74,13 +74,28 @@ struct service::execution_helper {
                     auto input = input_f.get0();
                     return f(std::move(input), ctx);
                 })
-                .then([method_id](Output out) mutable {
+                .then([method_id, &ctx](Output out) mutable {
+                    const auto version = ctx.get_header().version;
                     auto b = std::make_unique<netbuf>();
                     auto raw_b = b.get();
                     raw_b->set_service_method_id(method_id);
-                    return reflection::async_adl<Output>{}
-                      .to(raw_b->buffer(), std::move(out))
-                      .then([b = std::move(b)] { return std::move(*b); });
+                    raw_b->set_version(version);
+                    return encode_for_version(
+                             raw_b->buffer(), std::move(out), version)
+                      .then([version, b = std::move(b)](
+                              transport_version effective_version) {
+                          /*
+                           * this assertion is safe because the conditions under
+                           * which this assertion would fail should have been
+                           * verified in parse_type above.
+                           */
+                          vassert(
+                            effective_version == version,
+                            "Unexpected encoding at effective {} != {}",
+                            effective_version,
+                            version);
+                          return std::move(*b);
+                      });
                 });
           });
     }

From 3685ad74aff5fed72bf58adcd54011d249aa52b0 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noahwatkins@gmail.com>
Date: Sat, 16 Jul 2022 13:04:39 -0700
Subject: [PATCH 193/201] rpc: add new service methods for testing

Adds three new test service methods:

  - echo_adl_only
  - echo_serde_only
  - echo_adl_serde

Each of these methods has normal echo behavior, but uses separate types
for each method specific to adl and serde support. For example,
echo_adl_serde uses types that are both adl and serde encodable.

Furthermore, each encoding/decoding procedure modifies the content of
the message being echoed with an indication of the operation performed.
For example, encoding a message with adl will result in a suffix:

    <orig>_to_aao

where in _to_[a][ao] the first [a] indicates adl encoding and the [ao]
indicates an adl-only type. _to and _from are encoding and decoding,
respectively. So a round-trip RPC echo might result in a final result
arriving at the client with the string:

    <orig>_to_aao_from_aao_to_aao_from_aao

This simple mechanism allows writing tests that verify low-level
behavior regarding how different types are handled without resorting to
having to track and export stats.

Signed-off-by: Noah Watkins <noahwatkins@gmail.com>
---
 src/v/rpc/test/echo_service.json       |  15 +++
 src/v/rpc/test/rpc_gen_cycling_test.cc |  45 ++++++-
 src/v/rpc/test/rpc_gen_types.h         | 173 +++++++++++++++++++++++++
 3 files changed, 227 insertions(+), 6 deletions(-)

diff --git a/src/v/rpc/test/echo_service.json b/src/v/rpc/test/echo_service.json
index ec1b0fea1e5bb..64813b3021fc9 100644
--- a/src/v/rpc/test/echo_service.json
+++ b/src/v/rpc/test/echo_service.json
@@ -34,6 +34,21 @@
             "name": "throw_exception",
             "input_type": "throw_req",
             "output_type": "throw_resp"
+        },
+        {
+            "name": "echo_adl_only",
+            "input_type": "echo_req_adl_only",
+            "output_type": "echo_resp_adl_only"
+        },
+        {
+            "name": "echo_adl_serde",
+            "input_type": "echo_req_adl_serde",
+            "output_type": "echo_resp_adl_serde"
+        },
+        {
+            "name": "echo_serde_only",
+            "input_type": "echo_req_serde_only",
+            "output_type": "echo_resp_serde_only"
         }
     ]
 }
diff --git a/src/v/rpc/test/rpc_gen_cycling_test.cc b/src/v/rpc/test/rpc_gen_cycling_test.cc
index 50a8c66446d7a..72af506dc7aac 100644
--- a/src/v/rpc/test/rpc_gen_cycling_test.cc
+++ b/src/v/rpc/test/rpc_gen_cycling_test.cc
@@ -95,6 +95,24 @@ struct echo_impl final : echo::echo_service {
         }
     }
 
+    ss::future<echo::echo_resp_adl_only> echo_adl_only(
+      echo::echo_req_adl_only&& req, rpc::streaming_context&) final {
+        return ss::make_ready_future<echo::echo_resp_adl_only>(
+          echo::echo_resp_adl_only{.str = req.str});
+    }
+
+    ss::future<echo::echo_resp_adl_serde> echo_adl_serde(
+      echo::echo_req_adl_serde&& req, rpc::streaming_context&) final {
+        return ss::make_ready_future<echo::echo_resp_adl_serde>(
+          echo::echo_resp_adl_serde{.str = req.str});
+    }
+
+    ss::future<echo::echo_resp_serde_only> echo_serde_only(
+      echo::echo_req_serde_only&& req, rpc::streaming_context&) final {
+        return ss::make_ready_future<echo::echo_resp_serde_only>(
+          echo::echo_resp_serde_only{.str = req.str});
+    }
+
     uint64_t cnt = 0;
 };
 
@@ -530,6 +548,14 @@ FIXTURE_TEST(corrupted_data_at_server, rpc_integration_fixture) {
     }
 }
 
+/*
+ * the version_not_supported test uses the echo_adl_serde variant rather than
+ * the original version whose types cause it to be treated as adl-only. Because
+ * adl-only messages are sent at v0 and the test specifically requires sending
+ * messages at an arbitrarily higher value to trigger the error, a type was
+ * needed that supports a dynamic version range. When encoding adl/serde
+ * supported types the version is passed through.
+ */
 FIXTURE_TEST(version_not_supported, rpc_integration_fixture) {
     configure_server();
     register_services();
@@ -541,9 +567,11 @@ FIXTURE_TEST(version_not_supported, rpc_integration_fixture) {
     auto client = echo::echo_client_protocol(t);
 
     const auto check_unsupported = [&] {
-        auto f = t.send_typed_versioned<echo::echo_req, echo::echo_resp>(
-          echo::echo_req{.str = "testing..."},
-          960598415,
+        auto f = t.send_typed_versioned<
+          echo::echo_req_adl_serde,
+          echo::echo_resp_adl_serde>(
+          echo::echo_req_adl_serde{.str = "testing..."},
+          echo::echo_service::echo_adl_serde_method_id,
           rpc::client_opts(rpc::no_timeout),
           rpc::transport_version::unsupported);
         return f.then([&](auto ret) {
@@ -561,12 +589,17 @@ FIXTURE_TEST(version_not_supported, rpc_integration_fixture) {
     };
 
     const auto check_supported = [&] {
-        auto f = client.echo(
-          echo::echo_req{.str = "testing..."},
+        auto f = client.echo_adl_serde(
+          echo::echo_req_adl_serde{.str = "testing..."},
           rpc::client_opts(rpc::no_timeout));
         return f.then([&](auto ret) {
             BOOST_REQUIRE(ret.has_value());
-            BOOST_REQUIRE_EQUAL(ret.value().data.str, "testing...");
+            // could be either one. depends on timing of transport upgrade
+            BOOST_REQUIRE(
+              ret.value().data.str
+                == "testing..._to_aas_from_aas_to_aas_from_aas"
+              || ret.value().data.str
+                   == "testing..._to_sas_from_sas_to_sas_from_sas");
         });
     };
 
diff --git a/src/v/rpc/test/rpc_gen_types.h b/src/v/rpc/test/rpc_gen_types.h
index cfc8cab4f0dc8..bdd61b46315b5 100644
--- a/src/v/rpc/test/rpc_gen_types.h
+++ b/src/v/rpc/test/rpc_gen_types.h
@@ -11,8 +11,11 @@
 
 #pragma once
 
+#include "reflection/adl.h"
 #include "rpc/parse_utils.h"
 #include "seastarx.h"
+#include "serde/envelope.h"
+#include "serde/serde.h"
 
 #include <seastar/core/sstring.hh>
 
@@ -64,7 +67,177 @@ enum class failure_type { throw_exception, exceptional_future, none };
 using throw_req = failure_type;
 
 struct throw_resp {
+    using rpc_serde_exempt = std::true_type;
     ss::sstring reply;
 };
 
+/*
+ * echo methods with req/resp that support encodings:
+ * - adl only
+ * - serde only
+ * - serde and adl
+ */
+struct echo_req_adl_only {
+    using rpc_serde_exempt = std::true_type;
+    ss::sstring str;
+};
+
+struct echo_resp_adl_only {
+    using rpc_serde_exempt = std::true_type;
+    ss::sstring str;
+};
+
+// an adl-only type should not have serde support
+static_assert(!serde::is_serde_compatible_v<echo_req_adl_only>);
+static_assert(!serde::is_serde_compatible_v<echo_resp_adl_only>);
+
+// an adl-only type should not be exempt from adl support
+static_assert(!rpc::is_rpc_adl_exempt<echo_req_adl_only>);
+static_assert(!rpc::is_rpc_adl_exempt<echo_resp_adl_only>);
+
+struct echo_req_adl_serde
+  : serde::envelope<echo_req_adl_serde, serde::version<1>> {
+    ss::sstring str;
+
+    void serde_write(iobuf& out) const {
+        // serialize with serde an adl-serde type
+        using serde::write;
+        write(out, str + "_to_sas");
+    }
+
+    void serde_read(iobuf_parser& in, const serde::header& h) {
+        // deserialize with serde an adl-serde type
+        using serde::read_nested;
+        str = read_nested<ss::sstring>(in, h._bytes_left_limit);
+        str += "_from_sas";
+    }
+};
+
+struct echo_resp_adl_serde
+  : serde::envelope<echo_resp_adl_serde, serde::version<1>> {
+    ss::sstring str;
+
+    void serde_write(iobuf& out) const {
+        // serialize with serde an adl-serde type
+        using serde::write;
+        write(out, str + "_to_sas");
+    }
+
+    void serde_read(iobuf_parser& in, const serde::header& h) {
+        // deserialize with serde an adl-serde type
+        using serde::read_nested;
+        str = read_nested<ss::sstring>(in, h._bytes_left_limit);
+        str += "_from_sas";
+    }
+};
+
+static_assert(serde::is_serde_compatible_v<echo_req_adl_serde>);
+static_assert(serde::is_serde_compatible_v<echo_resp_adl_serde>);
+static_assert(!rpc::is_rpc_adl_exempt<echo_req_adl_serde>);
+static_assert(!rpc::is_rpc_adl_exempt<echo_resp_adl_serde>);
+
+struct echo_req_serde_only
+  : serde::envelope<echo_req_serde_only, serde::version<1>> {
+    using rpc_adl_exempt = std::true_type;
+    ss::sstring str;
+
+    void serde_write(iobuf& out) const {
+        // serialize with serde a serde-only type
+        using serde::write;
+        write(out, str + "_to_sso");
+    }
+
+    void serde_read(iobuf_parser& in, const serde::header& h) {
+        // deserialize with serde a serde-only type
+        using serde::read_nested;
+        str = read_nested<ss::sstring>(in, h._bytes_left_limit);
+        str += "_from_sso";
+    }
+};
+
+struct echo_resp_serde_only
+  : serde::envelope<echo_resp_serde_only, serde::version<1>> {
+    using rpc_adl_exempt = std::true_type;
+    ss::sstring str;
+
+    void serde_write(iobuf& out) const {
+        // serialize with serde a serde-only type
+        using serde::write;
+        write(out, str + "_to_sso");
+    }
+
+    void serde_read(iobuf_parser& in, const serde::header& h) {
+        // deserialize with serde a serde-only type
+        using serde::read_nested;
+        str = read_nested<ss::sstring>(in, h._bytes_left_limit);
+        str += "_from_sso";
+    }
+};
+
+// serde-only type needs to have serde support
+static_assert(serde::is_serde_compatible_v<echo_req_serde_only>);
+static_assert(serde::is_serde_compatible_v<echo_resp_serde_only>);
+
+// serde-only type needs to be exempt from adl
+static_assert(rpc::is_rpc_adl_exempt<echo_req_serde_only>);
+static_assert(rpc::is_rpc_adl_exempt<echo_resp_serde_only>);
+
 } // namespace echo
+
+namespace reflection {
+template<>
+struct adl<echo::echo_req_adl_only> {
+    void to(iobuf& out, echo::echo_req_adl_only&& r) {
+        // serialize with adl an adl-only type
+        reflection::serialize(out, r.str + "_to_aao");
+    }
+    echo::echo_req_adl_only from(iobuf_parser& in) {
+        // deserialize with adl an adl-only type
+        return echo::echo_req_adl_only{
+          .str = adl<ss::sstring>{}.from(in) + "_from_aao",
+        };
+    }
+};
+
+template<>
+struct adl<echo::echo_resp_adl_only> {
+    void to(iobuf& out, echo::echo_resp_adl_only&& r) {
+        // serialize with adl an adl-only type
+        reflection::serialize(out, r.str + "_to_aao");
+    }
+    echo::echo_resp_adl_only from(iobuf_parser& in) {
+        // deserialize with adl an adl-only type
+        return echo::echo_resp_adl_only{
+          .str = adl<ss::sstring>{}.from(in) + "_from_aao",
+        };
+    }
+};
+
+template<>
+struct adl<echo::echo_req_adl_serde> {
+    void to(iobuf& out, echo::echo_req_adl_serde&& r) {
+        // serialize with adl an adl-serde type
+        reflection::serialize(out, r.str + "_to_aas");
+    }
+    echo::echo_req_adl_serde from(iobuf_parser& in) {
+        // deserialize with adl an adl-serde type
+        return echo::echo_req_adl_serde{
+          .str = adl<ss::sstring>{}.from(in) + "_from_aas",
+        };
+    }
+};
+
+template<>
+struct adl<echo::echo_resp_adl_serde> {
+    void to(iobuf& out, echo::echo_resp_adl_serde&& r) {
+        // serialize with adl an adl-serde type
+        reflection::serialize(out, r.str + "_to_aas");
+    }
+    echo::echo_resp_adl_serde from(iobuf_parser& in) {
+        // deserialize with adl an adl-serde type
+        return echo::echo_resp_adl_serde{
+          .str = adl<ss::sstring>{}.from(in) + "_from_aas",
+        };
+    }
+};
+} // namespace reflection

From 7f22161857d22053c93ca09653aea5ab55a5aa2d Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@redpanda.com>
Date: Thu, 26 May 2022 20:37:09 -0700
Subject: [PATCH 194/201] rpc: client transport handles version upgrade

Changes the client to upgrade its version when it finds that it is
connected to a peer that supports serde. Once the transport is upgraded
there may be a small window of time when the the transport version has
changed but inflight requests with a lower version are inflight.
Therefore it is important that message processing samples the version
and uses a consistent version throughout the lifetime of the messages,
rather than examining the active transport version on demand.

Signed-off-by: Noah Watkins <noah@redpanda.com>
---
 src/v/rpc/transport.h | 71 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 57 insertions(+), 14 deletions(-)

diff --git a/src/v/rpc/transport.h b/src/v/rpc/transport.h
index b4cbfc9892900..bd25f4fac460b 100644
--- a/src/v/rpc/transport.h
+++ b/src/v/rpc/transport.h
@@ -132,21 +132,47 @@ inline errc map_server_error(status status) {
 
 template<typename T>
 ss::future<result<rpc::client_context<T>>> parse_result(
-  ss::input_stream<char>& in, std::unique_ptr<streaming_context> sctx) {
+  ss::input_stream<char>& in,
+  std::unique_ptr<streaming_context> sctx,
+  transport_version req_ver) {
     using ret_t = result<rpc::client_context<T>>;
-    // check status first
-    auto st = static_cast<status>(sctx->get_header().meta);
-    if (st != status::success) {
-        /**
-         * signal that request body is parsed since it is empty when status
-         * indicates server error.
-         */
-        sctx->signal_body_parse();
 
+    const auto st = static_cast<status>(sctx->get_header().meta);
+    const auto rep_ver = sctx->get_header().version;
+
+    /*
+     * the reply version should always be the same as the request version,
+     * otherwise this is non-compliant behavior. the exception to this
+     * rule is a v0 reply to a v1 request (ie talking to old v0 server).
+     */
+    const auto protocol_violation
+      = rep_ver != req_ver
+        && (req_ver != transport_version::v1 || rep_ver != transport_version::v0);
+
+    if (unlikely(st != status::success || protocol_violation)) {
+        sctx->signal_body_parse();
+        if (st == status::version_not_supported) {
+            /*
+             * let version_not_supported take precedence over error handling for
+             * protocol violations because the protocol violation may be due to
+             * the unsupported version scenario.
+             */
+            return ss::make_ready_future<ret_t>(map_server_error(st));
+        }
+        if (protocol_violation) {
+            vlog(
+              rpclog.warn,
+              "Protocol violation: request version {} incompatible with "
+              "reply version {}",
+              req_ver,
+              rep_ver);
+        }
+        if (st == status::success) {
+            return ss::make_ready_future<ret_t>(errc::service_error);
+        }
         return ss::make_ready_future<ret_t>(map_server_error(st));
     }
 
-    // success case
     return parse_type<T>(in, sctx->get_header())
       .then_wrapped([sctx = std::move(sctx)](ss::future<T> data_fut) {
           if (data_fut.failed()) {
@@ -189,6 +215,7 @@ transport::send_typed_versioned(
   rpc::client_opts opts,
   transport_version version) {
     using ret_t = result<result_context<Output>>;
+    using ctx_t = result<std::unique_ptr<streaming_context>>;
     _probe.request();
 
     auto b = std::make_unique<rpc::netbuf>();
@@ -214,15 +241,31 @@ transport::send_typed_versioned(
             version,
             effective_version);
           b->set_version(effective_version);
-          return do_send(seq, std::move(*b.get()), std::move(opts));
+          return do_send(seq, std::move(*b.get()), std::move(opts))
+            .then([effective_version](ctx_t ctx) {
+                return std::make_tuple(std::move(ctx), effective_version);
+            });
       })
-      .then([this](result<std::unique_ptr<streaming_context>> sctx) mutable {
+      .then_unpack([this](ctx_t sctx, transport_version req_ver) {
           if (!sctx) {
               return ss::make_ready_future<ret_t>(sctx.error());
           }
           const auto version = sctx.value()->get_header().version;
-          return internal::parse_result<Output>(_in, std::move(sctx.value()))
-            .then([version](result<client_context<Output>> r) {
+          return internal::parse_result<Output>(
+                   _in, std::move(sctx.value()), req_ver)
+            .then([this, version](result<client_context<Output>> r) {
+                /*
+                 * upgrade transport to v2 when:
+                 * - at version v1 (do not upgrade from v0 -- for testing)
+                 * - the response was handled/contains no errors
+                 * - the response is v1,v2 (from a new server)
+                 */
+                if (
+                  _version == transport_version::v1 && r.has_value()
+                  && (version == transport_version::v1 || version == transport_version::v2)) {
+                    vlog(rpclog.debug, "Upgrading connection from v1 to v2");
+                    _version = transport_version::v2;
+                }
                 return ret_t(result_context<Output>{version, std::move(r)});
             });
       });

From 45d34c8afc27ed4090d8a8df90aad410f1049299 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@redpanda.com>
Date: Fri, 27 May 2022 16:05:01 -0700
Subject: [PATCH 195/201] rpc: support swappable service message handling

This commit factors out the message handling within the RPC server and
allows it to be swapped in through a template parameter. The type
default_message_codec provides the normal version-aware handling
introduced in this patch series.

Signed-off-by: Noah Watkins <noah@redpanda.com>
---
 src/v/rpc/parse_utils.h        | 39 ++++++++++++++++++++++++++++++++--
 src/v/rpc/service.h            | 11 +++++-----
 src/v/rpc/test/netbuf_tests.cc |  5 +++--
 src/v/rpc/transport.h          |  2 +-
 tools/rpcgen.py                | 23 ++++++++++++--------
 5 files changed, 61 insertions(+), 19 deletions(-)

diff --git a/src/v/rpc/parse_utils.h b/src/v/rpc/parse_utils.h
index 8ecb81bcde91d..c788d023d03e1 100644
--- a/src/v/rpc/parse_utils.h
+++ b/src/v/rpc/parse_utils.h
@@ -189,7 +189,42 @@ decode_for_version(iobuf_parser& parser, transport_version version) {
     }
 }
 
-template<typename T>
+/*
+ * type used to factor out version-specific functionality from request handling
+ * in services. this is used so that tests can specialize behavior.
+ *
+ * this is the default mixin that is used by the code generator.
+ */
+struct default_message_codec {
+    /*
+     * decodes a request (server) or response (client)
+     */
+    template<typename T>
+    static ss::future<T>
+    decode(iobuf_parser& parser, transport_version version) {
+        return decode_for_version<T>(parser, version);
+    }
+
+    /*
+     * Used by the server to determine which version to use when sending a
+     * response back to the client. The default behavior is to maintain the same
+     * version as the received request.
+     */
+    static transport_version response_version(const header& h) {
+        return h.version;
+    }
+
+    /*
+     * encodes a request (client) or response (server)
+     */
+    template<typename T>
+    static ss::future<transport_version>
+    encode(iobuf& out, T msg, transport_version version) {
+        return encode_for_version(out, std::move(msg), version);
+    }
+};
+
+template<typename T, typename Codec>
 ss::future<T> parse_type(ss::input_stream<char>& in, const header& h) {
     return read_iobuf_exactly(in, h.payload_size).then([h](iobuf io) {
         validate_payload_and_header(io, h);
@@ -211,7 +246,7 @@ ss::future<T> parse_type(ss::input_stream<char>& in, const header& h) {
 
         auto p = std::make_unique<iobuf_parser>(std::move(io));
         auto raw = p.get();
-        return decode_for_version<T>(*raw, h.version)
+        return Codec::template decode<T>(*raw, h.version)
           .finally([p = std::move(p)] {});
     });
 }
diff --git a/src/v/rpc/service.h b/src/v/rpc/service.h
index 567df19f79456..e1d4abd615cd9 100644
--- a/src/v/rpc/service.h
+++ b/src/v/rpc/service.h
@@ -26,7 +26,7 @@ namespace rpc {
 
 /// \brief most service implementations will be codegenerated
 struct service {
-    template<typename Input, typename Output>
+    template<typename Input, typename Output, typename Codec>
     struct execution_helper;
 
     service() = default;
@@ -50,7 +50,7 @@ class rpc_internal_body_parsing_exception : public std::exception {
     seastar::sstring _what;
 };
 
-template<typename Input, typename Output>
+template<typename Input, typename Output, typename Codec>
 struct service::execution_helper {
     using input = Input;
     using output = Output;
@@ -63,7 +63,7 @@ struct service::execution_helper {
       Func&& f) {
         return ctx.permanent_memory_reservation(ctx.get_header().payload_size)
           .then([f = std::forward<Func>(f), method_id, &in, &ctx]() mutable {
-              return parse_type<Input>(in, ctx.get_header())
+              return parse_type<Input, Codec>(in, ctx.get_header())
                 .then_wrapped([f = std::forward<Func>(f),
                                &ctx](ss::future<Input> input_f) mutable {
                     if (input_f.failed()) {
@@ -75,12 +75,13 @@ struct service::execution_helper {
                     return f(std::move(input), ctx);
                 })
                 .then([method_id, &ctx](Output out) mutable {
-                    const auto version = ctx.get_header().version;
+                    const auto version = Codec::response_version(
+                      ctx.get_header());
                     auto b = std::make_unique<netbuf>();
                     auto raw_b = b.get();
                     raw_b->set_service_method_id(method_id);
                     raw_b->set_version(version);
-                    return encode_for_version(
+                    return Codec::encode(
                              raw_b->buffer(), std::move(out), version)
                       .then([version, b = std::move(b)](
                               transport_version effective_version) {
diff --git a/src/v/rpc/test/netbuf_tests.cc b/src/v/rpc/test/netbuf_tests.cc
index 3d8ac0112643c..0690da1a09544 100644
--- a/src/v/rpc/test/netbuf_tests.cc
+++ b/src/v/rpc/test/netbuf_tests.cc
@@ -21,8 +21,9 @@ namespace rpc {
 /// \brief expects the inputstream to be prefixed by an rpc::header
 template<typename T>
 ss::future<T> parse_framed(ss::input_stream<char>& in) {
-    return parse_header(in).then(
-      [&in](std::optional<header> o) { return parse_type<T>(in, o.value()); });
+    return parse_header(in).then([&in](std::optional<header> o) {
+        return parse_type<T, default_message_codec>(in, o.value());
+    });
 }
 } // namespace rpc
 
diff --git a/src/v/rpc/transport.h b/src/v/rpc/transport.h
index bd25f4fac460b..132451a9ef38b 100644
--- a/src/v/rpc/transport.h
+++ b/src/v/rpc/transport.h
@@ -173,7 +173,7 @@ ss::future<result<rpc::client_context<T>>> parse_result(
         return ss::make_ready_future<ret_t>(map_server_error(st));
     }
 
-    return parse_type<T>(in, sctx->get_header())
+    return parse_type<T, default_message_codec>(in, sctx->get_header())
       .then_wrapped([sctx = std::move(sctx)](ss::future<T> data_fut) {
           if (data_fut.failed()) {
               const auto ex = data_fut.get_exception();
diff --git a/tools/rpcgen.py b/tools/rpcgen.py
index f932169e2f5cf..e604604e529e0 100755
--- a/tools/rpcgen.py
+++ b/tools/rpcgen.py
@@ -54,7 +54,8 @@
 
 namespace {{namespace}} {
 
-class {{service_name}}_service : public rpc::service {
+template<typename Codec>
+class {{service_name}}_service_base : public rpc::service {
 public:
     class failure_probes;
 
@@ -62,21 +63,21 @@ class failure_probes;
     static constexpr uint32_t {{method.name}}_method_id = {{method.id}};
     {%- endfor %}
 
-    {{service_name}}_service(ss::scheduling_group sc, ss::smp_service_group ssg)
+    {{service_name}}_service_base(ss::scheduling_group sc, ss::smp_service_group ssg)
        : _sc(sc), _ssg(ssg) {}
 
-    {{service_name}}_service({{service_name}}_service&& o) noexcept
+    {{service_name}}_service_base({{service_name}}_service_base&& o) noexcept
       : _sc(std::move(o._sc)), _ssg(std::move(o._ssg)), _methods(std::move(o._methods)) {}
 
-    {{service_name}}_service& operator=({{service_name}}_service&& o) noexcept {
+    {{service_name}}_service_base& operator=({{service_name}}_service_base&& o) noexcept {
        if(this != &o){
-          this->~{{service_name}}_service();
-          new (this) {{service_name}}_service(std::move(o));
+          this->~{{service_name}}_service_base();
+          new (this) {{service_name}}_service_base(std::move(o));
        }
        return *this;
     }
 
-    virtual ~{{service_name}}_service() noexcept = default;
+    virtual ~{{service_name}}_service_base() noexcept = default;
 
     void setup_metrics() final {
         namespace sm = ss::metrics;
@@ -124,7 +125,8 @@ class failure_probes;
     virtual ss::future<rpc::netbuf>
     raw_{{method.name}}(ss::input_stream<char>& in, rpc::streaming_context& ctx) {
       return execution_helper<{{method.input_type}},
-                              {{method.output_type}}>::exec(in, ctx, {{method.id}},
+                              {{method.output_type}},
+                              Codec>::exec(in, ctx, {{method.id}},
       [this](
           {{method.input_type}}&& t, rpc::streaming_context& ctx) -> ss::future<{{method.output_type}}> {
           return {{method.name}}(std::move(t), ctx);
@@ -166,7 +168,8 @@ class {{service_name}}_client_protocol {
     rpc::transport& _transport;
 };
 
-class {{service_name}}_service::failure_probes final : public finjector::probe {
+template<typename Codec>
+class {{service_name}}_service_base<Codec>::failure_probes final : public finjector::probe {
 public:
     using type = uint32_t;
 
@@ -221,6 +224,8 @@ class {{service_name}}_service::failure_probes final : public finjector::probe {
     fast_prng _prng;
 };
 
+using {{service_name}}_service = {{service_name}}_service_base<rpc::default_message_codec>;
+
 } // namespace
 """
 

From b21c9996e6e3f5473205126385d7971d304df312 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@redpanda.com>
Date: Fri, 27 May 2022 16:05:14 -0700
Subject: [PATCH 196/201] rpc: add message handler to emulate v0 services

Implementation of service message handler that can be used by tests to
build services with v0 semantics. These semantics are that messages are
unconditionally encoded and decoded with adl, version information is
ignored, and responses are sent with version v0.

Signed-off-by: Noah Watkins <noah@redpanda.com>
---
 src/v/rpc/parse_utils.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/src/v/rpc/parse_utils.h b/src/v/rpc/parse_utils.h
index c788d023d03e1..313db0b334c22 100644
--- a/src/v/rpc/parse_utils.h
+++ b/src/v/rpc/parse_utils.h
@@ -224,6 +224,33 @@ struct default_message_codec {
     }
 };
 
+/*
+ * service specialization mixin to create a v0 compliant service. a v0 service
+ * encodes and decodes using adl, ignores versions on requests, and sends
+ * replies with v0 in the header.
+ *
+ * example:
+ *   using echo_service_v0 = echo_service_base<v0_message_codec>;
+ */
+struct v0_message_codec {
+    template<typename T>
+    static ss::future<T> decode(iobuf_parser& parser, transport_version) {
+        return reflection::async_adl<T>{}.from(parser);
+    }
+
+    static transport_version response_version(const header&) {
+        return transport_version::v0;
+    }
+
+    template<typename T>
+    static ss::future<transport_version>
+    encode(iobuf& out, T msg, transport_version) {
+        return reflection::async_adl<T>{}.to(out, std::move(msg)).then([] {
+            return transport_version::v0;
+        });
+    }
+};
+
 template<typename T, typename Codec>
 ss::future<T> parse_type(ss::input_stream<char>& in, const header& h) {
     return read_iobuf_exactly(in, h.payload_size).then([h](iobuf io) {

From cdf103a7760b526fe6f4beb4164ffd21100d71ef Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@redpanda.com>
Date: Fri, 27 May 2022 18:33:20 -0700
Subject: [PATCH 197/201] rpc: add v0 service creation in rpc test fixture

This allows the services used in rpc testing to be started in a v0 mode
for emulating different scenarios such as a new client connected to an
old server.

This also adds code generation for serde-only types in encode/decode
which unconditionally assert(false). This is because we combine all the
scenarios we want to test into a single service, and if we didn't do
this, then the service would not compile because it also contains
methods that are serde-only. This is disallowed at runtime--tests should
never be written that cause a service to handle a serde message in a v0
emulation scenario. The alternative would be a lot more bifurcation of
test services or some sort of sfinae magic.

This does not represent an issue for non-test cases because the default
message handler generates correct code for all scenarios.

Signed-off-by: Noah Watkins <noah@redpanda.com>
---
 src/v/rpc/parse_utils.h                | 23 +++++++++++++++++++----
 src/v/rpc/test/rpc_gen_cycling_test.cc | 24 ++++++++++++++++--------
 2 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/src/v/rpc/parse_utils.h b/src/v/rpc/parse_utils.h
index 313db0b334c22..2566533d59d3e 100644
--- a/src/v/rpc/parse_utils.h
+++ b/src/v/rpc/parse_utils.h
@@ -231,11 +231,22 @@ struct default_message_codec {
  *
  * example:
  *   using echo_service_v0 = echo_service_base<v0_message_codec>;
+ *
+ * Note that for serde-only messages a vassert(false) is generated. First, the
+ * v0_message_codec is only used in tests. Second, serde usage is not possible
+ * in v0 servers, so this restriction is realistic. And from a practical
+ * standpoint this allows us to avoid bifurcation of services (or more sfinae
+ * magic) in tests so that serde-only types are never present within a service
+ * configured with a v0_message_codec.
  */
 struct v0_message_codec {
     template<typename T>
     static ss::future<T> decode(iobuf_parser& parser, transport_version) {
-        return reflection::async_adl<T>{}.from(parser);
+        if constexpr (is_rpc_adl_exempt<T>) {
+            vassert(false, "Cannot use serde-only types in v0 server");
+        } else {
+            return reflection::async_adl<T>{}.from(parser);
+        }
     }
 
     static transport_version response_version(const header&) {
@@ -245,9 +256,13 @@ struct v0_message_codec {
     template<typename T>
     static ss::future<transport_version>
     encode(iobuf& out, T msg, transport_version) {
-        return reflection::async_adl<T>{}.to(out, std::move(msg)).then([] {
-            return transport_version::v0;
-        });
+        if constexpr (is_rpc_adl_exempt<T>) {
+            vassert(false, "Cannot use serde-only types in v0 server");
+        } else {
+            return reflection::async_adl<T>{}.to(out, std::move(msg)).then([] {
+                return transport_version::v0;
+            });
+        }
     }
 };
 
diff --git a/src/v/rpc/test/rpc_gen_cycling_test.cc b/src/v/rpc/test/rpc_gen_cycling_test.cc
index 72af506dc7aac..61846210e38f2 100644
--- a/src/v/rpc/test/rpc_gen_cycling_test.cc
+++ b/src/v/rpc/test/rpc_gen_cycling_test.cc
@@ -10,6 +10,7 @@
 #include "model/timeout_clock.h"
 #include "random/generators.h"
 #include "rpc/exceptions.h"
+#include "rpc/parse_utils.h"
 #include "rpc/test/cycling_service.h"
 #include "rpc/test/echo_service.h"
 #include "rpc/test/rpc_gen_types.h"
@@ -35,9 +36,10 @@
 using namespace std::chrono_literals; // NOLINT
 
 // Test services
-struct movistar final : cycling::team_movistar_service {
+template<typename Codec>
+struct movistar final : cycling::team_movistar_service_base<Codec> {
     movistar(ss::scheduling_group& sc, ss::smp_service_group& ssg)
-      : cycling::team_movistar_service(sc, ssg) {}
+      : cycling::team_movistar_service_base<Codec>(sc, ssg) {}
     ss::future<cycling::mount_tamalpais>
     ibis_hakka(cycling::san_francisco&&, rpc::streaming_context&) final {
         return ss::make_ready_future<cycling::mount_tamalpais>(
@@ -50,9 +52,10 @@ struct movistar final : cycling::team_movistar_service {
     }
 };
 
-struct echo_impl final : echo::echo_service {
+template<typename Codec>
+struct echo_impl final : echo::echo_service_base<Codec> {
     echo_impl(ss::scheduling_group& sc, ss::smp_service_group& ssg)
-      : echo::echo_service(sc, ssg) {}
+      : echo::echo_service_base<Codec>(sc, ssg) {}
     ss::future<echo::echo_resp>
     echo(echo::echo_req&& req, rpc::streaming_context&) final {
         return ss::make_ready_future<echo::echo_resp>(
@@ -122,8 +125,13 @@ class rpc_integration_fixture : public rpc_simple_integration_fixture {
       : rpc_simple_integration_fixture(redpanda_rpc_port) {}
 
     void register_services() {
-        register_service<movistar>();
-        register_service<echo_impl>();
+        register_service<movistar<rpc::default_message_codec>>();
+        register_service<echo_impl<rpc::default_message_codec>>();
+    }
+
+    void register_services_v0() {
+        register_service<movistar<rpc::v0_message_codec>>();
+        register_service<echo_impl<rpc::v0_message_codec>>();
     }
 
     static constexpr uint16_t redpanda_rpc_port = 32147;
@@ -658,8 +666,8 @@ class erroneous_service_fixture
       : rpc_fixture_swappable_proto(redpanda_rpc_port) {}
 
     void register_services() {
-        register_service<movistar>();
-        register_service<echo_impl>();
+        register_service<movistar<rpc::default_message_codec>>();
+        register_service<echo_impl<rpc::default_message_codec>>();
     }
 
     static constexpr uint16_t redpanda_rpc_port = 32147;

From 333c8a115c5b1b37f0917d481a15b87256ba15f4 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@redpanda.com>
Date: Sat, 28 May 2022 10:16:03 -0700
Subject: [PATCH 198/201] rpc: expose transport version

Used in tests to verify transport version upgrades.

Signed-off-by: Noah Watkins <noah@redpanda.com>
---
 src/v/rpc/transport.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/v/rpc/transport.h b/src/v/rpc/transport.h
index 132451a9ef38b..b08b60d182c7e 100644
--- a/src/v/rpc/transport.h
+++ b/src/v/rpc/transport.h
@@ -68,6 +68,8 @@ class transport final : public net::base_transport {
 
     void reset_state() final;
 
+    transport_version version() const { return _version; }
+
 private:
     using sequence_t = named_type<uint64_t, struct sequence_tag>;
     struct entry {

From 43ee8ee499b1dd859e7d67b3ba6b922c90bc1c49 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@redpanda.com>
Date: Sat, 28 May 2022 10:16:16 -0700
Subject: [PATCH 199/201] rpc: enable support for new transport versions

Transport is initially at v1, from which the rpc framework allows
upgrading.

Signed-off-by: Noah Watkins <noah@redpanda.com>
---
 src/v/rpc/transport.h | 2 +-
 src/v/rpc/types.h     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/v/rpc/transport.h b/src/v/rpc/transport.h
index b08b60d182c7e..b9160d80ab86d 100644
--- a/src/v/rpc/transport.h
+++ b/src/v/rpc/transport.h
@@ -110,7 +110,7 @@ class transport final : public net::base_transport {
      * during the lifetime of the transport. for example the version may be
      * upgraded if it is discovered that a server supports a newer version.
      */
-    transport_version _version{transport_version::v0};
+    transport_version _version{transport_version::v1};
 
     friend std::ostream& operator<<(std::ostream&, const transport&);
 };
diff --git a/src/v/rpc/types.h b/src/v/rpc/types.h
index 669718ba553c4..987cb08b6480d 100644
--- a/src/v/rpc/types.h
+++ b/src/v/rpc/types.h
@@ -87,7 +87,7 @@ enum class transport_version : uint8_t {
     v1 = 1,
     v2 = 2,
 
-    max_supported = v0,
+    max_supported = v2,
 
     /*
      * unsupported is a convenience name used in tests to construct a message

From 5d201d73988b32fad59493e2e09f34c13dd590e0 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@redpanda.com>
Date: Sat, 28 May 2022 18:55:16 -0700
Subject: [PATCH 200/201] rpc: add tests for adaptive rpc encoding

Signed-off-by: Noah Watkins <noah@redpanda.com>
---
 src/v/rpc/test/rpc_gen_cycling_test.cc | 250 +++++++++++++++++++++++++
 src/v/rpc/transport.h                  |   7 +
 2 files changed, 257 insertions(+)

diff --git a/src/v/rpc/test/rpc_gen_cycling_test.cc b/src/v/rpc/test/rpc_gen_cycling_test.cc
index 61846210e38f2..b95c5b893ef35 100644
--- a/src/v/rpc/test/rpc_gen_cycling_test.cc
+++ b/src/v/rpc/test/rpc_gen_cycling_test.cc
@@ -689,3 +689,253 @@ FIXTURE_TEST(unhandled_throw_in_proto_apply, erroneous_service_fixture) {
       .get();
     t.stop().get();
 }
+
+/*
+ * new client, new server
+ * client has initial transport version v1
+ * sends adl+serde message at (adl,v1)
+ * client has transport upgraded to v2
+ * client transport remains at v2
+ */
+FIXTURE_TEST(nc_ns_adl_serde_client_upgraded, rpc_integration_fixture) {
+    configure_server();
+    register_services();
+    start_server();
+
+    rpc::transport t(client_config());
+    t.connect(model::no_timeout).get();
+    auto stop = ss::defer([&t] { t.stop().get(); });
+    auto client = echo::echo_client_protocol(t);
+
+    BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v1);
+
+    // first messages are sent with adl
+    {
+        const auto payload = random_generators::gen_alphanum_string(100);
+        auto f = client.echo_adl_serde(
+          echo::echo_req_adl_serde{.str = payload},
+          rpc::client_opts(rpc::no_timeout));
+        auto ret = f.get();
+        BOOST_REQUIRE(ret.has_value());
+        BOOST_REQUIRE_EQUAL(
+          ret.value().data.str, payload + "_to_aas_from_aas_to_aas_from_aas");
+    }
+
+    // subsequent messages use serde
+    for (int i = 0; i < 10; i++) {
+        const auto payload = random_generators::gen_alphanum_string(100);
+        auto f = client.echo_adl_serde(
+          echo::echo_req_adl_serde{.str = payload},
+          rpc::client_opts(rpc::no_timeout));
+        auto ret = f.get();
+        BOOST_REQUIRE(ret.has_value());
+        BOOST_REQUIRE_EQUAL(
+          ret.value().data.str, payload + "_to_sas_from_sas_to_sas_from_sas");
+
+        // upgraded and remains at v2
+        BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v2);
+    }
+}
+
+/*
+ * new client, new server
+ * client has initial transport version v1
+ * sends serde-only message at (serde,v2)
+ * client has transport upgraded to v2
+ * client transport remains at v2
+ */
+FIXTURE_TEST(nc_ns_serde_only_client_upgraded, rpc_integration_fixture) {
+    configure_server();
+    register_services();
+    start_server();
+
+    rpc::transport t(client_config());
+    t.connect(model::no_timeout).get();
+    auto stop = ss::defer([&t] { t.stop().get(); });
+    auto client = echo::echo_client_protocol(t);
+
+    BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v1);
+
+    for (int i = 0; i < 10; i++) {
+        const auto payload = random_generators::gen_alphanum_string(100);
+        auto f = client.echo_serde_only(
+          echo::echo_req_serde_only{.str = payload},
+          rpc::client_opts(rpc::no_timeout));
+        auto ret = f.get();
+        BOOST_REQUIRE(ret.has_value());
+        BOOST_REQUIRE_EQUAL(
+          ret.value().data.str, payload + "_to_sso_from_sso_to_sso_from_sso");
+
+        // upgraded and remains at v2
+        BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v2);
+    }
+}
+
+/*
+ * new client, new server
+ * client sends adl-only message (adl,v1)
+ * client remains pinned at v1
+ *
+ * client will not be upgraded. adl-only messages are always set at v0 and the
+ * server will always respond with v0 messages. upgrade doesn't happen because
+ * client only upgrades in response to a v1 or v2 message.
+ *
+ * this case is for the interim development period where we are allowing types
+ * with only adl support until all types have serde support added.
+ */
+FIXTURE_TEST(nc_ns_adl_only_no_client_upgrade, rpc_integration_fixture) {
+    configure_server();
+    register_services();
+    start_server();
+
+    rpc::transport t(client_config());
+    t.connect(model::no_timeout).get();
+    auto stop = ss::defer([&t] { t.stop().get(); });
+    auto client = echo::echo_client_protocol(t);
+
+    BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v1);
+
+    for (int i = 0; i < 10; i++) {
+        const auto payload = random_generators::gen_alphanum_string(100);
+        auto f = client.echo_adl_only(
+          echo::echo_req_adl_only{.str = payload},
+          rpc::client_opts(rpc::no_timeout));
+        auto ret = f.get();
+        BOOST_REQUIRE(ret.has_value());
+        BOOST_REQUIRE_EQUAL(
+          ret.value().data.str, payload + "_to_aao_from_aao_to_aao_from_aao");
+
+        // no upgrade
+        BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v1);
+    }
+}
+
+/*
+ * new client, old server
+ * client has initial transport version v1
+ * [sends adl+serde message at (adl,v1)] * N
+ * client transport version is not upgraded
+ */
+FIXTURE_TEST(nc_os_adl_serde_no_client_upgrade, rpc_integration_fixture) {
+    configure_server();
+    register_services_v0();
+    start_server();
+
+    rpc::transport t(client_config());
+    t.connect(model::no_timeout).get();
+    auto stop = ss::defer([&t] { t.stop().get(); });
+    auto client = echo::echo_client_protocol(t);
+
+    // client initially at v1
+    BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v1);
+
+    for (int i = 0; i < 10; i++) {
+        const auto payload = random_generators::gen_alphanum_string(100);
+        auto f = client.echo_adl_serde(
+          echo::echo_req_adl_serde{.str = payload},
+          rpc::client_opts(rpc::no_timeout));
+        auto ret = f.get();
+        BOOST_REQUIRE(ret.has_value());
+        BOOST_REQUIRE_EQUAL(
+          ret.value().data.str, payload + "_to_aas_from_aas_to_aas_from_aas");
+
+        // client stays at v1 without upgrade to v2
+        BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v1);
+    }
+}
+
+/*
+ * new client, old server
+ * client has initial transport version v1
+ * [sends adl-only message at (adl,v1)] * N
+ * client transport version is not upgraded
+ */
+FIXTURE_TEST(nc_os_adl_only_no_client_upgrade, rpc_integration_fixture) {
+    configure_server();
+    register_services_v0();
+    start_server();
+
+    rpc::transport t(client_config());
+    t.connect(model::no_timeout).get();
+    auto stop = ss::defer([&t] { t.stop().get(); });
+    auto client = echo::echo_client_protocol(t);
+
+    // client initially at v1
+    BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v1);
+
+    for (int i = 0; i < 10; i++) {
+        const auto payload = random_generators::gen_alphanum_string(100);
+        auto f = client.echo_adl_only(
+          echo::echo_req_adl_only{.str = payload},
+          rpc::client_opts(rpc::no_timeout));
+        auto ret = f.get();
+        BOOST_REQUIRE(ret.has_value());
+        BOOST_REQUIRE_EQUAL(
+          ret.value().data.str, payload + "_to_aao_from_aao_to_aao_from_aao");
+
+        // client stays at v1 without upgrade to v2
+        BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v1);
+    }
+}
+
+/*
+ * old client, new server
+ * sends an adl encoded message which the server understands but also has serde
+ * support for. communication should continue to use adl.
+ */
+FIXTURE_TEST(oc_ns_adl_serde_no_upgrade, rpc_integration_fixture) {
+    configure_server();
+    register_services();
+    start_server();
+
+    rpc::transport t(client_config());
+    t.set_version(rpc::transport_version::v0);
+    t.connect(model::no_timeout).get();
+    auto stop = ss::defer([&t] { t.stop().get(); });
+    auto client = echo::echo_client_protocol(t);
+
+    BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v0);
+
+    for (int i = 0; i < 10; i++) {
+        const auto payload = random_generators::gen_alphanum_string(100);
+        auto f = client.echo_adl_serde(
+          echo::echo_req_adl_serde{.str = payload},
+          rpc::client_opts(rpc::no_timeout));
+        auto ret = f.get();
+        BOOST_REQUIRE(ret.has_value());
+        BOOST_REQUIRE_EQUAL(
+          ret.value().data.str, payload + "_to_aas_from_aas_to_aas_from_aas");
+        BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v0);
+    }
+}
+
+/*
+ * old client, new server
+ * adl-only. verifies behavior for intermediate state when we support adl-only
+ * messages.
+ */
+FIXTURE_TEST(oc_ns_adl_only_no_upgrade, rpc_integration_fixture) {
+    configure_server();
+    register_services();
+    start_server();
+
+    rpc::transport t(client_config());
+    t.set_version(rpc::transport_version::v0);
+    t.connect(model::no_timeout).get();
+    auto stop = ss::defer([&t] { t.stop().get(); });
+    auto client = echo::echo_client_protocol(t);
+
+    BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v0);
+
+    for (int i = 0; i < 10; i++) {
+        const auto payload = random_generators::gen_alphanum_string(100);
+        auto f = client.echo_adl_only(
+          echo::echo_req_adl_only{.str = payload},
+          rpc::client_opts(rpc::no_timeout));
+        auto ret = f.get();
+        BOOST_REQUIRE(ret.has_value());
+        BOOST_REQUIRE_EQUAL(
+          ret.value().data.str, payload + "_to_aao_from_aao_to_aao_from_aao");
+        BOOST_REQUIRE_EQUAL(t.version(), rpc::transport_version::v0);
+    }
+}
diff --git a/src/v/rpc/transport.h b/src/v/rpc/transport.h
index b9160d80ab86d..abbdf6d078cd0 100644
--- a/src/v/rpc/transport.h
+++ b/src/v/rpc/transport.h
@@ -38,6 +38,9 @@
 #include <optional>
 #include <utility>
 
+class rpc_integration_fixture_oc_ns_adl_serde_no_upgrade;
+class rpc_integration_fixture_oc_ns_adl_only_no_upgrade;
+
 namespace rpc {
 struct client_context_impl;
 
@@ -112,6 +115,10 @@ class transport final : public net::base_transport {
      */
     transport_version _version{transport_version::v1};
 
+    friend class ::rpc_integration_fixture_oc_ns_adl_serde_no_upgrade;
+    friend class ::rpc_integration_fixture_oc_ns_adl_only_no_upgrade;
+    void set_version(transport_version v) { _version = v; } // used by tests
+
     friend std::ostream& operator<<(std::ostream&, const transport&);
 };
 

From 7551561a19af33496af3164c76df8ec72d210ab8 Mon Sep 17 00:00:00 2001
From: Rogger Vasquez <rvasque3@gmail.com>
Date: Mon, 18 Jul 2022 13:21:46 -0500
Subject: [PATCH 201/201] rpk: sample config to include empty structs

Sample config file had empty properties:
pandaproxy and schema_registry; rpk strips them
out due to the omitempty struct tag.

In the past, viper assumed the presence of a
property was enough to include it in the file,
which did not respect the omitempty tag.
---
 conf/redpanda.yaml                   |   4 +-
 src/go/rpk/pkg/config/params_test.go | 101 +++++++++++++++++++++++++++
 2 files changed, 103 insertions(+), 2 deletions(-)

diff --git a/conf/redpanda.yaml b/conf/redpanda.yaml
index fee180320f1ae..4dd0f404a883f 100644
--- a/conf/redpanda.yaml
+++ b/conf/redpanda.yaml
@@ -45,10 +45,10 @@ redpanda:
   developer_mode: true
 
 # Enable Pandaproxy
-pandaproxy:
+pandaproxy: {}
 
 # Enable Schema Registry
-schema_registry:
+schema_registry: {}
 
 rpk:
   # TLS configuration.
diff --git a/src/go/rpk/pkg/config/params_test.go b/src/go/rpk/pkg/config/params_test.go
index e3de9f09430fc..e25f0e4a03492 100644
--- a/src/go/rpk/pkg/config/params_test.go
+++ b/src/go/rpk/pkg/config/params_test.go
@@ -1,10 +1,12 @@
 package config
 
 import (
+	"os"
 	"strings"
 	"testing"
 
 	"github.com/spf13/afero"
+	"github.com/stretchr/testify/require"
 )
 
 func TestParams_Write(t *testing.T) {
@@ -129,3 +131,102 @@ rpk:
 		})
 	}
 }
+
+func TestRedpandaSampleFile(t *testing.T) {
+	// Round-trip the repo's sample config ('redpanda/conf/redpanda.yaml').
+	sample, err := os.ReadFile("../../../../../conf/redpanda.yaml")
+	if err != nil {
+		t.Errorf("unexpected error while reading sample config file: %s", err)
+		return
+	}
+	fs := afero.NewMemMapFs() // in-memory fs; the test writes nothing to disk
+	err = afero.WriteFile(fs, "/etc/redpanda/redpanda.yaml", sample, 0o644)
+	if err != nil {
+		t.Errorf("unexpected error while writing sample config file: %s", err)
+		return
+	}
+	expCfg := &Config{ // what loading the sample file is expected to produce
+		ConfigFile: "/etc/redpanda/redpanda.yaml",
+		loadedPath: "/etc/redpanda/redpanda.yaml",
+		Redpanda: RedpandaConfig{
+			Directory: "/var/lib/redpanda/data",
+			RPCServer: SocketAddress{
+				Address: "0.0.0.0",
+				Port:    33145,
+			},
+			KafkaAPI: []NamedSocketAddress{{
+				Address: "0.0.0.0",
+				Port:    9092,
+			}},
+			AdminAPI: []NamedSocketAddress{{
+				Address: "0.0.0.0",
+				Port:    9644,
+			}},
+			ID:            1,
+			SeedServers:   []SeedServer{},
+			DeveloperMode: true,
+		},
+		Rpk: RpkConfig{
+			CoredumpDir:      "/var/lib/redpanda/coredump",
+			EnableUsageStats: true,
+		},
+		Pandaproxy:     &Pandaproxy{},     // expected to be non-nil
+		SchemaRegistry: &SchemaRegistry{}, // expected to be non-nil
+	}
+	// Load through Params and compare against the expected config.
+	cfg, err := new(Params).Load(fs)
+	if err != nil {
+		t.Errorf("unexpected error while loading sample config file: %s", err)
+		return
+	}
+	cfg = cfg.FileOrDefaults() // we want to check that we correctly load the raw file
+	require.Equal(t, expCfg, cfg)
+
+	// Write it back; the empty pandaproxy/schema_registry maps must survive.
+	err = cfg.Write(fs)
+	if err != nil {
+		t.Errorf("unexpected error while writing config file: %s", err)
+		return
+	}
+	file, err := afero.ReadFile(fs, "/etc/redpanda/redpanda.yaml")
+	if err != nil {
+		t.Errorf("unexpected error while reading config file from fs: %s", err)
+		return
+	}
+	require.Equal(t, `config_file: /etc/redpanda/redpanda.yaml
+redpanda:
+    data_directory: /var/lib/redpanda/data
+    node_id: 1
+    seed_servers: []
+    rpc_server:
+        address: 0.0.0.0
+        port: 33145
+    kafka_api:
+        - address: 0.0.0.0
+          port: 9092
+    admin:
+        - address: 0.0.0.0
+          port: 9644
+    developer_mode: true
+rpk:
+    enable_usage_stats: true
+    tune_network: false
+    tune_disk_scheduler: false
+    tune_disk_nomerges: false
+    tune_disk_write_cache: false
+    tune_disk_irq: false
+    tune_fstrim: false
+    tune_cpu: false
+    tune_aio_events: false
+    tune_clocksource: false
+    tune_swappiness: false
+    tune_transparent_hugepages: false
+    enable_memory_locking: false
+    tune_coredump: false
+    coredump_dir: /var/lib/redpanda/coredump
+    tune_ballast_file: false
+    overprovisioned: false
+pandaproxy: {}
+schema_registry: {}
+`, string(file))
+}