From 4a98a1ae8dee1442febddb8b1ba4fc68cf93fec7 Mon Sep 17 00:00:00 2001 From: Elena Anyusheva Date: Thu, 28 Jul 2022 14:45:48 +0200 Subject: [PATCH 1/5] cloud_storage: use stable iterator absl::btree_map doesn't provide a pointer stability, so incerementing iterator in a loop that has a scheduling point can lead to segfault. remote_partition uses stable iterator that make a lookup on every iterator increment. --- src/v/cloud_storage/remote_partition.cc | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/v/cloud_storage/remote_partition.cc b/src/v/cloud_storage/remote_partition.cc index b8e6a342a64a..3a000e72a76c 100644 --- a/src/v/cloud_storage/remote_partition.cc +++ b/src/v/cloud_storage/remote_partition.cc @@ -505,11 +505,13 @@ remote_partition::aborted_transactions(offset_range offsets) { // redpanda offsets to extract aborted transactions metadata because // tx-manifests contains redpanda offsets. std::vector result; - auto first_it = _segments.upper_bound(offsets.begin); - if (first_it != _segments.begin()) { + + // that's a stable btree iterator that makes key lookup on increment + auto first_it = upper_bound(offsets.begin); + if (first_it != begin()) { first_it = std::prev(first_it); } - for (auto it = first_it; it != _segments.end(); it++) { + for (auto it = first_it; it != end(); it++) { if (it->first > offsets.end) { break; } @@ -560,9 +562,9 @@ ss::future<> remote_partition::stop() { co_await std::visit([](auto&& rs) { return rs->stop(); }, rs); } - for (auto& [offset, seg] : _segments) { - vlog(_ctxlog.debug, "remote partition stop {}", offset); - co_await std::visit([](auto&& st) { return st->stop(); }, seg); + for (auto it = begin(); it != end(); it++) { + vlog(_ctxlog.debug, "remote partition stop {}", it->first); + co_await std::visit([](auto&& st) { return st->stop(); }, it->second); } } From 76cd89970a58c2e20a7b1e0655af2098a8de8509 Mon Sep 17 00:00:00 2001 From: Elena Anyusheva Date: Thu, 28 Jul 2022 15:29:00 +0200 Subject: [PATCH 2/5] cloud_storage: add gate guard to aborted_transactions --- src/v/cloud_storage/remote_partition.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/v/cloud_storage/remote_partition.cc b/src/v/cloud_storage/remote_partition.cc index 3a000e72a76c..8684572e68ab 100644 --- a/src/v/cloud_storage/remote_partition.cc +++ b/src/v/cloud_storage/remote_partition.cc @@ -16,6 +16,7 @@ #include "cloud_storage/types.h" #include "storage/parser_errc.h" #include "storage/types.h" +#include "utils/gate_guard.h" #include "utils/retry_chain_node.h" #include "utils/stream_utils.h" @@ -501,6 +502,7 @@ remote_partition::get_term_last_offset(model::term_id term) const { ss::future> remote_partition::aborted_transactions(offset_range offsets) { + gate_guard guard(_gate); // Here we have to use kafka offsets to locate the segments and // redpanda offsets to extract aborted transactions metadata because // tx-manifests contains redpanda offsets. From dddff6e20ced39bc40f101cdf6b6d2fdb4ca3440 Mon Sep 17 00:00:00 2001 From: Elena Anyusheva Date: Thu, 28 Jul 2022 14:38:13 +0200 Subject: [PATCH 3/5] cloud_storage: fix typo in log --- src/v/cloud_storage/remote_partition.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/v/cloud_storage/remote_partition.cc b/src/v/cloud_storage/remote_partition.cc index 8684572e68ab..54cb5529d535 100644 --- a/src/v/cloud_storage/remote_partition.cc +++ b/src/v/cloud_storage/remote_partition.cc @@ -544,10 +544,10 @@ remote_partition::aborted_transactions(offset_range offsets) { "found {} aborted transactions for {}-{} offset range ({}-{} before " "offset translaction)", result.size(), - offsets.begin_rp, offsets.begin, - offsets.end_rp, - offsets.end); + offsets.end, + offsets.begin_rp, + offsets.end_rp); co_return result; } From b6ca1cfda5a4dc7a96372ac953017e9fc1830bca Mon Sep 17 00:00:00 2001 From: Elena Anyusheva Date: Fri, 29 Jul 2022 13:45:11 +0200 Subject: [PATCH 4/5] cloud_storage: assert pointer in stable iterator increment remote_partition uses stable iterator wrapper for absl::btree. However stable iterator doesn't support deletions from the underlying btree_map. Add a comment about that. --- src/v/cloud_storage/remote_partition.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/v/cloud_storage/remote_partition.h b/src/v/cloud_storage/remote_partition.h index 3d14fce35729..8878d2326f09 100644 --- a/src/v/cloud_storage/remote_partition.h +++ b/src/v/cloud_storage/remote_partition.h @@ -48,8 +48,7 @@ namespace details { /// iterator stability guarantee by caching the key and /// doing a lookup on every increment. /// This turns iterator increment into O(logN) operation -/// but this is OK since the underlying btree_map has high -/// fan-out ratio so the logarithm base is relatively large. +/// Deleting from underlying btree_map is not supported. template class btree_map_stable_iterator : public boost::iterator_facade< @@ -93,6 +92,10 @@ class btree_map_stable_iterator vassert( _key.has_value(), "btree_map_stable_iterator can't be incremented"); auto it = _map.get().find(*_key); + // _key should be present since deletions are not supported + vassert( + it != _map.get().end(), + "btree_map_stable_iterator can't be incremented"); ++it; if (it == _map.get().end()) { set_end(); @@ -326,6 +329,10 @@ class remote_partition cache& _cache; const partition_manifest& _manifest; s3::bucket_name _bucket; + + // Deleting from _segments is not supported. + // absl::btree_map doesn't provide a pointer stabilty. We are + // using remote_partition::btree_map_stable_iterator to work around this. segment_map_t _segments; eviction_list_t _eviction_list; intrusive_list< From 28aab3f35c40c9a5e36b847bc8fdc0e8efe1fe18 Mon Sep 17 00:00:00 2001 From: Elena Anyusheva Date: Mon, 1 Aug 2022 13:25:30 +0200 Subject: [PATCH 5/5] cloud_storage: don't capture reference to btree element Reference to the absl::btree_map is not stable, so don't use a reference that was captured before the scheduling point. --- src/v/cloud_storage/remote_partition.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/v/cloud_storage/remote_partition.cc b/src/v/cloud_storage/remote_partition.cc index 54cb5529d535..75bd9bbbae4a 100644 --- a/src/v/cloud_storage/remote_partition.cc +++ b/src/v/cloud_storage/remote_partition.cc @@ -517,15 +517,14 @@ remote_partition::aborted_transactions(offset_range offsets) { if (it->first > offsets.end) { break; } - auto& st = it->second; auto tx = co_await ss::visit( - st, - [this, &st, offsets, offset_key = it->first]( + it->second, + [this, offsets, offset_key = it->first]( offloaded_segment_state& off_state) { auto tmp = off_state->materialize(*this, offset_key); auto res = tmp->segment->aborted_transactions( offsets.begin_rp, offsets.end_rp); - st = std::move(tmp); + _segments.insert_or_assign(offset_key, std::move(tmp)); return res; }, [offsets](materialized_segment_ptr& m_state) {