From 14b45a47e5c266d52bf30b4dff35b16646c373db Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 8 Aug 2022 13:19:28 +0100 Subject: [PATCH 1/7] cpu_scheduling: expose all scheduling groups This patch adds a public method that returns a list containing constant references to all the scheduling groups created by redpanda. --- src/v/resource_mgmt/cpu_scheduling.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/v/resource_mgmt/cpu_scheduling.h b/src/v/resource_mgmt/cpu_scheduling.h index 0d07761e068f..796ef761449b 100644 --- a/src/v/resource_mgmt/cpu_scheduling.h +++ b/src/v/resource_mgmt/cpu_scheduling.h @@ -65,7 +65,24 @@ class scheduling_groups final { } ss::scheduling_group archival_upload() { return _archival_upload; } + std::vector> + all_scheduling_groups() const { + return { + std::cref(_default), + std::cref(_admin), + std::cref(_raft), + std::cref(_kafka), + std::cref(_cluster), + std::cref(_coproc), + std::cref(_cache_background_reclaim), + std::cref(_compaction), + std::cref(_raft_learner_recovery), + std::cref(_archival_upload)}; + } + private: + ss::scheduling_group _default{ + seastar::default_scheduling_group()}; // created and managed by seastar ss::scheduling_group _admin; ss::scheduling_group _raft; ss::scheduling_group _kafka; From 1c3bbb4d43fe6085e3dc4b2b18254819ea14f9d3 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 8 Aug 2022 13:21:36 +0100 Subject: [PATCH 2/7] cpu_scheduling: add scheduling_groups probe This patch introduces a probe that queries each scheduling group for its usage stats and publishes metrics based on that. 
The following new metric is introduced: Name: redpanda_scheduler_runtime_seconds_total Description: Accumulated runtime of task queue associated with this scheduling group Labels: - redpanda_scheduling_group - shard --- src/v/resource_mgmt/scheduling_groups_probe.h | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 src/v/resource_mgmt/scheduling_groups_probe.h diff --git a/src/v/resource_mgmt/scheduling_groups_probe.h b/src/v/resource_mgmt/scheduling_groups_probe.h new file mode 100644 index 000000000000..013d5a1b851c --- /dev/null +++ b/src/v/resource_mgmt/scheduling_groups_probe.h @@ -0,0 +1,52 @@ +/* + * Copyright 2022 Redpanda Data, Inc. + * + * Use of this software is governed by the Business Source License + * included in the file licenses/BSL.md + * + * As of the Change Date specified in that file, in accordance with + * the Business Source License, use of this software will be governed + * by the Apache License, Version 2.0 + */ + +#pragma once + +#include "cluster/partition_leaders_table.h" +#include "config/configuration.h" +#include "prometheus/prometheus_sanitize.h" +#include "resource_mgmt/cpu_scheduling.h" +#include "ssx/metrics.h" + +#include + +class scheduling_groups_probe { +public: + void wire_up(const scheduling_groups& scheduling_groups) { + if (config::shard_local_cfg().disable_public_metrics()) { + return; + } + + auto groups = scheduling_groups.all_scheduling_groups(); + for (const auto& group_ref : groups) { + _public_metrics.add_group( + prometheus_sanitize::metrics_name("scheduler"), + {seastar::metrics::make_counter( + "runtime_seconds_total", + [group_ref] { + auto runtime_duration = group_ref.get().get_stats().runtime; + return std::chrono::duration(runtime_duration).count(); + }, + seastar::metrics::description( + "Accumulated runtime of task queue associated with this " + "scheduling group"), + {ssx::metrics::make_namespaced_label("scheduling_group")( + group_ref.get().name())})}); + } + } + + void clear() { 
_public_metrics.clear(); } + +private: + seastar::metrics::metric_groups _public_metrics{ + ssx::metrics::public_metrics_handle}; +}; From 1be0e7c01ccb3f34d325b4a771350f7670180ac2 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 8 Aug 2022 13:26:04 +0100 Subject: [PATCH 3/7] redpanda/main: use scheduling groups probe This commit wires up a scheduling_groups_probe in order to publish metrics based on the scheduling groups stats. Note how the probe is cleared before the scheduling groups are destroyed to prevent publishing metrics from a destroyed group. --- src/v/redpanda/application.cc | 7 +++++-- src/v/redpanda/application.h | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/v/redpanda/application.cc b/src/v/redpanda/application.cc index 780a7c23a56c..18c16c0b755e 100644 --- a/src/v/redpanda/application.cc +++ b/src/v/redpanda/application.cc @@ -294,8 +294,11 @@ void application::initialize( } _scheduling_groups.create_groups().get(); - _deferred.emplace_back( - [this] { _scheduling_groups.destroy_groups().get(); }); + _scheduling_groups_probe.wire_up(_scheduling_groups); + _deferred.emplace_back([this] { + _scheduling_groups_probe.clear(); + _scheduling_groups.destroy_groups().get(); + }); if (proxy_cfg) { _proxy_config.emplace(*proxy_cfg); diff --git a/src/v/redpanda/application.h b/src/v/redpanda/application.h index c88bc13e916d..f089fa6c3a0c 100644 --- a/src/v/redpanda/application.h +++ b/src/v/redpanda/application.h @@ -31,6 +31,7 @@ #include "redpanda/admin_server.h" #include "resource_mgmt/cpu_scheduling.h" #include "resource_mgmt/memory_groups.h" +#include "resource_mgmt/scheduling_groups_probe.h" #include "resource_mgmt/smp_groups.h" #include "rpc/fwd.h" #include "seastarx.h" @@ -157,6 +158,7 @@ class application { _schema_reg_config; std::optional _schema_reg_client_config; scheduling_groups _scheduling_groups; + scheduling_groups_probe _scheduling_groups_probe; ss::logger _log; ss::sharded _connection_cache; From 
a092bab7c92c3558d9f3eb95978a2b7e62ffc3de Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 11 Aug 2022 11:31:59 +0100 Subject: [PATCH 4/7] application: add up and busy time public metrics This commit splits the registration of internal and public metrics into two separate methods. It also adds two new metrics to the "public_metrics" endpoint: redpanda_application_uptime_seconds_total Description: Redpanda uptime in seconds Labels: none redpanda_application_busy_seconds_total Description: Total CPU busy time in seconds Labels: none --- src/v/redpanda/application.cc | 56 +++++++++++++++++++++++++++-------- src/v/redpanda/application.h | 5 ++++ 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/src/v/redpanda/application.cc b/src/v/redpanda/application.cc index 18c16c0b755e..3d4d2aff6cbd 100644 --- a/src/v/redpanda/application.cc +++ b/src/v/redpanda/application.cc @@ -58,7 +58,6 @@ #include "redpanda/admin_server.h" #include "resource_mgmt/io_priority.h" #include "rpc/simple_protocol.h" -#include "ssx/metrics.h" #include "storage/backlog_controller.h" #include "storage/chunk_cache.h" #include "storage/compaction_controller.h" @@ -317,23 +316,56 @@ void application::initialize( } void application::setup_metrics() { - if (!config::shard_local_cfg().disable_public_metrics()) { - seastar::metrics::replicate_metric_families( - seastar::metrics::default_handle(), - {{"scheduler_runtime_ms", ssx::metrics::public_metrics_handle}, - {"io_queue_total_read_ops", ssx::metrics::public_metrics_handle}, - {"io_queue_total_write_ops", ssx::metrics::public_metrics_handle}, - {"memory_allocated_memory", ssx::metrics::public_metrics_handle}, - {"memory_free_memory", ssx::metrics::public_metrics_handle}}) - .get(); - } + setup_internal_metrics(); + setup_public_metrics(); +} - if (config::shard_local_cfg().disable_metrics()) { +void application::setup_public_metrics() { + namespace sm = ss::metrics; + + if (config::shard_local_cfg().disable_public_metrics()) { return; } +
seastar::metrics::replicate_metric_families( + seastar::metrics::default_handle(), + {{"scheduler_runtime_ms", ssx::metrics::public_metrics_handle}, + {"io_queue_total_read_ops", ssx::metrics::public_metrics_handle}, + {"io_queue_total_write_ops", ssx::metrics::public_metrics_handle}, + {"memory_allocated_memory", ssx::metrics::public_metrics_handle}, + {"memory_free_memory", ssx::metrics::public_metrics_handle}}) + .get(); + + _public_metrics.add_group( + "application", + { + sm::make_gauge( + "uptime_seconds_total", + [] { + return std::chrono::duration(ss::engine().uptime()) + .count(); + }, + sm::description("Redpanda uptime in seconds")) + .aggregate({sm::shard_label}), + sm::make_gauge( + "busy_seconds_total", + [] { + return std::chrono::duration( + ss::engine().total_busy_time()) + .count(); + }, + sm::description("Total CPU busy time in seconds")) + .aggregate({sm::shard_label}), + }); +} + +void application::setup_internal_metrics() { namespace sm = ss::metrics; + if (config::shard_local_cfg().disable_metrics()) { + return; + } + // build info auto version_label = sm::label("version"); auto revision_label = sm::label("revision"); diff --git a/src/v/redpanda/application.h b/src/v/redpanda/application.h index f089fa6c3a0c..1e810fa3f364 100644 --- a/src/v/redpanda/application.h +++ b/src/v/redpanda/application.h @@ -35,6 +35,7 @@ #include "resource_mgmt/smp_groups.h" #include "rpc/fwd.h" #include "seastarx.h" +#include "ssx/metrics.h" #include "storage/fwd.h" #include "v8_engine/fwd.h" @@ -149,6 +150,8 @@ class application { } void setup_metrics(); + void setup_public_metrics(); + void setup_internal_metrics(); std::unique_ptr _app; bool _redpanda_enabled{true}; cluster::config_manager::preload_result _config_preload; @@ -176,6 +179,8 @@ class application { ss::sharded _archival_upload_controller; ss::metrics::metric_groups _metrics; + ss::metrics::metric_groups _public_metrics{ + ssx::metrics::public_metrics_handle}; std::unique_ptr _rm_group_proxy; // run 
these first on destruction deferred_actions _deferred; From 1c87686ec6c7424cc157eb75f562831114b82d54 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 11 Aug 2022 11:36:33 +0100 Subject: [PATCH 5/7] application: remove scheduler_runtime_ms metric scheduler_runtime_ms used to be replicated from the seastar metrics. Previous patches introduced redpanda_scheduler_runtime_seconds_total as a replacement. --- src/v/redpanda/application.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/v/redpanda/application.cc b/src/v/redpanda/application.cc index 3d4d2aff6cbd..20ae140a1ce1 100644 --- a/src/v/redpanda/application.cc +++ b/src/v/redpanda/application.cc @@ -329,8 +329,7 @@ void application::setup_public_metrics() { seastar::metrics::replicate_metric_families( seastar::metrics::default_handle(), - {{"scheduler_runtime_ms", ssx::metrics::public_metrics_handle}, - {"io_queue_total_read_ops", ssx::metrics::public_metrics_handle}, + {{"io_queue_total_read_ops", ssx::metrics::public_metrics_handle}, {"io_queue_total_write_ops", ssx::metrics::public_metrics_handle}, {"memory_allocated_memory", ssx::metrics::public_metrics_handle}, {"memory_free_memory", ssx::metrics::public_metrics_handle}}) From 30ce207ec9b6425acccd252a6179f3ed5e521aca Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 11 Aug 2022 14:37:20 +0100 Subject: [PATCH 6/7] ssx: add metric_groups wrapper to use as service This patch adds a wrapper for seastar::metrics::metric_groups which is intended for usage with seastar::sharded. The only interesting thing about it is that it clears the metrics on stop. 
--- src/v/ssx/metrics.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/v/ssx/metrics.h b/src/v/ssx/metrics.h index 4f3afd9ae8db..b1c65a5fafe2 100644 --- a/src/v/ssx/metrics.h +++ b/src/v/ssx/metrics.h @@ -13,6 +13,7 @@ #include "utils/hdr_hist.h" +#include #include namespace ssx::metrics { @@ -40,4 +41,12 @@ inline ss::metrics::label make_namespaced_label(const seastar::sstring& name) { return ss::metrics::label(ssx::sformat("{}_{}", label_namespace, name)); } +struct public_metrics_group { + ss::metrics::metric_groups groups{public_metrics_handle}; + ss::future<> stop() { + groups.clear(); + return ss::make_ready_future<>(); + } +}; + } // namespace ssx::metrics From 5c94d72e222361d31f8bd13649c1929c8a87d067 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 11 Aug 2022 14:39:19 +0100 Subject: [PATCH 7/7] application: public metrics as a sharded service This patch wraps the metric_groups object owned by the application into a sharded service. This change allows us to register metrics on specific shards where required. For instance, redpanda_application_uptime_seconds_total is only registered on one shard, while redpanda_cpu_busy_seconds_total is registered on all shards. 
--- src/v/redpanda/application.cc | 56 ++++++++++++++++++++++------------- src/v/redpanda/application.h | 3 +- 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/src/v/redpanda/application.cc b/src/v/redpanda/application.cc index 20ae140a1ce1..d3e7cc20b5e3 100644 --- a/src/v/redpanda/application.cc +++ b/src/v/redpanda/application.cc @@ -335,27 +335,41 @@ void application::setup_public_metrics() { {"memory_free_memory", ssx::metrics::public_metrics_handle}}) .get(); - _public_metrics.add_group( - "application", - { - sm::make_gauge( - "uptime_seconds_total", - [] { - return std::chrono::duration(ss::engine().uptime()) - .count(); - }, - sm::description("Redpanda uptime in seconds")) - .aggregate({sm::shard_label}), - sm::make_gauge( - "busy_seconds_total", - [] { - return std::chrono::duration( - ss::engine().total_busy_time()) - .count(); - }, - sm::description("Total CPU busy time in seconds")) - .aggregate({sm::shard_label}), - }); + _public_metrics.start().get(); + + _public_metrics + .invoke_on( + ss::this_shard_id(), + [](auto& public_metrics) { + public_metrics.groups.add_group( + "application", + {sm::make_gauge( + "uptime_seconds_total", + [] { + return std::chrono::duration(ss::engine().uptime()) + .count(); + }, + sm::description("Redpanda uptime in seconds")) + .aggregate({sm::shard_label})}); + }) + .get(); + + _public_metrics.invoke_on_all([](auto& public_metrics) { + public_metrics.groups.add_group( + "cpu", + {sm::make_gauge( + "busy_seconds_total", + [] { + return std::chrono::duration( + ss::engine().total_busy_time()) + .count(); + }, + sm::description("Total CPU busy time in seconds"))}); + }).get(); + + _deferred.emplace_back([this] { + _public_metrics.stop().get(); + }); } void application::setup_internal_metrics() { diff --git a/src/v/redpanda/application.h b/src/v/redpanda/application.h index 1e810fa3f364..01a9bb602140 100644 --- a/src/v/redpanda/application.h +++ b/src/v/redpanda/application.h @@ -179,8 +179,7 @@ class 
application { ss::sharded _archival_upload_controller; ss::metrics::metric_groups _metrics; - ss::metrics::metric_groups _public_metrics{ - ssx::metrics::public_metrics_handle}; + ss::sharded _public_metrics; std::unique_ptr _rm_group_proxy; // run these first on destruction deferred_actions _deferred;