From 550a448203203a6e825d100e969372ef649218cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Ma=C5=9Blanka?= Date: Tue, 9 Apr 2024 19:47:19 +0200 Subject: [PATCH] c/hm_backend: use cached local health node report if it is available MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When node is requesting a local node health report we may use a cached value instead of collecting a new node health report every time. Signed-off-by: Michał Maślanka --- src/v/cluster/health_monitor_backend.cc | 27 +++++++++++++++++++++- src/v/cluster/health_monitor_backend.h | 7 ++++++ src/v/cluster/health_monitor_frontend.cc | 7 +++--- src/v/cluster/tests/health_monitor_test.cc | 17 ++++++++++++++ 4 files changed, 54 insertions(+), 4 deletions(-) diff --git a/src/v/cluster/health_monitor_backend.cc b/src/v/cluster/health_monitor_backend.cc index 2a3df80aabc48..45bf36c4687d7 100644 --- a/src/v/cluster/health_monitor_backend.cc +++ b/src/v/cluster/health_monitor_backend.cc @@ -434,7 +434,8 @@ ss::future health_monitor_backend::collect_cluster_health() { auto reports = co_await ssx::async_transform( ids.begin(), ids.end(), [this](model::node_id id) { if (id == _self) { - return collect_current_node_health(); + return _report_collection_mutex.with( + [this] { return collect_current_node_health(); }); } return collect_remote_node_health(id); }); @@ -532,6 +533,30 @@ health_monitor_backend::collect_current_node_health() { co_return ret; } +ss::future> +health_monitor_backend::get_current_node_health() { + vlog(clusterlog.debug, "getting current node health"); + + auto it = _reports.find(_self); + if (it != _reports.end()) { + co_return it->second; + } + + auto u = _report_collection_mutex.try_get_units(); + if (!u) { + vlog( + clusterlog.debug, + "report collection in progress, waiting for report to be available"); + u.emplace(co_await _report_collection_mutex.get_units()); + auto it = _reports.find(_self); + if (it != _reports.end()) { + co_return it->second; + } + } + + co_return co_await collect_current_node_health(); +} + namespace { struct ntp_report { diff --git a/src/v/cluster/health_monitor_backend.h b/src/v/cluster/health_monitor_backend.h index 3e82d5fb37582..3e7a6cecbd44b 100644 --- a/src/v/cluster/health_monitor_backend.h +++ b/src/v/cluster/health_monitor_backend.h @@ -69,6 +69,11 @@ class health_monitor_backend { force_refresh refresh, model::timeout_clock::time_point deadline); ss::future> collect_current_node_health(); + /** + * Return cached version of current node health of collects it if it is not + * available in cache. + */ + ss::future> get_current_node_health(); cluster::notification_id_type register_node_callback(health_node_cb_t cb); void unregister_node_callback(cluster::notification_id_type id); @@ -190,6 +195,8 @@ class health_monitor_backend { _node_callbacks; cluster::notification_id_type _next_callback_id{0}; + mutex _report_collection_mutex{"health_report_collection"}; + friend struct health_report_accessor; }; } // namespace cluster diff --git a/src/v/cluster/health_monitor_frontend.cc b/src/v/cluster/health_monitor_frontend.cc index 87e4a8eb6dc20..5dae6b782f827 100644 --- a/src/v/cluster/health_monitor_frontend.cc +++ b/src/v/cluster/health_monitor_frontend.cc @@ -64,12 +64,13 @@ storage::disk_space_alert health_monitor_frontend::get_cluster_disk_health() { return _cluster_disk_health; } -// Collcts and returns current node health report according to provided -// filters list +/** + * Gets cached or collects a node health report. + */ ss::future> health_monitor_frontend::collect_node_health() { return dispatch_to_backend([](health_monitor_backend& be) mutable { - return be.collect_current_node_health(); + return be.get_current_node_health(); }); } std::optional diff --git a/src/v/cluster/tests/health_monitor_test.cc b/src/v/cluster/tests/health_monitor_test.cc index 294e18a8f0722..69f2c9221a98f 100644 --- a/src/v/cluster/tests/health_monitor_test.cc +++ b/src/v/cluster/tests/health_monitor_test.cc @@ -21,6 +21,7 @@ #include "test_utils/fixture.h" #include +#include #include #include @@ -550,3 +551,19 @@ FIXTURE_TEST(test_report_truncation, health_report_unit) { test_unhealthy(max_count + 1, LEADERLESS); test_unhealthy(max_count + 1, URP); } + +FIXTURE_TEST( + test_requesting_collection_at_the_same_time, cluster_test_fixture) { + auto n1 = create_node_application(model::node_id{0}); + /** + * Request reports + */ + auto f_h_1 + = n1->controller->get_health_monitor().local().get_current_node_health(); + auto f_h_2 + = n1->controller->get_health_monitor().local().get_current_node_health(); + + auto results = ss::when_all(std::move(f_h_1), std::move(f_h_2)).get(); + BOOST_REQUIRE( + std::get<0>(results).get().value() == std::get<1>(results).get().value()); +}