Skip to content

Commit

Permalink
c/hm_backend: use cached local health node report if it is available
Browse files Browse the repository at this point in the history
When node is requesting a local node health report we may use a cached
value instead of collecting a new node health report every time.

Signed-off-by: Michał Maślanka <michal@redpanda.com>
  • Loading branch information
mmaslankaprv committed Apr 10, 2024
1 parent ab5a19c commit 550a448
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 4 deletions.
27 changes: 26 additions & 1 deletion src/v/cluster/health_monitor_backend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,8 @@ ss::future<std::error_code> health_monitor_backend::collect_cluster_health() {
auto reports = co_await ssx::async_transform(
ids.begin(), ids.end(), [this](model::node_id id) {
if (id == _self) {
return collect_current_node_health();
return _report_collection_mutex.with(
[this] { return collect_current_node_health(); });
}
return collect_remote_node_health(id);
});
Expand Down Expand Up @@ -532,6 +533,30 @@ health_monitor_backend::collect_current_node_health() {

co_return ret;
}
ss::future<result<node_health_report>>
health_monitor_backend::get_current_node_health() {
vlog(clusterlog.debug, "getting current node health");

auto it = _reports.find(_self);
if (it != _reports.end()) {
co_return it->second;
}

auto u = _report_collection_mutex.try_get_units();
if (!u) {
vlog(
clusterlog.debug,
"report collection in progress, waiting for report to be available");
u.emplace(co_await _report_collection_mutex.get_units());
auto it = _reports.find(_self);
if (it != _reports.end()) {
co_return it->second;
}
}

co_return co_await collect_current_node_health();
}

namespace {

struct ntp_report {
Expand Down
7 changes: 7 additions & 0 deletions src/v/cluster/health_monitor_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,11 @@ class health_monitor_backend {
force_refresh refresh, model::timeout_clock::time_point deadline);

ss::future<result<node_health_report>> collect_current_node_health();
/**
* Return cached version of current node health of collects it if it is not
* available in cache.
*/
ss::future<result<node_health_report>> get_current_node_health();

cluster::notification_id_type register_node_callback(health_node_cb_t cb);
void unregister_node_callback(cluster::notification_id_type id);
Expand Down Expand Up @@ -190,6 +195,8 @@ class health_monitor_backend {
_node_callbacks;
cluster::notification_id_type _next_callback_id{0};

mutex _report_collection_mutex{"health_report_collection"};

friend struct health_report_accessor;
};
} // namespace cluster
7 changes: 4 additions & 3 deletions src/v/cluster/health_monitor_frontend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,13 @@ storage::disk_space_alert health_monitor_frontend::get_cluster_disk_health() {
return _cluster_disk_health;
}

// Collcts and returns current node health report according to provided
// filters list
/**
* Gets cached or collects a node health report.
*/
ss::future<result<node_health_report>>
health_monitor_frontend::collect_node_health() {
return dispatch_to_backend([](health_monitor_backend& be) mutable {
return be.collect_current_node_health();
return be.get_current_node_health();
});
}
std::optional<alive>
Expand Down
17 changes: 17 additions & 0 deletions src/v/cluster/tests/health_monitor_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "test_utils/fixture.h"

#include <seastar/core/sstring.hh>
#include <seastar/core/when_all.hh>

#include <boost/test/tools/interface.hpp>
#include <boost/test/tools/old/interface.hpp>
Expand Down Expand Up @@ -550,3 +551,19 @@ FIXTURE_TEST(test_report_truncation, health_report_unit) {
test_unhealthy(max_count + 1, LEADERLESS);
test_unhealthy(max_count + 1, URP);
}

FIXTURE_TEST(
test_requesting_collection_at_the_same_time, cluster_test_fixture) {
auto n1 = create_node_application(model::node_id{0});
/**
* Request reports
*/
auto f_h_1
= n1->controller->get_health_monitor().local().get_current_node_health();
auto f_h_2
= n1->controller->get_health_monitor().local().get_current_node_health();

auto results = ss::when_all(std::move(f_h_1), std::move(f_h_2)).get();
BOOST_REQUIRE(
std::get<0>(results).get().value() == std::get<1>(results).get().value());
}

0 comments on commit 550a448

Please sign in to comment.