From 510fd0d3affdb3b9c32a7cfeb9ea1a04c612ed83 Mon Sep 17 00:00:00 2001 From: Dan Wang Date: Tue, 27 Dec 2022 19:30:31 +0800 Subject: [PATCH 01/17] feat(new_metrics): retire old metrics --- src/utils/metrics.cpp | 62 ++++++++++++++++++++++++++++++++++++++++++- src/utils/metrics.h | 15 +++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/src/utils/metrics.cpp b/src/utils/metrics.cpp index 6e83324d60..c073c809a7 100644 --- a/src/utils/metrics.cpp +++ b/src/utils/metrics.cpp @@ -17,6 +17,7 @@ #include "utils/metrics.h" +#include "runtime/api_layer1.h" #include "utils/api_utilities.h" #include "utils/rand.h" #include "utils/shared_io_service.h" @@ -177,6 +178,65 @@ void metric_entity::take_snapshot(metric_json_writer &writer, const metric_filte writer.EndObject(); } +metric_entity::old_metrics_type metric_entity::collect_old_metrics() const +{ + old_metrics_type old_metrics; + + utils::auto_read_lock l(_lock); + + for (const auto &m : _metrics) { + if (m.second->get_count() <= 1) { + old_metrics.insert(m.first); + continue; + } + + if (m.second->retire_time_ns() > 0) { + old_metrics.insert(m.first); + } + } + + return old_metrics; +} + +void metric_entity::retire_old_metrics(const old_metrics_type &old_metrics) +{ + if (old_metrics.empty()) { + return; + } + + utils::auto_write_lock l(_lock); + + for (const auto &m : old_metrics) { + auto iter = _metrics.find(m); + if (iter == _metrics.end()) { + continue; + } + + if (iter->second->get_count() > 1) { + if (iter->second->_retire_time_ns > 0) { + iter->second->_retire_time_ns = 0; + } + } + + auto now = dsn_now_ns(); + if (iter->second->_retire_time_ns == 0) { + iter->second->_retire_time_ns = now + ; + continue; + } + + if (iter->second->_retire_time_ns < now) { + continue; + } + + _metrics.erase(iter); + } +} + +void metric_entity::process_old_metrics() +{ + retire_old_metrics(collect_old_metrics()); +} + void metric_filters::extract_entity_metrics(const metric_entity::metric_map &candidates, metric_entity::metric_map &target_metrics) const { @@ -387,7 +447,7 @@ metric_prototype::metric_prototype(const ctor_args &args) : _args(args) {} metric_prototype::~metric_prototype() {} -metric::metric(const metric_prototype *prototype) : _prototype(prototype) {} +metric::metric(const metric_prototype *prototype) : _prototype(prototype), _retire_time_ns(0) {} closeable_metric::closeable_metric(const metric_prototype *prototype) : metric(prototype) {} diff --git a/src/utils/metrics.h b/src/utils/metrics.h index c7c1bda18d..4a1e1d4f98 100644 --- a/src/utils/metrics.h +++ b/src/utils/metrics.h @@ -153,6 +153,7 @@ class metric_entity : public ref_counter public: using attr_map = std::unordered_map; using metric_map = std::unordered_map; + using old_metrics_type = std::unordered_set; const metric_entity_prototype *prototype() const { return _prototype; } @@ -197,6 +198,12 @@ class metric_entity : public ref_counter void encode_id(metric_json_writer &writer) const; + old_metrics_type collect_old_metrics() const; + + void retire_old_metrics(const old_metrics_type &old_metrics); + + void process_old_metrics(); + const metric_entity_prototype *const _prototype; const std::string _id; @@ -524,6 +531,8 @@ class metric : public ref_counter public: const metric_prototype *prototype() const { return _prototype; } + uint64_t retire_time_ns() const { return _retire_time_ns; } + // Take snapshot of each metric to collect current values as json format with fields chosen // by `filters`. virtual void take_snapshot(metric_json_writer &writer, const metric_filters &filters) = 0; @@ -592,7 +601,11 @@ class metric : public ref_counter const metric_prototype *const _prototype; + uint64_t _retire_time_ns; + private: + friend class metric_entity; + DISALLOW_COPY_AND_ASSIGN(metric); }; @@ -1022,6 +1035,8 @@ class percentile : public closeable_metric // interval_ms is the interval between the computations for percentiles. Its unit is // milliseconds. It's suggested that interval_ms should be near the period between pulls // from or pushes to the monitoring system. + // TODO(wangdan): we can also support constructing percentiles from the parameters in + // the configuration file. percentile(const metric_prototype *prototype, uint64_t interval_ms = 10000, const std::set &kth_percentiles = kAllKthPercentileTypes, From 1e4cee396df6aa3a933f3a6cc0c0c252a21092ce Mon Sep 17 00:00:00 2001 From: Dan Wang Date: Wed, 28 Dec 2022 17:39:00 +0800 Subject: [PATCH 02/17] feat(new_metrics): retire old metrics --- src/utils/metrics.cpp | 127 ++++++++++++++++++++++++++++++++++++------ src/utils/metrics.h | 32 ++++++++--- 2 files changed, 133 insertions(+), 26 deletions(-) diff --git a/src/utils/metrics.cpp b/src/utils/metrics.cpp index c073c809a7..622f192441 100644 --- a/src/utils/metrics.cpp +++ b/src/utils/metrics.cpp @@ -19,6 +19,7 @@ #include "runtime/api_layer1.h" #include "utils/api_utilities.h" +#include "utils/flags.h" #include "utils/rand.h" #include "utils/shared_io_service.h" #include "utils/string_conv.h" @@ -26,6 +27,10 @@ namespace dsn { +DSN_DEFINE_uint64("metrics", metrics_retirement_delay_ms, 120 * 1000, + "The minimum number of milliseconds a metric will be kept for after it is " + "no longer active."); + metric_entity::metric_entity(const metric_entity_prototype *prototype, const std::string &id, const attr_map &attrs) @@ -42,6 +47,12 @@ metric_entity::~metric_entity() close(close_option::kWait); } +metric_entity::operator bool() const +{ + utils::auto_read_lock l(_lock); + return !_metrics.empty(); +} + void metric_entity::close(close_option option) { utils::auto_write_lock l(_lock); @@ -178,19 +189,24 @@ void metric_entity::take_snapshot(metric_json_writer &writer, const metric_filte writer.EndObject(); } -metric_entity::old_metrics_type metric_entity::collect_old_metrics() const +metric_entity::old_metric_list metric_entity::collect_old_metrics() const { - old_metrics_type old_metrics; + old_metric_list old_metrics; utils::auto_read_lock l(_lock); for (const auto &m : _metrics) { - if (m.second->get_count() <= 1) { + if (is_metric_stale(m.second)) { + // There's only one reference for this metric that is kept in the entity. Thus + // this metric is not considered to be in use. old_metrics.insert(m.first); continue; } - if (m.second->retire_time_ns() > 0) { + if (m.second->retire_time_ms() > 0) { + // This metric has previously been scheduled to be retired. However, now this + // metric is used again since its reference count is more than 1. Thus its + // delay time for retirement should be cleared. old_metrics.insert(m.first); } } @@ -198,9 +214,10 @@ metric_entity::old_metrics_type metric_entity::collect_old_metrics() const return old_metrics; } -void metric_entity::retire_old_metrics(const old_metrics_type &old_metrics) +void metric_entity::retire_old_metrics(const old_metric_list &old_metrics) { if (old_metrics.empty()) { + // Do not lock for empty list. return; } @@ -209,34 +226,38 @@ void metric_entity::retire_old_metrics(const old_metrics_type &old_metrics) for (const auto &m : old_metrics) { auto iter = _metrics.find(m); if (iter == _metrics.end()) { + // This metric has been removed from the entity. continue; } - if (iter->second->get_count() > 1) { - if (iter->second->_retire_time_ns > 0) { - iter->second->_retire_time_ns = 0; + if (!is_metric_stale(iter->second)) { + if (iter->second->_retire_time_ms > 0) { + // For those metrics which are still in use, their delay time for retirement + // should be cleared since previously they could have been scheduled to be + // retired. + iter->second->_retire_time_ms = 0; } + continue; } - auto now = dsn_now_ns(); - if (iter->second->_retire_time_ns == 0) { - iter->second->_retire_time_ns = now + ; + auto now = dsn_now_ms(); + if (iter->second->_retire_time_ms == 0) { + // This metric is not in use, thus should be marked with a delay time for + // retirement. + iter->second->_retire_time_ms = now + FLAGS_metrics_retirement_delay_ms; continue; } - if (iter->second->_retire_time_ns < now) { + if (iter->second->_retire_time_ms < now) { + // This metric has been scheduled to be retired, but has not taken effect. continue; } + // Retire the metric from the entity after the delay time. _metrics.erase(iter); } } -void metric_entity::process_old_metrics() -{ - retire_old_metrics(collect_old_metrics()); -} - void metric_filters::extract_entity_metrics(const metric_entity::metric_map &candidates, metric_entity::metric_map &target_metrics) const { @@ -311,6 +332,18 @@ dsn::metric_filters::metric_fields_type get_brief_metric_fields() const dsn::metric_filters::metric_fields_type kBriefMetricFields = get_brief_metric_fields(); +bool is_metric_stale(const dsn::metric_ptr &m) +{ + CHECK_GE(m.get_count(), 1); + return m.get_count() == 1; +} + +bool is_entity_stale(const dsn::metric_entity_ptr &entity) +{ + CHECK_GE(entity.second.get_count(), 1); + return *entity && entity.get_count() == 1; +} + } // anonymous namespace void metrics_http_service::get_metrics_handler(const http_request &req, http_response &resp) @@ -443,11 +476,69 @@ metric_entity_ptr metric_registry::find_or_create_entity(const metric_entity_pro return entity; } +metric_registry::old_entity_map metric_registry::collect_old_metrics() const +{ + old_entity_map old_entities; + + utils::auto_read_lock l(_lock); + + for (const auto &entity : _entities) { + const auto &old_metrics = entity.second->collect_old_metrics(); + if (!old_metrics.empty()) { + // Those entities which have stale metrics that should be retired will be + // collected. + old_entities.emplace(entity.first, std::move(old_metrics)); + continue; + } + + if (is_entity_stale(entity.second)) { + // Since this entity itself is stale, it will be collected without any + // metric. + (void)old_entities[entity.first]; + } + } + + return old_entities; +} + +void metric_registry::retire_old_metrics(const old_entity_map &old_entities) +{ + if (old_entities.empty()) { + // Do not lock for empty list. + return; + } + + utils::auto_write_lock l(_lock); + + for (const auto &old_entity : old_entities) { + auto iter = _entities.find(old_entity.first); + if (iter == _entities.end()) { + // This entity has been removed from the registry. + continue; + } + + // Try to retire the stale metrics from this entity. Notice that some entities + // may have no metric in which case it will never be locked. + iter->second->retire_old_metrics(old_entity.second); + + if (is_entity_stale(iter->second)) { + // The entity itself is stale and will be retired immediately, for the reason that + // each metric in it has been delayed for a long enough time before retirement. + _entities.erase(iter); + } + } +} + +void metric_registry::process_old_metrics() +{ + retire_old_metrics(collect_old_metrics()); +} + metric_prototype::metric_prototype(const ctor_args &args) : _args(args) {} metric_prototype::~metric_prototype() {} -metric::metric(const metric_prototype *prototype) : _prototype(prototype), _retire_time_ns(0) {} +metric::metric(const metric_prototype *prototype) : _prototype(prototype), _retire_time_ms(0) {} closeable_metric::closeable_metric(const metric_prototype *prototype) : metric(prototype) {} diff --git a/src/utils/metrics.h b/src/utils/metrics.h index 4a1e1d4f98..d702ccb962 100644 --- a/src/utils/metrics.h +++ b/src/utils/metrics.h @@ -153,7 +153,7 @@ class metric_entity : public ref_counter public: using attr_map = std::unordered_map; using metric_map = std::unordered_map; - using old_metrics_type = std::unordered_set; + using old_metric_list = std::unordered_set; const metric_entity_prototype *prototype() const { return _prototype; } @@ -179,6 +179,9 @@ class metric_entity : public ref_counter ~metric_entity(); + // Return true if the stored metrics are not empty. + explicit operator bool() const; + // Close all "closeable" metrics owned by this entity. // // `option` is used to control how the close operations are performed: @@ -198,11 +201,10 @@ class metric_entity : public ref_counter void encode_id(metric_json_writer &writer) const; - old_metrics_type collect_old_metrics() const; - - void retire_old_metrics(const old_metrics_type &old_metrics); - - void process_old_metrics(); + // The whole retirement process is divided two phases "collect" and "retire", please see + // `collect_old_metrics()` and `retire_old_metrics()` of registry for details. + old_metric_list collect_old_metrics() const; + void retire_old_metrics(const old_metric_list &old_metrics); const metric_entity_prototype *const _prototype; const std::string _id; @@ -365,6 +367,7 @@ class metric_registry : public utils::singleton { public: using entity_map = std::unordered_map; + using old_entity_map = std::unordered_map; entity_map entities() const; @@ -383,6 +386,19 @@ class metric_registry : public utils::singleton const std::string &id, const metric_entity::attr_map &attrs); + // Since retirements do not happen frequently, while we traverse over the registry there + // tend to be no metric and entity that should be retired. Therefore, we divide the whole + // retirement process into two phases: "collect" and "retire". + // + // In the first phase "collect", we just read and check which metrics and entitie should + // be retired, thus it just needs read lock that is lightweight. + // + // Once some metrics and entities are collected to be retired, in the second phase "retire", + // they will be removed according to the specific rules. + old_entity_map collect_old_metrics() const; + void retire_old_metrics(const old_entity_map &old_entities); + void process_old_metrics(); + mutable utils::rw_lock_nr _lock; entity_map _entities; @@ -531,7 +547,7 @@ class metric : public ref_counter public: const metric_prototype *prototype() const { return _prototype; } - uint64_t retire_time_ns() const { return _retire_time_ns; } + uint64_t retire_time_ms() const { return _retire_time_ms; } // Take snapshot of each metric to collect current values as json format with fields chosen // by `filters`. @@ -601,7 +617,7 @@ class metric : public ref_counter const metric_prototype *const _prototype; - uint64_t _retire_time_ns; + uint64_t _retire_time_ms; private: friend class metric_entity; From a0bd674d60afd186f5eba08ca84ea6c2338430e1 Mon Sep 17 00:00:00 2001 From: Dan Wang Date: Wed, 28 Dec 2022 18:56:44 +0800 Subject: [PATCH 03/17] feat(new_metrics): retire old metrics --- src/utils/metrics.cpp | 20 ++++---- src/utils/metrics.h | 104 +++++++++++++++++++++--------------------- 2 files changed, 62 insertions(+), 62 deletions(-) diff --git a/src/utils/metrics.cpp b/src/utils/metrics.cpp index 622f192441..b67d2b066b 100644 --- a/src/utils/metrics.cpp +++ b/src/utils/metrics.cpp @@ -405,7 +405,7 @@ metric_registry::metric_registry() : _http_service(this) { // We should ensure that metric_registry is destructed before shared_io_service is destructed. // Once shared_io_service is destructed before metric_registry is destructed, - // boost::asio::io_service needed by metrics in metric_registry such as percentile_timer will + // boost::asio::io_service needed by metrics in metric_registry such as metric_timer will // be released firstly, then will lead to heap-use-after-free error since percentiles in // metric_registry are still running but the resources they needed have been released. tools::shared_io_service::instance(); @@ -542,7 +542,7 @@ metric::metric(const metric_prototype *prototype) : _prototype(prototype), _reti closeable_metric::closeable_metric(const metric_prototype *prototype) : metric(prototype) {} -uint64_t percentile_timer::generate_initial_delay_ms(uint64_t interval_ms) +uint64_t metric_timer::generate_initial_delay_ms(uint64_t interval_ms) { CHECK_GT(interval_ms, 0); @@ -554,7 +554,7 @@ uint64_t percentile_timer::generate_initial_delay_ms(uint64_t interval_ms) return (rand::next_u64() % interval_seconds + 1) * 1000 + rand::next_u64() % 1000; } -percentile_timer::percentile_timer(uint64_t interval_ms, on_exec_fn on_exec, on_close_fn on_close) +metric_timer::metric_timer(uint64_t interval_ms, on_exec_fn on_exec, on_close_fn on_close) : _initial_delay_ms(generate_initial_delay_ms(interval_ms)), _interval_ms(interval_ms), _on_exec(on_exec), @@ -564,10 +564,10 @@ percentile_timer::percentile_timer(uint64_t interval_ms, on_exec_fn on_exec, on_ _timer(new boost::asio::deadline_timer(tools::shared_io_service::instance().ios)) { _timer->expires_from_now(boost::posix_time::milliseconds(_initial_delay_ms)); - _timer->async_wait(std::bind(&percentile_timer::on_timer, this, std::placeholders::_1)); + _timer->async_wait(std::bind(&metric_timer::on_timer, this, std::placeholders::_1)); } -void percentile_timer::close() +void metric_timer::close() { // If the timer has already expired when cancel() is called, then the handlers for asynchronous // wait operations will: @@ -584,15 +584,15 @@ void percentile_timer::close() } } -void percentile_timer::wait() { _completed.wait(); } +void metric_timer::wait() { _completed.wait(); } -void percentile_timer::on_close() +void metric_timer::on_close() { _on_close(); _completed.notify(); } -void percentile_timer::on_timer(const boost::system::error_code &ec) +void metric_timer::on_timer(const boost::system::error_code &ec) { // This macro is defined for the case that handlers for asynchronous wait operations are no // longer cancelled. It just checks the internal state atomically (since close() can also be @@ -616,7 +616,7 @@ void percentile_timer::on_timer(const boost::system::error_code &ec) // Cancel can only be launched by close(). auto expected_state = state::kClosing; CHECK(_state.compare_exchange_strong(expected_state, state::kClosed), - "wrong state for percentile_timer: {}, while expecting closing state", + "wrong state for metric_timer: {}, while expecting closing state", static_cast(expected_state)); on_close(); @@ -628,7 +628,7 @@ void percentile_timer::on_timer(const boost::system::error_code &ec) TRY_PROCESS_TIMER_CLOSING(); _timer->expires_from_now(boost::posix_time::milliseconds(_interval_ms)); - _timer->async_wait(std::bind(&percentile_timer::on_timer, this, std::placeholders::_1)); + _timer->async_wait(std::bind(&metric_timer::on_timer, this, std::placeholders::_1)); #undef TRY_PROCESS_TIMER_CLOSING } diff --git a/src/utils/metrics.h b/src/utils/metrics.h index d702ccb962..450d86242c 100644 --- a/src/utils/metrics.h +++ b/src/utils/metrics.h @@ -363,6 +363,56 @@ class metrics_http_service : public http_server_base DISALLOW_COPY_AND_ASSIGN(metrics_http_service); }; +// `metric_timer` is a timer class that encapsulates the details how each percentile is +// computed periodically. +// +// To be instantiated, it requires `interval_ms` at which a percentile is computed and `exec` +// which is used to compute percentile. +// +// In case that all percentiles are computed at the same time and lead to very high load, +// first computation for percentile will be delayed at a random interval. +class metric_timer +{ +public: + enum class state : int + { + kRunning, + kClosing, + kClosed, + }; + + using on_exec_fn = std::function; + using on_close_fn = std::function; + + metric_timer(uint64_t interval_ms, on_exec_fn on_exec, on_close_fn on_close); + ~metric_timer() = default; + + void close(); + void wait(); + + // Get the initial delay that is randomly generated by `generate_initial_delay_ms()`. + uint64_t get_initial_delay_ms() const { return _initial_delay_ms; } + +private: + // Generate an initial delay randomly in case that all percentiles are computed at the + // same time. + static uint64_t generate_initial_delay_ms(uint64_t interval_ms); + + void on_close(); + + void on_timer(const boost::system::error_code &ec); + + const uint64_t _initial_delay_ms; + const uint64_t _interval_ms; + const on_exec_fn _on_exec; + const on_close_fn _on_close; + std::atomic _state; + utils::notify_event _completed; + std::unique_ptr _timer; + + DISALLOW_COPY_AND_ASSIGN(metric_timer); +}; + class metric_registry : public utils::singleton { public: @@ -921,56 +971,6 @@ inline size_t kth_percentile_to_nth_index(size_t size, kth_percentile_type type) return kth_percentile_to_nth_index(size, static_cast(type)); } -// `percentile_timer` is a timer class that encapsulates the details how each percentile is -// computed periodically. -// -// To be instantiated, it requires `interval_ms` at which a percentile is computed and `exec` -// which is used to compute percentile. -// -// In case that all percentiles are computed at the same time and lead to very high load, -// first computation for percentile will be delayed at a random interval. -class percentile_timer -{ -public: - enum class state : int - { - kRunning, - kClosing, - kClosed, - }; - - using on_exec_fn = std::function; - using on_close_fn = std::function; - - percentile_timer(uint64_t interval_ms, on_exec_fn on_exec, on_close_fn on_close); - ~percentile_timer() = default; - - void close(); - void wait(); - - // Get the initial delay that is randomly generated by `generate_initial_delay_ms()`. - uint64_t get_initial_delay_ms() const { return _initial_delay_ms; } - -private: - // Generate an initial delay randomly in case that all percentiles are computed at the - // same time. - static uint64_t generate_initial_delay_ms(uint64_t interval_ms); - - void on_close(); - - void on_timer(const boost::system::error_code &ec); - - const uint64_t _initial_delay_ms; - const uint64_t _interval_ms; - const on_exec_fn _on_exec; - const on_close_fn _on_close; - std::atomic _state; - utils::notify_event _completed; - std::unique_ptr _timer; - - DISALLOW_COPY_AND_ASSIGN(percentile_timer); -}; - // The percentile is a metric type that samples observations. The size of samples has an upper // bound. Once the maximum size is reached, the earliest observations will be overwritten. // @@ -1097,7 +1097,7 @@ class percentile : public closeable_metric // See on_close() for details which is registered in timer and will be called // back once close() is invoked. add_ref(); - _timer.reset(new percentile_timer( + _timer.reset(new metric_timer( interval_ms, std::bind(&percentile::find_nth_elements, this), std::bind(&percentile::on_close, this))); @@ -1190,7 +1190,7 @@ class percentile : public closeable_metric std::vector> _full_nth_elements; NthElementFinder _nth_element_finder; - std::unique_ptr _timer; + std::unique_ptr _timer; DISALLOW_COPY_AND_ASSIGN(percentile); }; From 1f3992ef29783affa3811fa77e26d814fe9079f3 Mon Sep 17 00:00:00 2001 From: Dan Wang Date: Wed, 28 Dec 2022 20:08:22 +0800 Subject: [PATCH 04/17] feat(new_metrics): retire old metrics --- src/utils/metrics.cpp | 55 ++++++++++++++++++++++++++++++------------- src/utils/metrics.h | 4 ++++ 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/src/utils/metrics.cpp b/src/utils/metrics.cpp index b67d2b066b..ffd5f54311 100644 --- a/src/utils/metrics.cpp +++ b/src/utils/metrics.cpp @@ -27,7 +27,7 @@ namespace dsn { -DSN_DEFINE_uint64("metrics", metrics_retirement_delay_ms, 120 * 1000, +DSN_DEFINE_uint64("metrics", metrics_retirement_delay_ms, 5 * 60 * 1000, "The minimum number of milliseconds a metric will be kept for after it is " "no longer active."); @@ -193,22 +193,30 @@ metric_entity::old_metric_list metric_entity::collect_old_metrics() const { old_metric_list old_metrics; + auto now = dsn_now_ms(); + utils::auto_read_lock l(_lock); for (const auto &m : _metrics) { - if (is_metric_stale(m.second)) { - // There's only one reference for this metric that is kept in the entity. Thus - // this metric is not considered to be in use. - old_metrics.insert(m.first); + if (!is_metric_stale(m.second)) { + if (m.second->retire_time_ms() > 0) { + // This metric has previously been scheduled to be retired. However, now + // this metric is used again since its reference count is more than 1. + // Thus its delay time for retirement should be cleared. + old_metrics.insert(m.first); + } continue; } - if (m.second->retire_time_ms() > 0) { - // This metric has previously been scheduled to be retired. However, now this - // metric is used again since its reference count is more than 1. Thus its - // delay time for retirement should be cleared. - old_metrics.insert(m.first); + if (iter->second->retire_time_ms() > 0) { + if (iter->second->retire_time_ms() < now) { + // This metric has been scheduled to be retired, but retirement has not + // taken effect. + continue; + } } + + old_metrics.insert(m.first); } return old_metrics; @@ -221,6 +229,8 @@ void metric_entity::retire_old_metrics(const old_metric_list &old_metrics) return; } + auto now = dsn_now_ms(); + utils::auto_write_lock l(_lock); for (const auto &m : old_metrics) { @@ -240,7 +250,6 @@ void metric_entity::retire_old_metrics(const old_metric_list &old_metrics) continue; } - auto now = dsn_now_ms(); if (iter->second->_retire_time_ms == 0) { // This metric is not in use, thus should be marked with a delay time for // retirement. @@ -248,13 +257,10 @@ void metric_entity::retire_old_metrics(const old_metric_list &old_metrics) continue; } - if (iter->second->_retire_time_ms < now) { - // This metric has been scheduled to be retired, but has not taken effect. - continue; + if (dsn_likely(iter->second->_retire_time_ms >= now)) { + // Retire the metric from the entity after the delay time. + _metrics.erase(iter); } - - // Retire the metric from the entity after the delay time. - _metrics.erase(iter); } } @@ -335,12 +341,18 @@ const dsn::metric_filters::metric_fields_type kBriefMetricFields = get_brief_met bool is_metric_stale(const dsn::metric_ptr &m) { CHECK_GE(m.get_count(), 1); + + // There's only one reference for this metric that is kept in the entity. Thus + // this metric is not considered to be in use. return m.get_count() == 1; } bool is_entity_stale(const dsn::metric_entity_ptr &entity) { CHECK_GE(entity.second.get_count(), 1); + + // The entity has no metric and there's only one reference for the entity that is kept in the + // registry. Thus this entity is not considered to be in use. return *entity && entity.get_count() == 1; } @@ -409,6 +421,10 @@ metric_registry::metric_registry() : _http_service(this) // be released firstly, then will lead to heap-use-after-free error since percentiles in // metric_registry are still running but the resources they needed have been released. tools::shared_io_service::instance(); + + _timer.reset(new metric_timer(FLAGS_metrics_retirement_delay_ms, + std::bind(&metric_registry::process_old_metrics(), this), + std::bind(&metric_registry::on_close, this))); } metric_registry::~metric_registry() @@ -429,8 +445,13 @@ metric_registry::~metric_registry() for (auto &entity : _entities) { entity.second->close(metric_entity::close_option::kNoWait); } + + _timer->close(); + _timer->wait(); } +void metric_registry::on_close() {} + metric_registry::entity_map metric_registry::entities() const { utils::auto_read_lock l(_lock); diff --git a/src/utils/metrics.h b/src/utils/metrics.h index 450d86242c..e507d42749 100644 --- a/src/utils/metrics.h +++ b/src/utils/metrics.h @@ -432,6 +432,8 @@ class metric_registry : public utils::singleton metric_registry(); ~metric_registry(); + void on_close(); + metric_entity_ptr find_or_create_entity(const metric_entity_prototype *prototype, const std::string &id, const metric_entity::attr_map &attrs); @@ -454,6 +456,8 @@ class metric_registry : public utils::singleton metrics_http_service _http_service; + std::unique_ptr _timer; + DISALLOW_COPY_AND_ASSIGN(metric_registry); }; From af636ed13f8d35dc62ea90d14a3590db317b9407 Mon Sep 17 00:00:00 2001 From: Dan Wang Date: Thu, 29 Dec 2022 15:54:08 +0800 Subject: [PATCH 05/17] feat(new_metrics): retire old metrics --- src/utils/metrics.cpp | 72 +++++++++++++++++++++++++++------ src/utils/metrics.h | 9 ++++- src/utils/test/metrics_test.cpp | 31 ++++++++++++++ 3 files changed, 98 insertions(+), 14 deletions(-) diff --git a/src/utils/metrics.cpp b/src/utils/metrics.cpp index ffd5f54311..8e4c22c137 100644 --- a/src/utils/metrics.cpp +++ b/src/utils/metrics.cpp @@ -27,7 +27,7 @@ namespace dsn { -DSN_DEFINE_uint64("metrics", metrics_retirement_delay_ms, 5 * 60 * 1000, +DSN_DEFINE_uint64("metrics", metrics_retirement_delay_ms, 10 * 60 * 1000, "The minimum number of milliseconds a metric will be kept for after it is " "no longer active."); @@ -222,11 +222,13 @@ metric_entity::old_metric_list metric_entity::collect_old_metrics() const return old_metrics; } -void metric_entity::retire_old_metrics(const old_metric_list &old_metrics) +std::tuple metric_entity::retire_old_metrics(const old_metric_list &old_metrics) { + size_t num_retired = 0; + size_t num_potential = 0; if (old_metrics.empty()) { // Do not lock for empty list. - return; + return std::make_tuple(num_retired, num_potential); } auto now = dsn_now_ms(); @@ -254,14 +256,18 @@ void metric_entity::retire_old_metrics(const old_metric_list &old_metrics) // This metric is not in use, thus should be marked with a delay time for // retirement. iter->second->_retire_time_ms = now + FLAGS_metrics_retirement_delay_ms; + ++num_potential; continue; } if (dsn_likely(iter->second->_retire_time_ms >= now)) { // Retire the metric from the entity after the delay time. _metrics.erase(iter); + ++num_retired; } } + + return std::make_tuple(num_retired, num_potential); } void metric_filters::extract_entity_metrics(const metric_entity::metric_map &candidates, @@ -422,9 +428,7 @@ metric_registry::metric_registry() : _http_service(this) // metric_registry are still running but the resources they needed have been released. tools::shared_io_service::instance(); - _timer.reset(new metric_timer(FLAGS_metrics_retirement_delay_ms, - std::bind(&metric_registry::process_old_metrics(), this), - std::bind(&metric_registry::on_close, this))); + start_timer(); } metric_registry::~metric_registry() @@ -446,12 +450,33 @@ metric_registry::~metric_registry() entity.second->close(metric_entity::close_option::kNoWait); } - _timer->close(); - _timer->wait(); + stop_timer(); } void metric_registry::on_close() {} +void metric_registry::start_timer() +{ + if (_timer) { + return; + } + + _timer.reset(new metric_timer(FLAGS_metrics_retirement_delay_ms, + std::bind(&metric_registry::process_old_metrics(), this), + std::bind(&metric_registry::on_close, this))); +} + +void metric_registry::stop_timer() +{ + if (!_timer) { + return; + } + + _timer->close(); + _timer->wait(); + _timer->reset(); +} + metric_registry::entity_map metric_registry::entities() const { utils::auto_read_lock l(_lock); @@ -522,11 +547,14 @@ metric_registry::old_entity_map metric_registry::collect_old_metrics() const return old_entities; } -void metric_registry::retire_old_metrics(const old_entity_map &old_entities) +std::tuple metric_registry::retire_old_metrics(const old_entity_map &old_entities) { + size_t num_retired_entities = 0; + size_t num_retired_metrics = 0; + size_t num_potential_metrics = 0; if (old_entities.empty()) { // Do not lock for empty list. - return; + return std::make_tuple(num_retired_entities, num_retired_metrics, num_potential_metrics); } utils::auto_write_lock l(_lock); @@ -540,19 +568,39 @@ void metric_registry::retire_old_metrics(const old_entity_map &old_entities) // Try to retire the stale metrics from this entity. Notice that some entities // may have no metric in which case it will never be locked. - iter->second->retire_old_metrics(old_entity.second); + size_t num_retired; + size_t num_potential; + std::tie(num_retired, num_potential) = iter->second->retire_old_metrics(old_entity.second); + num_retired_metrics += num_retired; + num_potential_metrics += num_potential; if (is_entity_stale(iter->second)) { // The entity itself is stale and will be retired immediately, for the reason that // each metric in it has been delayed for a long enough time before retirement. _entities.erase(iter); + ++num_retired_entities; } } + + return std::make_tuple(num_retired_entities, num_retired_metrics, num_potential_metrics); } void metric_registry::process_old_metrics() { - retire_old_metrics(collect_old_metrics()); + LOG_INFO("begin to process old metrics"); + + size_t num_collected_metrics = 0; + const auto &old_entities = collect_old_metrics(); + for (const auto &old_entity : old_entities) { + num_collected_metrics += old_entity.second.size(); + } + LOG_INFO_F("collected_entities={}, collected_metrics={}", old_entities.size(), num_collected_metrics); + + size_t num_retired_entities; + size_t num_retired_metrics; + size_t num_potential_metrics; + std::tie(num_retired_entities, num_retired_metrics) = retire_old_metrics(old_entities); + LOG_INFO_F("retired_entities={}, retired_metrics={}, potential_metrics={}", num_retired_entities, num_retired_metrics, num_potential_metrics); } metric_prototype::metric_prototype(const ctor_args &args) : _args(args) {} diff --git a/src/utils/metrics.h b/src/utils/metrics.h index e507d42749..fc24fc6832 100644 --- a/src/utils/metrics.h +++ b/src/utils/metrics.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -204,7 +205,7 @@ class metric_entity : public ref_counter // The whole retirement process is divided two phases "collect" and "retire", please see // `collect_old_metrics()` and `retire_old_metrics()` of registry for details. old_metric_list collect_old_metrics() const; - void retire_old_metrics(const old_metric_list &old_metrics); + std::tuple retire_old_metrics(const old_metric_list &old_metrics); const metric_entity_prototype *const _prototype; const std::string _id; @@ -428,12 +429,16 @@ class metric_registry : public utils::singleton friend class utils::singleton; friend void test_get_metrics_handler(const http_request &req, http_response &resp); + friend void test_restart_metric_registry_timer(uint64_t interval_ms); metric_registry(); ~metric_registry(); void on_close(); + void start_timer(); + void stop_timer(); + metric_entity_ptr find_or_create_entity(const metric_entity_prototype *prototype, const std::string &id, const metric_entity::attr_map &attrs); @@ -448,7 +453,7 @@ class metric_registry : public utils::singleton // Once some metrics and entities are collected to be retired, in the second phase "retire", // they will be removed according to the specific rules. old_entity_map collect_old_metrics() const; - void retire_old_metrics(const old_entity_map &old_entities); + std::tuple retire_old_metrics(const old_entity_map &old_entities); void process_old_metrics(); mutable utils::rw_lock_nr _lock; diff --git a/src/utils/test/metrics_test.cpp b/src/utils/test/metrics_test.cpp index 9b12a61a6e..a8480364a0 100644 --- a/src/utils/test/metrics_test.cpp +++ b/src/utils/test/metrics_test.cpp @@ -31,6 +31,8 @@ namespace dsn { +DSN_DECLARE_uint64(metrics_retirement_delay_ms); + class my_gauge : public metric { public: @@ -2828,4 +2830,33 @@ TEST(metrics_test, http_get_metrics) } } +void test_restart_metric_registry_timer(uint64_t interval_ms) +{ + metric_registry::instance()::stop_timer(); + FLAGS_metrics_retirement_delay_ms = interval_ms; + metric_registry::instance()::start_timer(); +} + +TEST(metrics_test, retire_old_metrics) +{ + auto reserved_metrics_retirement_delay_ms = FLAGS_metrics_retirement_delay_ms; + test_restart_metric_registry_timer(100); + + auto my_entity = METRIC_ENTITY_my_server.instantiate("server_117"); + + auto my_gauge_int64 = METRIC_test_server_gauge_int64.instantiate(my_entity); + my_gauge_int64->set(5); + + auto my_counter = METRIC_test_server_counter.instantiate(my_entity); + my_counter->increment(); + + auto my_percentile_int64 = METRIC_test_server_percentile_int64.instantiate(my_entity); + my_percentile_int64->set(5); + + std::this_thread::sleep_for( + std::chrono::milliseconds(500)); + + test_restart_metric_registry_timer(reserved_metrics_retirement_delay_ms); +} + } // namespace dsn From 59677544816c6f78c213298903007346a5a666cd Mon Sep 17 00:00:00 2001 From: Dan Wang Date: Thu, 29 Dec 2022 18:26:27 +0800 Subject: [PATCH 06/17] feat(new_metrics): retire the old metrics and entities that are not in use --- src/utils/metrics.cpp | 82 +++++++++++++++++---------------- src/utils/metrics.h | 13 +++--- src/utils/test/metrics_test.cpp | 7 ++- 3 files changed, 51 insertions(+), 51 deletions(-) diff --git a/src/utils/metrics.cpp b/src/utils/metrics.cpp index 8e4c22c137..d9309b46b7 100644 --- a/src/utils/metrics.cpp +++ b/src/utils/metrics.cpp @@ -27,9 +27,11 @@ namespace dsn { -DSN_DEFINE_uint64("metrics", metrics_retirement_delay_ms, 10 * 60 * 1000, - "The minimum number of milliseconds a metric will be kept for after it is " - "no longer active."); +DSN_DEFINE_uint64("metrics", + metrics_retirement_delay_ms, + 10 * 60 * 1000, + "The minimum number of milliseconds a metric will be kept for after it is " + "no longer active."); metric_entity::metric_entity(const metric_entity_prototype *prototype, const std::string &id, @@ -47,12 +49,6 @@ metric_entity::~metric_entity() close(close_option::kWait); } -metric_entity::operator bool() const -{ - utils::auto_read_lock l(_lock); - return !_metrics.empty(); -} - void metric_entity::close(close_option option) { utils::auto_write_lock l(_lock); @@ -189,6 +185,15 @@ void metric_entity::take_snapshot(metric_json_writer &writer, const metric_filte writer.EndObject(); } +bool metric_entity::is_stale() const +{ + CHECK_GE(get_count(), 1); + + // The entity has no metric and there's only one reference for the entity that is kept in the + // registry. Thus this entity is not considered to be in use. + return _metrics.empty() && get_count() == 1; +} + metric_entity::old_metric_list metric_entity::collect_old_metrics() const { old_metric_list old_metrics; @@ -198,8 +203,8 @@ metric_entity::old_metric_list metric_entity::collect_old_metrics() const utils::auto_read_lock l(_lock); for (const auto &m : _metrics) { - if (!is_metric_stale(m.second)) { - if (m.second->retire_time_ms() > 0) { + if (!m.second->is_stale()) { + if (m.second->_retire_time_ms > 0) { // This metric has previously been scheduled to be retired. However, now // this metric is used again since its reference count is more than 1. // Thus its delay time for retirement should be cleared. @@ -208,8 +213,8 @@ metric_entity::old_metric_list metric_entity::collect_old_metrics() const continue; } - if (iter->second->retire_time_ms() > 0) { - if (iter->second->retire_time_ms() < now) { + if (m.second->_retire_time_ms > 0) { + if (m.second->_retire_time_ms < now) { // This metric has been scheduled to be retired, but retirement has not // taken effect. continue; @@ -242,7 +247,7 @@ std::tuple metric_entity::retire_old_metrics(const old_metric_li continue; } - if (!is_metric_stale(iter->second)) { + if (!iter->second->is_stale()) { if (iter->second->_retire_time_ms > 0) { // For those metrics which are still in use, their delay time for retirement // should be cleared since previously they could have been scheduled to be @@ -344,24 +349,6 @@ dsn::metric_filters::metric_fields_type get_brief_metric_fields() const dsn::metric_filters::metric_fields_type kBriefMetricFields = get_brief_metric_fields(); -bool is_metric_stale(const dsn::metric_ptr &m) -{ - CHECK_GE(m.get_count(), 1); - - // There's only one reference for this metric that is kept in the entity. Thus - // this metric is not considered to be in use. - return m.get_count() == 1; -} - -bool is_entity_stale(const dsn::metric_entity_ptr &entity) -{ - CHECK_GE(entity.second.get_count(), 1); - - // The entity has no metric and there's only one reference for the entity that is kept in the - // registry. Thus this entity is not considered to be in use. - return *entity && entity.get_count() == 1; -} - } // anonymous namespace void metrics_http_service::get_metrics_handler(const http_request &req, http_response &resp) @@ -462,7 +449,7 @@ void metric_registry::start_timer() } _timer.reset(new metric_timer(FLAGS_metrics_retirement_delay_ms, - std::bind(&metric_registry::process_old_metrics(), this), + std::bind(&metric_registry::process_old_metrics, this), std::bind(&metric_registry::on_close, this))); } @@ -474,7 +461,7 @@ void metric_registry::stop_timer() _timer->close(); _timer->wait(); - _timer->reset(); + _timer.reset(); } metric_registry::entity_map metric_registry::entities() const @@ -537,7 +524,7 @@ metric_registry::old_entity_map metric_registry::collect_old_metrics() const continue; } - if (is_entity_stale(entity.second)) { + if (entity.second->is_stale()) { // Since this entity itself is stale, it will be collected without any // metric. (void)old_entities[entity.first]; @@ -547,7 +534,8 @@ metric_registry::old_entity_map metric_registry::collect_old_metrics() const return old_entities; } -std::tuple metric_registry::retire_old_metrics(const old_entity_map &old_entities) +std::tuple +metric_registry::retire_old_metrics(const old_entity_map &old_entities) { size_t num_retired_entities = 0; size_t num_retired_metrics = 0; @@ -574,7 +562,7 @@ std::tuple metric_registry::retire_old_metrics(const old num_retired_metrics += num_retired; num_potential_metrics += num_potential; - if (is_entity_stale(iter->second)) { + if (iter->second->is_stale()) { // The entity itself is stale and will be retired immediately, for the reason that // each metric in it has been delayed for a long enough time before retirement. _entities.erase(iter); @@ -594,13 +582,18 @@ void metric_registry::process_old_metrics() for (const auto &old_entity : old_entities) { num_collected_metrics += old_entity.second.size(); } - LOG_INFO_F("collected_entities={}, collected_metrics={}", old_entities.size(), num_collected_metrics); + LOG_INFO_F( + "collected_entities={}, collected_metrics={}", old_entities.size(), num_collected_metrics); size_t num_retired_entities; size_t num_retired_metrics; size_t num_potential_metrics; - std::tie(num_retired_entities, num_retired_metrics) = retire_old_metrics(old_entities); - LOG_INFO_F("retired_entities={}, retired_metrics={}, potential_metrics={}", num_retired_entities, num_retired_metrics, num_potential_metrics); + std::tie(num_retired_entities, num_retired_metrics, num_potential_metrics) = + retire_old_metrics(old_entities); + LOG_INFO_F("retired_entities={}, retired_metrics={}, potential_metrics={}", + num_retired_entities, + num_retired_metrics, + num_potential_metrics); } metric_prototype::metric_prototype(const ctor_args &args) : _args(args) {} @@ -609,6 +602,15 @@ metric_prototype::~metric_prototype() {} metric::metric(const metric_prototype *prototype) : _prototype(prototype), _retire_time_ms(0) {} +bool metric::is_stale() const +{ + CHECK_GE(get_count(), 1); + + // There's only one reference for this metric that is kept in the entity. Thus + // this metric is not considered to be in use. + return get_count() == 1; +} + closeable_metric::closeable_metric(const metric_prototype *prototype) : metric(prototype) {} uint64_t metric_timer::generate_initial_delay_ms(uint64_t interval_ms) diff --git a/src/utils/metrics.h b/src/utils/metrics.h index fc24fc6832..0f8b3cfd98 100644 --- a/src/utils/metrics.h +++ b/src/utils/metrics.h @@ -180,9 +180,6 @@ class metric_entity : public ref_counter ~metric_entity(); - // Return true if the stored metrics are not empty. - explicit operator bool() const; - // Close all "closeable" metrics owned by this entity. // // `option` is used to control how the close operations are performed: @@ -202,6 +199,8 @@ class metric_entity : public ref_counter void encode_id(metric_json_writer &writer) const; + bool is_stale() const; + // The whole retirement process is divided two phases "collect" and "retire", please see // `collect_old_metrics()` and `retire_old_metrics()` of registry for details. old_metric_list collect_old_metrics() const; @@ -606,8 +605,6 @@ class metric : public ref_counter public: const metric_prototype *prototype() const { return _prototype; } - uint64_t retire_time_ms() const { return _retire_time_ms; } - // Take snapshot of each metric to collect current values as json format with fields chosen // by `filters`. virtual void take_snapshot(metric_json_writer &writer, const metric_filters &filters) = 0; @@ -676,11 +673,13 @@ class metric : public ref_counter const metric_prototype *const _prototype; - uint64_t _retire_time_ms; - private: friend class metric_entity; + bool is_stale() const; + + uint64_t _retire_time_ms; + DISALLOW_COPY_AND_ASSIGN(metric); }; diff --git a/src/utils/test/metrics_test.cpp b/src/utils/test/metrics_test.cpp index a8480364a0..c7ad5dbbcf 100644 --- a/src/utils/test/metrics_test.cpp +++ b/src/utils/test/metrics_test.cpp @@ -2832,9 +2832,9 @@ TEST(metrics_test, http_get_metrics) void test_restart_metric_registry_timer(uint64_t interval_ms) { - metric_registry::instance()::stop_timer(); + metric_registry::instance().stop_timer(); FLAGS_metrics_retirement_delay_ms = interval_ms; - metric_registry::instance()::start_timer(); + metric_registry::instance().start_timer(); } TEST(metrics_test, retire_old_metrics) @@ -2853,8 +2853,7 @@ TEST(metrics_test, retire_old_metrics) auto my_percentile_int64 = METRIC_test_server_percentile_int64.instantiate(my_entity); my_percentile_int64->set(5); - std::this_thread::sleep_for( - std::chrono::milliseconds(500)); + std::this_thread::sleep_for(std::chrono::milliseconds(500)); test_restart_metric_registry_timer(reserved_metrics_retirement_delay_ms); } From 0cf2c9c776a47cb380b51a9844a1d3fca2040c6e Mon Sep 17 00:00:00 2001 From: Dan Wang Date: Fri, 30 Dec 2022 18:51:27 +0800 Subject: [PATCH 07/17] feat(new_metrics): retire the old metrics and entities that are not in use --- src/utils/test/metrics_test.cpp | 142 +++++++++++++++++++++++++++++--- 1 file changed, 132 insertions(+), 10 deletions(-) diff --git a/src/utils/test/metrics_test.cpp b/src/utils/test/metrics_test.cpp index c7ad5dbbcf..d767288525 100644 --- a/src/utils/test/metrics_test.cpp +++ b/src/utils/test/metrics_test.cpp @@ -2837,24 +2837,146 @@ void test_restart_metric_registry_timer(uint64_t interval_ms) metric_registry::instance().start_timer(); } -TEST(metrics_test, retire_old_metrics) +class scoped_entity { - auto reserved_metrics_retirement_delay_ms = FLAGS_metrics_retirement_delay_ms; - test_restart_metric_registry_timer(100); +public: + using surviving_metric_map = std::unordered_map; + + scoped_entity(const std::string &entity_id, + bool is_entity_surviving, + bool is_gauge_surviving, + bool is_counter_surviving, + bool is_percentile_surviving); + + void test_survival_after_retirement() const; + +private: + template + void instantiate_metric(bool is_surviving, const MetricPrototype &prototype, MetricPtr &m) + { + CHECK_NOTNULL(_my_entity, + "entity {} of {} should also set to be surviving since {} is surviving", + _my_entity->id(), + _my_entity->prototype()->name(), + prototype.name().data()); + + auto temp_m = prototype.instantiate(_my_entity); + _expected_all_metrics.emplace(&prototype, temp_m.get()); + + if (is_surviving) { + m = temp_m; + _expected_surviving_metrics.emplace(&prototype, m.get()); + } + } + + surviving_metric_map get_actual_surviving_metrics(const metric_entity_ptr &entity) const; + void test_survival_immediately_after_initialization() const; + + std::string _entity_id; + metric_entity *_expected_entity_raw_ptr; + metric_entity_ptr _my_entity; - auto my_entity = METRIC_ENTITY_my_server.instantiate("server_117"); + gauge_ptr _my_gauge_int64; + counter_ptr<> _my_counter; + percentile_ptr _my_percentile_int64; - auto my_gauge_int64 = METRIC_test_server_gauge_int64.instantiate(my_entity); - my_gauge_int64->set(5); + surviving_metric_map _expected_all_metrics; + surviving_metric_map _expected_surviving_metrics; +}; + +scoped_entity::scoped_entity(const std::string &entity_id, + bool is_entity_surviving, + bool is_gauge_surviving, + bool is_counter_surviving, + bool is_percentile_surviving) + : _entity_id(entity_id) +{ + auto my_entity = METRIC_ENTITY_my_server.instantiate(entity_id); + _expected_entity_raw_ptr = my_entity.get(); + if (is_entity_surviving) { + _my_entity = my_entity; + } - auto my_counter = METRIC_test_server_counter.instantiate(my_entity); - my_counter->increment(); + instantiate_metric(is_gauge_surviving, METRIC_test_server_gauge_int64, _my_gauge_int64); + instantiate_metric(is_counter_surviving, METRIC_test_server_counter, _my_counter); + instantiate_metric( + is_percentile_surviving, METRIC_test_server_percentile_int64, _my_percentile_int64); - auto my_percentile_int64 = METRIC_test_server_percentile_int64.instantiate(my_entity); - my_percentile_int64->set(5); + test_survival_immediately_after_initialization(); +} + +scoped_entity::surviving_metric_map +scoped_entity::get_actual_surviving_metrics(const metric_entity_ptr &entity) const +{ + surviving_metric_map actual_surviving_metrics; + const auto &metrics = entity->metrics(); + for (const auto &m : metrics) { + actual_surviving_metrics.emplace(m.first, m.second.get()); + } + + return actual_surviving_metrics; +} + +void scoped_entity::test_survival_immediately_after_initialization() const +{ + const auto &entities = metric_registry::instance().entities(); + auto iter = entities.find(_entity_id); + ASSERT_NE(entities.end(), iter); + ASSERT_EQ(_expected_entity_raw_ptr, iter->second.get()); + + const auto &actual_surviving_metrics = get_actual_surviving_metrics(iter->second); + ASSERT_EQ(_expected_all_metrics, actual_surviving_metrics); +} + +void scoped_entity::test_survival_after_retirement() const +{ std::this_thread::sleep_for(std::chrono::milliseconds(500)); + const auto &entities = metric_registry::instance().entities(); + auto iter = entities.find(_entity_id); + if (_my_entity == nullptr) { + ASSERT_EQ(entities.end(), iter); + ASSERT_TRUE(_expected_surviving_metrics.empty()); + return; + } + + ASSERT_NE(entities.end(), iter); + ASSERT_EQ(_expected_entity_raw_ptr, iter->second.get()); + + const auto &actual_surviving_metrics = get_actual_surviving_metrics(iter->second); + ASSERT_EQ(_expected_surviving_metrics, actual_surviving_metrics); +} + +TEST(metrics_test, retire_old_metrics) +{ + auto reserved_metrics_retirement_delay_ms = FLAGS_metrics_retirement_delay_ms; + test_restart_metric_registry_timer(100); + + struct test_case + { + std::string entity_id; + bool is_entity_surviving; + bool is_gauge_surviving; + bool is_counter_surviving; + bool is_percentile_surviving; + } tests[] = { + {"server_117", true, true, true, true}, + {"server_118", true, true, true, false}, + {"server_119", true, true, false, false}, + {"server_120", true, false, false, false}, + {"server_121", false, false, false, false}, + }; + + for (const auto &test : tests) { + scoped_entity entity(test.entity_id, + test.is_entity_surviving, + test.is_gauge_surviving, + test.is_counter_surviving, + test.is_percentile_surviving); + entity.test_survival_after_retirement(); + } + test_restart_metric_registry_timer(reserved_metrics_retirement_delay_ms); } From b48b7b6ae6cc85d6b82b968fd9a57f5de2da1b13 Mon Sep 17 00:00:00 2001 From: Dan Wang Date: Tue, 3 Jan 2023 23:58:12 +0800 Subject: [PATCH 08/17] feat(new_metrics): retire the old metrics and entities that are not in use --- src/utils/metrics.cpp | 73 ++++++++++++++++++++++++--------- src/utils/metrics.h | 2 +- src/utils/test/metrics_test.cpp | 53 +++++++++++++----------- 3 files changed, 83 insertions(+), 45 deletions(-) diff --git a/src/utils/metrics.cpp b/src/utils/metrics.cpp index d9309b46b7..6d2309f6b8 100644 --- a/src/utils/metrics.cpp +++ b/src/utils/metrics.cpp @@ -49,7 +49,27 @@ metric_entity::~metric_entity() close(close_option::kWait); } -void metric_entity::close(close_option option) +namespace { + +void close_metric(const dsn::metric_ptr &m) +{ + if (m->prototype()->type() == dsn::metric_type::kPercentile) { + auto p = dsn::down_cast(m.get()); + p->close(); + } +} + +void wait_metric(const dsn::metric_ptr &m) +{ + if (m->prototype()->type() == dsn::metric_type::kPercentile) { + auto p = dsn::down_cast(m.get()); + p->wait(); + } +} + +} // anonymous namespace + +void metric_entity::close(close_option option) const { utils::auto_write_lock l(_lock); @@ -61,10 +81,7 @@ void metric_entity::close(close_option option) // request for all metrics; then, just wait for all of the close operations to be finished. // It's inefficient to wait for each metric to be closed one by one. for (auto &m : _metrics) { - if (m.second->prototype()->type() == metric_type::kPercentile) { - auto p = down_cast(m.second.get()); - p->close(); - } + close_metric(m.second); } if (option == close_option::kNoWait) { @@ -73,10 +90,7 @@ void metric_entity::close(close_option option) // Wait for all of the close operations to be finished. for (auto &m : _metrics) { - if (m.second->prototype()->type() == metric_type::kPercentile) { - auto p = down_cast(m.second.get()); - p->wait(); - } + wait_metric(m.second); } } @@ -200,6 +214,7 @@ metric_entity::old_metric_list metric_entity::collect_old_metrics() const auto now = dsn_now_ms(); + LOG_INFO_F("in metric_entity::collect_old_metrics for entity {}", _id); utils::auto_read_lock l(_lock); for (const auto &m : _metrics) { @@ -209,19 +224,19 @@ metric_entity::old_metric_list metric_entity::collect_old_metrics() const // this metric is used again since its reference count is more than 1. // Thus its delay time for retirement should be cleared. old_metrics.insert(m.first); + LOG_INFO_F("change old metric {} to new", m.second->prototype()->name().data()); } continue; } - if (m.second->_retire_time_ms > 0) { - if (m.second->_retire_time_ms < now) { - // This metric has been scheduled to be retired, but retirement has not - // taken effect. - continue; - } + if (m.second->_retire_time_ms > now) { + // This metric has been scheduled to be retired, but retirement has not + // taken effect. + continue; } old_metrics.insert(m.first); + LOG_INFO_F("add metric {} to retire", m.second->prototype()->name().data()); } return old_metrics; @@ -238,16 +253,19 @@ std::tuple metric_entity::retire_old_metrics(const old_metric_li auto now = dsn_now_ms(); + LOG_INFO_F("in metric_entity::retire_old_metrics for entity {}", _id); utils::auto_write_lock l(_lock); for (const auto &m : old_metrics) { auto iter = _metrics.find(m); if (iter == _metrics.end()) { // This metric has been removed from the entity. + LOG_INFO_F("metric {} not found", m->name().data()); continue; } if (!iter->second->is_stale()) { + LOG_INFO_F("not stale for metric {} while count={}, retire_time_ms={}", iter->second->prototype()->name().data(), iter->second->get_count(), iter->second->_retire_time_ms); if (iter->second->_retire_time_ms > 0) { // For those metrics which are still in use, their delay time for retirement // should be cleared since previously they could have been scheduled to be @@ -257,6 +275,12 @@ std::tuple metric_entity::retire_old_metrics(const old_metric_li continue; } + LOG_INFO_F("is stale for metric {} while count={}, retire_time_ms={}", iter->second->prototype()->name().data(), iter->second->get_count(), iter->second->_retire_time_ms); + + if (dsn_unlikely(iter->second->_retire_time_ms > now)) { + continue; + } + if (iter->second->_retire_time_ms == 0) { // This metric is not in use, thus should be marked with a delay time for // retirement. @@ -265,11 +289,13 @@ std::tuple metric_entity::retire_old_metrics(const old_metric_li continue; } - if (dsn_likely(iter->second->_retire_time_ms >= now)) { - // Retire the metric from the entity after the delay time. - _metrics.erase(iter); - ++num_retired; - } + close_metric(iter->second); + wait_metric(iter->second); + + // Retire the metric from the entity after the delay time. + _metrics.erase(iter); + LOG_INFO_F("retire metric {}", iter->second->prototype()->name().data()); + ++num_retired; } return std::make_tuple(num_retired, num_potential); @@ -521,6 +547,7 @@ metric_registry::old_entity_map metric_registry::collect_old_metrics() const // Those entities which have stale metrics that should be retired will be // collected. old_entities.emplace(entity.first, std::move(old_metrics)); + LOG_INFO_F("collect entity {} for metrics", entity.first); continue; } @@ -528,6 +555,7 @@ metric_registry::old_entity_map metric_registry::collect_old_metrics() const // Since this entity itself is stale, it will be collected without any // metric. (void)old_entities[entity.first]; + LOG_INFO_F("collect entity {} for stale", entity.first); } } @@ -566,6 +594,7 @@ metric_registry::retire_old_metrics(const old_entity_map &old_entities) // The entity itself is stale and will be retired immediately, for the reason that // each metric in it has been delayed for a long enough time before retirement. _entities.erase(iter); + LOG_INFO_F("retire entity {}", iter->second->id()); ++num_retired_entities; } } @@ -606,6 +635,10 @@ bool metric::is_stale() const { CHECK_GE(get_count(), 1); + if (_prototype->type() == metric_type::kPercentile && get_count() == 2) { + return true; + } + // There's only one reference for this metric that is kept in the entity. Thus // this metric is not considered to be in use. return get_count() == 1; diff --git a/src/utils/metrics.h b/src/utils/metrics.h index 0f8b3cfd98..4d4067266f 100644 --- a/src/utils/metrics.h +++ b/src/utils/metrics.h @@ -191,7 +191,7 @@ class metric_entity : public ref_counter kWait, kNoWait, }; - void close(close_option option); + void close(close_option option) const; void set_attributes(const attr_map &attrs); diff --git a/src/utils/test/metrics_test.cpp b/src/utils/test/metrics_test.cpp index d767288525..bbc416c276 100644 --- a/src/utils/test/metrics_test.cpp +++ b/src/utils/test/metrics_test.cpp @@ -2852,28 +2852,26 @@ class scoped_entity private: template - void instantiate_metric(bool is_surviving, const MetricPrototype &prototype, MetricPtr &m) + void instantiate_metric(const metric_entity_ptr &my_entity, + bool is_surviving, + const MetricPrototype &prototype, + MetricPtr &m) { - CHECK_NOTNULL(_my_entity, - "entity {} of {} should also set to be surviving since {} is surviving", - _my_entity->id(), - _my_entity->prototype()->name(), - prototype.name().data()); - - auto temp_m = prototype.instantiate(_my_entity); + auto temp_m = prototype.instantiate(my_entity); _expected_all_metrics.emplace(&prototype, temp_m.get()); if (is_surviving) { m = temp_m; _expected_surviving_metrics.emplace(&prototype, m.get()); + std::cout << "add expected surviving metrics: " << prototype.name() << std::endl; } } - surviving_metric_map get_actual_surviving_metrics(const metric_entity_ptr &entity) const; + surviving_metric_map get_actual_surviving_metrics(const metric_entity_ptr &my_entity) const; void test_survival_immediately_after_initialization() const; - std::string _entity_id; - metric_entity *_expected_entity_raw_ptr; + std::string _my_entity_id; + metric_entity *_expected_my_entity_raw_ptr; metric_entity_ptr _my_entity; gauge_ptr _my_gauge_int64; @@ -2889,31 +2887,38 @@ scoped_entity::scoped_entity(const std::string &entity_id, bool is_gauge_surviving, bool is_counter_surviving, bool is_percentile_surviving) - : _entity_id(entity_id) + : _my_entity_id(entity_id) { auto my_entity = METRIC_ENTITY_my_server.instantiate(entity_id); - _expected_entity_raw_ptr = my_entity.get(); + _expected_my_entity_raw_ptr = my_entity.get(); + + instantiate_metric( + my_entity, is_gauge_surviving, METRIC_test_server_gauge_int64, _my_gauge_int64); + instantiate_metric(my_entity, is_counter_surviving, METRIC_test_server_counter, _my_counter); + instantiate_metric(my_entity, + is_percentile_surviving, + METRIC_test_server_percentile_int64, + _my_percentile_int64); + std::cout << std::endl; + if (is_entity_surviving) { _my_entity = my_entity; } - instantiate_metric(is_gauge_surviving, METRIC_test_server_gauge_int64, _my_gauge_int64); - instantiate_metric(is_counter_surviving, METRIC_test_server_counter, _my_counter); - instantiate_metric( - is_percentile_surviving, METRIC_test_server_percentile_int64, _my_percentile_int64); - test_survival_immediately_after_initialization(); } scoped_entity::surviving_metric_map -scoped_entity::get_actual_surviving_metrics(const metric_entity_ptr &entity) const +scoped_entity::get_actual_surviving_metrics(const metric_entity_ptr &my_entity) const { surviving_metric_map actual_surviving_metrics; - const auto &metrics = entity->metrics(); + const auto &metrics = my_entity->metrics(); for (const auto &m : metrics) { actual_surviving_metrics.emplace(m.first, m.second.get()); + std::cout << "add actual surviving metrics: " << m.first->name() << std::endl; } + std::cout << std::endl; return actual_surviving_metrics; } @@ -2921,9 +2926,9 @@ scoped_entity::get_actual_surviving_metrics(const metric_entity_ptr &entity) con void scoped_entity::test_survival_immediately_after_initialization() const { const auto &entities = metric_registry::instance().entities(); - auto iter = entities.find(_entity_id); + const auto &iter = entities.find(_my_entity_id); ASSERT_NE(entities.end(), iter); - ASSERT_EQ(_expected_entity_raw_ptr, iter->second.get()); + ASSERT_EQ(_expected_my_entity_raw_ptr, iter->second.get()); const auto &actual_surviving_metrics = get_actual_surviving_metrics(iter->second); ASSERT_EQ(_expected_all_metrics, actual_surviving_metrics); @@ -2934,7 +2939,7 @@ void scoped_entity::test_survival_after_retirement() const std::this_thread::sleep_for(std::chrono::milliseconds(500)); const auto &entities = metric_registry::instance().entities(); - auto iter = entities.find(_entity_id); + const auto &iter = entities.find(_my_entity_id); if (_my_entity == nullptr) { ASSERT_EQ(entities.end(), iter); ASSERT_TRUE(_expected_surviving_metrics.empty()); @@ -2942,7 +2947,7 @@ void scoped_entity::test_survival_after_retirement() const } ASSERT_NE(entities.end(), iter); - ASSERT_EQ(_expected_entity_raw_ptr, iter->second.get()); + ASSERT_EQ(_expected_my_entity_raw_ptr, iter->second.get()); const auto &actual_surviving_metrics = get_actual_surviving_metrics(iter->second); ASSERT_EQ(_expected_surviving_metrics, actual_surviving_metrics); From 2ec92350eee2c153f4ea88938971110b640caf82 Mon Sep 17 00:00:00 2001 From: Dan Wang Date: Wed, 4 Jan 2023 01:45:01 +0800 Subject: [PATCH 09/17] feat(new_metrics): retire the old metrics and entities that are not in use --- src/utils/metrics.cpp | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/utils/metrics.cpp b/src/utils/metrics.cpp index 6d2309f6b8..10395dc198 100644 --- a/src/utils/metrics.cpp +++ b/src/utils/metrics.cpp @@ -51,6 +51,9 @@ metric_entity::~metric_entity() namespace { +// Before destructing or retiring a metric, close it first. For example, stop the timer of +// percentile. Generally close operations in this function are asynchronous, wait_metric() +// can be used to wait until all close operations are finished. void close_metric(const dsn::metric_ptr &m) { if (m->prototype()->type() == dsn::metric_type::kPercentile) { @@ -59,6 +62,7 @@ void close_metric(const dsn::metric_ptr &m) } } +// Wait until all operations in close_metric() are finished. void wait_metric(const dsn::metric_ptr &m) { if (m->prototype()->type() == dsn::metric_type::kPercentile) { @@ -201,10 +205,11 @@ void metric_entity::take_snapshot(metric_json_writer &writer, const metric_filte bool metric_entity::is_stale() const { + // Since this entity itself is still be accessed, its reference count should be 1 at least. CHECK_GE(get_count(), 1); - // The entity has no metric and there's only one reference for the entity that is kept in the - // registry. Thus this entity is not considered to be in use. + // Once this entity does not have any metric, and has only one reference that must be + // kept in the registry, this entity will be considered to be useless. return _metrics.empty() && get_count() == 1; } @@ -265,7 +270,10 @@ std::tuple metric_entity::retire_old_metrics(const old_metric_li } if (!iter->second->is_stale()) { - LOG_INFO_F("not stale for metric {} while count={}, retire_time_ms={}", iter->second->prototype()->name().data(), iter->second->get_count(), iter->second->_retire_time_ms); + LOG_INFO_F("not stale for metric {} while count={}, retire_time_ms={}", + iter->second->prototype()->name().data(), + iter->second->get_count(), + iter->second->_retire_time_ms); if (iter->second->_retire_time_ms > 0) { // For those metrics which are still in use, their delay time for retirement // should be cleared since previously they could have been scheduled to be @@ -275,7 +283,10 @@ std::tuple metric_entity::retire_old_metrics(const old_metric_li continue; } - LOG_INFO_F("is stale for metric {} while count={}, retire_time_ms={}", iter->second->prototype()->name().data(), iter->second->get_count(), iter->second->_retire_time_ms); + LOG_INFO_F("is stale for metric {} while count={}, retire_time_ms={}", + iter->second->prototype()->name().data(), + iter->second->get_count(), + iter->second->_retire_time_ms); if (dsn_unlikely(iter->second->_retire_time_ms > now)) { continue; From 9d77373fd8ea0bf9af7abd20db8222edc371f657 Mon Sep 17 00:00:00 2001 From: Dan Wang Date: Thu, 5 Jan 2023 19:14:19 +0800 Subject: [PATCH 10/17] feat(new_metrics): retire the old metrics and entities that are not in use --- src/utils/metrics.cpp | 174 ++++++++++++++++++-------------- src/utils/metrics.h | 50 +++++++-- src/utils/test/metrics_test.cpp | 96 ++++++++++++------ 3 files changed, 204 insertions(+), 116 deletions(-) diff --git a/src/utils/metrics.cpp b/src/utils/metrics.cpp index 10395dc198..1d57ad9c5c 100644 --- a/src/utils/metrics.cpp +++ b/src/utils/metrics.cpp @@ -205,17 +205,17 @@ void metric_entity::take_snapshot(metric_json_writer &writer, const metric_filte bool metric_entity::is_stale() const { - // Since this entity itself is still be accessed, its reference count should be 1 at least. + // Since this entity itself is still being accessed, its reference count should be 1 at least. CHECK_GE(get_count(), 1); - // Once this entity does not have any metric, and has only one reference that must be - // kept in the registry, this entity will be considered to be useless. + // Once this entity did not have any metric, and had the only one reference that is + // kept in the registry, this entity would be considered useless. return _metrics.empty() && get_count() == 1; } -metric_entity::old_metric_list metric_entity::collect_old_metrics() const +metric_entity::collected_old_metrics_info metric_entity::collect_old_metrics() const { - old_metric_list old_metrics; + collected_old_metrics_info collected_info; auto now = dsn_now_ms(); @@ -225,37 +225,40 @@ metric_entity::old_metric_list metric_entity::collect_old_metrics() const for (const auto &m : _metrics) { if (!m.second->is_stale()) { if (m.second->_retire_time_ms > 0) { - // This metric has previously been scheduled to be retired. However, now - // this metric is used again since its reference count is more than 1. - // Thus its delay time for retirement should be cleared. - old_metrics.insert(m.first); + // Previously this metric must have been scheduled to be retired. However, + // since this metric is reemployed now, its scheduled time for retirement + // should be reset to 0. + collected_info.old_metrics.insert(m.first); LOG_INFO_F("change old metric {} to new", m.second->prototype()->name().data()); } continue; } if (m.second->_retire_time_ms > now) { - // This metric has been scheduled to be retired, but retirement has not - // taken effect. + // This metric has been scheduled to be retired, however it is still within + // the retention interval. Thus do not collect it. + ++collected_info.num_scheduled_metrics; continue; } - old_metrics.insert(m.first); + collected_info.old_metrics.insert(m.first); LOG_INFO_F("add metric {} to retire", m.second->prototype()->name().data()); } - return old_metrics; + collected_info.num_all_metrics = _metrics.size(); + return collected_info; } -std::tuple metric_entity::retire_old_metrics(const old_metric_list &old_metrics) +metric_entity::retired_metrics_stat +metric_entity::retire_old_metrics(const old_metric_list &old_metrics) { - size_t num_retired = 0; - size_t num_potential = 0; if (old_metrics.empty()) { // Do not lock for empty list. - return std::make_tuple(num_retired, num_potential); + return retired_metrics_stat(); } + retired_metrics_stat retired_stat; + auto now = dsn_now_ms(); LOG_INFO_F("in metric_entity::retire_old_metrics for entity {}", _id); @@ -263,8 +266,8 @@ std::tuple metric_entity::retire_old_metrics(const old_metric_li for (const auto &m : old_metrics) { auto iter = _metrics.find(m); - if (iter == _metrics.end()) { - // This metric has been removed from the entity. + if (dsn_unlikely(iter == _metrics.end())) { + // This metric has been removed from the entity for some reason. LOG_INFO_F("metric {} not found", m->name().data()); continue; } @@ -275,10 +278,11 @@ std::tuple metric_entity::retire_old_metrics(const old_metric_li iter->second->get_count(), iter->second->_retire_time_ms); if (iter->second->_retire_time_ms > 0) { - // For those metrics which are still in use, their delay time for retirement - // should be cleared since previously they could have been scheduled to be - // retired. + // For those metrics which are still in use, their scheduled time for + // retirement should be reset to 0 though previously they could have + // been scheduled to be retired. iter->second->_retire_time_ms = 0; + ++retired_stat.num_reemployed_metrics; } continue; } @@ -289,27 +293,31 @@ std::tuple metric_entity::retire_old_metrics(const old_metric_li iter->second->_retire_time_ms); if (dsn_unlikely(iter->second->_retire_time_ms > now)) { + // Since in collect_old_metrics() we've filtered the metrics which have been + // outside the retention interval, this is unlikely to happen. However, we + // still check here for some special cases. continue; } if (iter->second->_retire_time_ms == 0) { - // This metric is not in use, thus should be marked with a delay time for - // retirement. + // This metric has already not been in use, thus should be marked with a + // scheduled time for retirement. iter->second->_retire_time_ms = now + FLAGS_metrics_retirement_delay_ms; - ++num_potential; + ++retired_stat.num_recently_scheduled_metrics; continue; } + // Once this metric is outside the retention interval, close it synchronously first. close_metric(iter->second); wait_metric(iter->second); - // Retire the metric from the entity after the delay time. + // Then retire it from the entity. _metrics.erase(iter); + ++retired_stat.num_retired_metrics; LOG_INFO_F("retire metric {}", iter->second->prototype()->name().data()); - ++num_retired; } - return std::make_tuple(num_retired, num_potential); + return retired_stat; } void metric_filters::extract_entity_metrics(const metric_entity::metric_map &candidates, @@ -546,71 +554,79 @@ metric_entity_ptr metric_registry::find_or_create_entity(const metric_entity_pro return entity; } -metric_registry::old_entity_map metric_registry::collect_old_metrics() const +metric_registry::collected_old_entities_info metric_registry::collect_old_metrics() const { - old_entity_map old_entities; + collected_old_entities_info entities_info; utils::auto_read_lock l(_lock); for (const auto &entity : _entities) { - const auto &old_metrics = entity.second->collect_old_metrics(); - if (!old_metrics.empty()) { - // Those entities which have stale metrics that should be retired will be - // collected. - old_entities.emplace(entity.first, std::move(old_metrics)); + const auto &metrics_info = entity.second->collect_old_metrics(); + + entities_info.num_all_metrics += metrics_info.num_all_metrics; + entities_info.num_scheduled_metrics += metrics_info.num_scheduled_metrics; + if (!metrics_info.old_metrics.empty()) { + // Those entities which have metrics that should be retired will be collected. + entities_info.old_entities.emplace(entity.first, std::move(metrics_info.old_metrics)); LOG_INFO_F("collect entity {} for metrics", entity.first); continue; } - if (entity.second->is_stale()) { - // Since this entity itself is stale, it will be collected without any - // metric. - (void)old_entities[entity.first]; - LOG_INFO_F("collect entity {} for stale", entity.first); + if (!entity.second->is_stale()) { + continue; } + + // Since this entity itself should be retired, it will be collected without any + // metric. Actually it has already not had any metric itself. + (void)entities_info.old_entities[entity.first]; + LOG_INFO_F("collect entity {} for stale", entity.first); } - return old_entities; + entities_info.num_all_entities = _entities.size(); + return entities_info; } -std::tuple +metric_registry::retired_entities_stat metric_registry::retire_old_metrics(const old_entity_map &old_entities) { - size_t num_retired_entities = 0; - size_t num_retired_metrics = 0; - size_t num_potential_metrics = 0; if (old_entities.empty()) { // Do not lock for empty list. - return std::make_tuple(num_retired_entities, num_retired_metrics, num_potential_metrics); + return retired_entities_stat(); } + retired_entities_stat entities_stat; + utils::auto_write_lock l(_lock); for (const auto &old_entity : old_entities) { auto iter = _entities.find(old_entity.first); - if (iter == _entities.end()) { - // This entity has been removed from the registry. + if (dsn_unlikely(iter == _entities.end())) { + // This entity has been removed from the registry for some reason. continue; } - // Try to retire the stale metrics from this entity. Notice that some entities - // may have no metric in which case it will never be locked. - size_t num_retired; - size_t num_potential; - std::tie(num_retired, num_potential) = iter->second->retire_old_metrics(old_entity.second); - num_retired_metrics += num_retired; - num_potential_metrics += num_potential; - - if (iter->second->is_stale()) { - // The entity itself is stale and will be retired immediately, for the reason that - // each metric in it has been delayed for a long enough time before retirement. - _entities.erase(iter); - LOG_INFO_F("retire entity {}", iter->second->id()); - ++num_retired_entities; + // Try to retire metrics from this entity. Note that since the entities that should + // be retired wouldn't have any metric, they will never get locked in + // metric_entity::retire_old_metrics(). + const auto &metrics_stat = iter->second->retire_old_metrics(old_entity.second); + + entities_stat.num_retired_metrics += metrics_stat.num_retired_metrics; + entities_stat.num_recently_scheduled_metrics += metrics_stat.num_recently_scheduled_metrics; + entities_stat.num_reemployed_metrics += metrics_stat.num_reemployed_metrics; + + if (!iter->second->is_stale()) { + continue; } + + // Once the entity itself should be retired, it will be retired immediately without + // waiting for any retention interval, for the reason that each metric in it has + // been delayed for a long enough time before retirement. + _entities.erase(iter); + ++entities_stat.num_retired_entities; + LOG_INFO_F("retire entity {}", iter->second->id()); } - return std::make_tuple(num_retired_entities, num_retired_metrics, num_potential_metrics); + return entities_stat; } void metric_registry::process_old_metrics() @@ -618,22 +634,26 @@ void metric_registry::process_old_metrics() LOG_INFO("begin to process old metrics"); size_t num_collected_metrics = 0; - const auto &old_entities = collect_old_metrics(); - for (const auto &old_entity : old_entities) { + const auto &collected_info = collect_old_metrics(); + for (const auto &old_entity : collected_info.old_entities) { num_collected_metrics += old_entity.second.size(); } - LOG_INFO_F( - "collected_entities={}, collected_metrics={}", old_entities.size(), num_collected_metrics); - - size_t num_retired_entities; - size_t num_retired_metrics; - size_t num_potential_metrics; - std::tie(num_retired_entities, num_retired_metrics, num_potential_metrics) = - retire_old_metrics(old_entities); - LOG_INFO_F("retired_entities={}, retired_metrics={}, potential_metrics={}", - num_retired_entities, - num_retired_metrics, - num_potential_metrics); + + const auto &retired_stat = retire_old_metrics(collected_info.old_entities); + + LOG_INFO_F("stat for entities: total={}, collected={}, retired={}", + collected_info.num_all_entities, + collected_info.old_entities.size(), + retired_stat.num_retired_entities); + + LOG_INFO_F("stat for metrics: total={}, collected={}, retired={}, scheduled={}, " + "recently_scheduled={}, reemployed={}", + collected_info.num_all_metrics, + num_collected_metrics, + retired_stat.num_retired_metrics, + collected_info.num_scheduled_metrics, + retired_stat.num_recently_scheduled_metrics, + retired_stat.num_reemployed_metrics); } metric_prototype::metric_prototype(const ctor_args &args) : _args(args) {} diff --git a/src/utils/metrics.h b/src/utils/metrics.h index 4d4067266f..6fb6330c89 100644 --- a/src/utils/metrics.h +++ b/src/utils/metrics.h @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -156,6 +155,24 @@ class metric_entity : public ref_counter using metric_map = std::unordered_map; using old_metric_list = std::unordered_set; + struct collected_old_metrics_info + { + old_metric_list old_metrics; + size_t num_all_metrics = 0; + size_t num_scheduled_metrics = 0; + + collected_old_metrics_info() = default; + }; + + struct retired_metrics_stat + { + size_t num_retired_metrics = 0; + size_t num_recently_scheduled_metrics = 0; + size_t num_reemployed_metrics = 0; + + retired_metrics_stat() = default; + }; + const metric_entity_prototype *prototype() const { return _prototype; } const std::string &id() const { return _id; } @@ -173,6 +190,7 @@ class metric_entity : public ref_counter private: friend class metric_registry; friend class ref_ptr; + friend class scoped_entity; metric_entity(const metric_entity_prototype *prototype, const std::string &id, @@ -203,8 +221,8 @@ class metric_entity : public ref_counter // The whole retirement process is divided two phases "collect" and "retire", please see // `collect_old_metrics()` and `retire_old_metrics()` of registry for details. - old_metric_list collect_old_metrics() const; - std::tuple retire_old_metrics(const old_metric_list &old_metrics); + collected_old_metrics_info collect_old_metrics() const; + retired_metrics_stat retire_old_metrics(const old_metric_list &old_metrics); const metric_entity_prototype *const _prototype; const std::string _id; @@ -419,6 +437,26 @@ class metric_registry : public utils::singleton using entity_map = std::unordered_map; using old_entity_map = std::unordered_map; + struct collected_old_entities_info + { + old_entity_map old_entities; + size_t num_all_entities = 0; + size_t num_all_metrics = 0; + size_t num_scheduled_metrics = 0; + + collected_old_entities_info() = default; + }; + + struct retired_entities_stat + { + size_t num_retired_entities = 0; + size_t num_retired_metrics = 0; + size_t num_recently_scheduled_metrics = 0; + size_t num_reemployed_metrics = 0; + + retired_entities_stat() = default; + }; + entity_map entities() const; void take_snapshot(metric_json_writer &writer, const metric_filters &filters) const; @@ -428,7 +466,7 @@ class metric_registry : public utils::singleton friend class utils::singleton; friend void test_get_metrics_handler(const http_request &req, http_response &resp); - friend void test_restart_metric_registry_timer(uint64_t interval_ms); + friend class MetricsRetirementTest; metric_registry(); ~metric_registry(); @@ -451,8 +489,8 @@ class metric_registry : public utils::singleton // // Once some metrics and entities are collected to be retired, in the second phase "retire", // they will be removed according to the specific rules. - old_entity_map collect_old_metrics() const; - std::tuple retire_old_metrics(const old_entity_map &old_entities); + collected_old_entities_info collect_old_metrics() const; + retired_entities_stat retire_old_metrics(const old_entity_map &old_entities); void process_old_metrics(); mutable utils::rw_lock_nr _lock; diff --git a/src/utils/test/metrics_test.cpp b/src/utils/test/metrics_test.cpp index bbc416c276..2be9e17532 100644 --- a/src/utils/test/metrics_test.cpp +++ b/src/utils/test/metrics_test.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -2830,13 +2831,6 @@ TEST(metrics_test, http_get_metrics) } } -void test_restart_metric_registry_timer(uint64_t interval_ms) -{ - metric_registry::instance().stop_timer(); - FLAGS_metrics_retirement_delay_ms = interval_ms; - metric_registry::instance().start_timer(); -} - class scoped_entity { public: @@ -2913,8 +2907,9 @@ scoped_entity::get_actual_surviving_metrics(const metric_entity_ptr &my_entity) { surviving_metric_map actual_surviving_metrics; - const auto &metrics = my_entity->metrics(); - for (const auto &m : metrics) { + utils::auto_read_lock l(my_entity->_lock); + + for (const auto &m : my_entity->_metrics) { actual_surviving_metrics.emplace(m.first, m.second.get()); std::cout << "add actual surviving metrics: " << m.first->name() << std::endl; } @@ -2953,36 +2948,71 @@ void scoped_entity::test_survival_after_retirement() const ASSERT_EQ(_expected_surviving_metrics, actual_surviving_metrics); } -TEST(metrics_test, retire_old_metrics) +using surviving_metrics_case = std::tuple; + +class MetricsRetirementTest : public testing::TestWithParam { - auto reserved_metrics_retirement_delay_ms = FLAGS_metrics_retirement_delay_ms; - test_restart_metric_registry_timer(100); +public: + // static void SetUpTestSuite() + static void SetUpTestCase() + { + _reserved_metrics_retirement_delay_ms = FLAGS_metrics_retirement_delay_ms; + restart_metric_registry_timer(100); + } - struct test_case + // static void TearDownTestSuite() + static void TearDownTestCase() { - std::string entity_id; - bool is_entity_surviving; - bool is_gauge_surviving; - bool is_counter_surviving; - bool is_percentile_surviving; - } tests[] = { - {"server_117", true, true, true, true}, - {"server_118", true, true, true, false}, - {"server_119", true, true, false, false}, - {"server_120", true, false, false, false}, - {"server_121", false, false, false, false}, - }; + restart_metric_registry_timer(_reserved_metrics_retirement_delay_ms); + } - for (const auto &test : tests) { - scoped_entity entity(test.entity_id, - test.is_entity_surviving, - test.is_gauge_surviving, - test.is_counter_surviving, - test.is_percentile_surviving); - entity.test_survival_after_retirement(); +private: + static void restart_metric_registry_timer(uint64_t interval_ms) + { + metric_registry::instance().stop_timer(); + FLAGS_metrics_retirement_delay_ms = interval_ms; + metric_registry::instance().start_timer(); + + std::cout << "restart the timer of metric registry at interval " << interval_ms << " ms." + << std::endl; } - test_restart_metric_registry_timer(reserved_metrics_retirement_delay_ms); + static uint64_t _reserved_metrics_retirement_delay_ms; +}; + +uint64_t MetricsRetirementTest::_reserved_metrics_retirement_delay_ms; + +TEST_P(MetricsRetirementTest, RetireOldMetrics) +{ + std::string entity_id; + bool is_entity_surviving; + bool is_gauge_surviving; + bool is_counter_surviving; + bool is_percentile_surviving; + std::tie(entity_id, + is_entity_surviving, + is_gauge_surviving, + is_counter_surviving, + is_percentile_surviving) = GetParam(); + + scoped_entity entity(entity_id, + is_entity_surviving, + is_gauge_surviving, + is_counter_surviving, + is_percentile_surviving); + entity.test_survival_after_retirement(); } +const std::vector metrics_retirement_tests = { + {"server_117", true, true, true, true}, + {"server_118", true, true, true, false}, + {"server_119", true, true, false, false}, + {"server_120", true, false, false, false}, + {"server_121", false, false, false, false}, +}; + +INSTANTIATE_TEST_CASE_P(MetricsTest, + MetricsRetirementTest, + testing::ValuesIn(metrics_retirement_tests)); + } // namespace dsn From 2aeee49b6a6e7af45b73a2a1384933cfefedcbc6 Mon Sep 17 00:00:00 2001 From: Dan Wang Date: Fri, 6 Jan 2023 01:23:04 +0800 Subject: [PATCH 11/17] feat(new_metrics): retire the old metrics and entities that are not in use --- src/utils/metrics.cpp | 15 ++++- src/utils/metrics.h | 27 ++++++++ src/utils/test/metrics_test.cpp | 113 ++++++++++++++++++++------------ 3 files changed, 113 insertions(+), 42 deletions(-) diff --git a/src/utils/metrics.cpp b/src/utils/metrics.cpp index 1d57ad9c5c..a736e06246 100644 --- a/src/utils/metrics.cpp +++ b/src/utils/metrics.cpp @@ -493,6 +493,10 @@ void metric_registry::start_timer() return; } + // Once a metric is considered useless, it will retire formally after the retention + // interval, namely FLAGS_metrics_retirement_delay_ms milliseconds. Therefore, if + // the interval of the timer is also set to FLAGS_metrics_retirement_delay_ms, in + // the next round, it's just about time to retire this metric. _timer.reset(new metric_timer(FLAGS_metrics_retirement_delay_ms, std::bind(&metric_registry::process_old_metrics, this), std::bind(&metric_registry::on_close, this))); @@ -504,8 +508,11 @@ void metric_registry::stop_timer() return; } + // Close the timer synchronously. _timer->close(); _timer->wait(); + + // Reset the timer to mark that it has been stopped, now it could be started. _timer.reset(); } @@ -639,6 +646,7 @@ void metric_registry::process_old_metrics() num_collected_metrics += old_entity.second.size(); } + // Try to process the collected metrics. const auto &retired_stat = retire_old_metrics(collected_info.old_entities); LOG_INFO_F("stat for entities: total={}, collected={}, retired={}", @@ -664,14 +672,19 @@ metric::metric(const metric_prototype *prototype) : _prototype(prototype), _reti bool metric::is_stale() const { + // Since this metric itself is still being accessed, its reference count should be 1 at least. CHECK_GE(get_count(), 1); + // Each metric (any type of metric) will be referenced by its entity. For a percentile, + // it will also be referenced by its timer. Thus the reference count for a percentile + // will be at least 2. However, once a percentile has only 2 references, it will be + // considered useless since it is not used by any other class. if (_prototype->type() == metric_type::kPercentile && get_count() == 2) { return true; } // There's only one reference for this metric that is kept in the entity. Thus - // this metric is not considered to be in use. + // this metric is considered useless. return get_count() == 1; } diff --git a/src/utils/metrics.h b/src/utils/metrics.h index 6fb6330c89..1fc2e2ce5f 100644 --- a/src/utils/metrics.h +++ b/src/utils/metrics.h @@ -158,7 +158,11 @@ class metric_entity : public ref_counter struct collected_old_metrics_info { old_metric_list old_metrics; + + // The number of all metrics in this entity. size_t num_all_metrics = 0; + + // The number of the metrics that have been scheduled to be retired for this entity. size_t num_scheduled_metrics = 0; collected_old_metrics_info() = default; @@ -166,8 +170,15 @@ class metric_entity : public ref_counter struct retired_metrics_stat { + // The number of retired metrics for this entity. size_t num_retired_metrics = 0; + + // The number of the metrics that were just considered useless and scheduled to + // be retired in this round for this entity. size_t num_recently_scheduled_metrics = 0; + + // The number of the metrics that had previously been scheduled to be retired and + // were recently reemployed for this entity. size_t num_reemployed_metrics = 0; retired_metrics_stat() = default; @@ -440,8 +451,14 @@ class metric_registry : public utils::singleton struct collected_old_entities_info { old_entity_map old_entities; + + // The number of all entities in the registry. size_t num_all_entities = 0; + + // The number of all metrics in the registry. size_t num_all_metrics = 0; + + // The number of the metrics that have been scheduled to be retired for the registry. size_t num_scheduled_metrics = 0; collected_old_entities_info() = default; @@ -449,9 +466,18 @@ class metric_registry : public utils::singleton struct retired_entities_stat { + // The number of retired entities for the registry. size_t num_retired_entities = 0; + + // The number of retired metrics for the registry. size_t num_retired_metrics = 0; + + // The number of the metrics that were just considered useless and scheduled to + // be retired in this round for the registry. size_t num_recently_scheduled_metrics = 0; + + // The number of the metrics that had previously been scheduled to be retired and + // were recently reemployed for the registry. size_t num_reemployed_metrics = 0; retired_entities_stat() = default; @@ -466,6 +492,7 @@ class metric_registry : public utils::singleton friend class utils::singleton; friend void test_get_metrics_handler(const http_request &req, http_response &resp); + friend class scoped_entity; friend class MetricsRetirementTest; metric_registry(); diff --git a/src/utils/test/metrics_test.cpp b/src/utils/test/metrics_test.cpp index 2be9e17532..e4d10e8a47 100644 --- a/src/utils/test/metrics_test.cpp +++ b/src/utils/test/metrics_test.cpp @@ -2831,9 +2831,51 @@ TEST(metrics_test, http_get_metrics) } } +using surviving_metrics_case = std::tuple; + +class MetricsRetirementTest : public testing::TestWithParam +{ +public: + // For higher version of googletest, use `static void SetUpTestSuite()` instead. + static void SetUpTestCase() + { + // Restart the timer of registry with shorter interval to reduce the test time. + _reserved_metrics_retirement_delay_ms = FLAGS_metrics_retirement_delay_ms; + restart_metric_registry_timer(kMetricsRetirementDelayMsForTest); + } + + // For higher version of googletest, use `static void TearDownTestSuite()` instead. + static void TearDownTestCase() + { + // Recover the timer of registry with the original interval. + restart_metric_registry_timer(_reserved_metrics_retirement_delay_ms); + } + + static const uint64_t kMetricsRetirementDelayMsForTest; + +private: + static void restart_metric_registry_timer(uint64_t interval_ms) + { + metric_registry::instance().stop_timer(); + FLAGS_metrics_retirement_delay_ms = interval_ms; + metric_registry::instance().start_timer(); + + std::cout << "restart the timer of metric registry at interval " << interval_ms << " ms." + << std::endl; + } + + static uint64_t _reserved_metrics_retirement_delay_ms; +}; + +const uint64_t MetricsRetirementTest::kMetricsRetirementDelayMsForTest = 100; +uint64_t MetricsRetirementTest::_reserved_metrics_retirement_delay_ms; + +// This class helps to test retirement of metrics and entities, by creating temporary +// variables or reference them as members of this class to control their lifetime. class scoped_entity { public: + // Use the raw pointer to hold metric without any reference which may affect the test results. using surviving_metric_map = std::unordered_map; scoped_entity(const std::string &entity_id, @@ -2842,6 +2884,8 @@ class scoped_entity bool is_counter_surviving, bool is_percentile_surviving); + // After a long enough time, check if temporary metrics and entities are retired while + // long-life ones still survive. void test_survival_after_retirement() const; private: @@ -2851,17 +2895,24 @@ class scoped_entity const MetricPrototype &prototype, MetricPtr &m) { + // Create a temporary variable for the metric. auto temp_m = prototype.instantiate(my_entity); _expected_all_metrics.emplace(&prototype, temp_m.get()); - if (is_surviving) { - m = temp_m; - _expected_surviving_metrics.emplace(&prototype, m.get()); - std::cout << "add expected surviving metrics: " << prototype.name() << std::endl; + if (!is_surviving) { + return; } + + // Extend the lifetime of the metric since it's marked as "surviving". + m = temp_m; + _expected_surviving_metrics.emplace(&prototype, m.get()); + std::cout << "add expected surviving metrics: " << prototype.name() << std::endl; } surviving_metric_map get_actual_surviving_metrics(const metric_entity_ptr &my_entity) const; + + // Check if all metrics and entities still survive no matter whether they are temporary or + // long-life. void test_survival_immediately_after_initialization() const; std::string _my_entity_id; @@ -2883,9 +2934,11 @@ scoped_entity::scoped_entity(const std::string &entity_id, bool is_percentile_surviving) : _my_entity_id(entity_id) { + // Create a temporary variabl for the entity. auto my_entity = METRIC_ENTITY_my_server.instantiate(entity_id); _expected_my_entity_raw_ptr = my_entity.get(); + // Create temporary or long-life variables for metrics, depending on what is_*_surviving is. instantiate_metric( my_entity, is_gauge_surviving, METRIC_test_server_gauge_int64, _my_gauge_int64); instantiate_metric(my_entity, is_counter_surviving, METRIC_test_server_counter, _my_counter); @@ -2896,6 +2949,7 @@ scoped_entity::scoped_entity(const std::string &entity_id, std::cout << std::endl; if (is_entity_surviving) { + // Extend the lifetime of the entity since it's marked as "surviving". _my_entity = my_entity; } @@ -2909,6 +2963,8 @@ scoped_entity::get_actual_surviving_metrics(const metric_entity_ptr &my_entity) utils::auto_read_lock l(my_entity->_lock); + // Use internal member directly instead of calling metrics(). We don't want to have + // any reference which may affect the test results. for (const auto &m : my_entity->_metrics) { actual_surviving_metrics.emplace(m.first, m.second.get()); std::cout << "add actual surviving metrics: " << m.first->name() << std::endl; @@ -2920,7 +2976,11 @@ scoped_entity::get_actual_surviving_metrics(const metric_entity_ptr &my_entity) void scoped_entity::test_survival_immediately_after_initialization() const { - const auto &entities = metric_registry::instance().entities(); + utils::auto_read_lock l(metric_registry::instance()._lock); + + // Use internal member directly instead of calling entities(). We don't want to have + // any reference which may affect the test results. + const auto &entities = metric_registry::instance()._entities; const auto &iter = entities.find(_my_entity_id); ASSERT_NE(entities.end(), iter); ASSERT_EQ(_expected_my_entity_raw_ptr, iter->second.get()); @@ -2931,9 +2991,14 @@ void scoped_entity::test_survival_immediately_after_initialization() const void scoped_entity::test_survival_after_retirement() const { - std::this_thread::sleep_for(std::chrono::milliseconds(500)); + std::this_thread::sleep_for( + std::chrono::milliseconds(MetricsRetirementTest::kMetricsRetirementDelayMsForTest * 2)); - const auto &entities = metric_registry::instance().entities(); + utils::auto_read_lock l(metric_registry::instance()._lock); + + // Use internal member directly instead of calling entities(). We don't want to have + // any reference which may affect the test results. + const auto &entities = metric_registry::instance()._entities; const auto &iter = entities.find(_my_entity_id); if (_my_entity == nullptr) { ASSERT_EQ(entities.end(), iter); @@ -2948,40 +3013,6 @@ void scoped_entity::test_survival_after_retirement() const ASSERT_EQ(_expected_surviving_metrics, actual_surviving_metrics); } -using surviving_metrics_case = std::tuple; - -class MetricsRetirementTest : public testing::TestWithParam -{ -public: - // static void SetUpTestSuite() - static void SetUpTestCase() - { - _reserved_metrics_retirement_delay_ms = FLAGS_metrics_retirement_delay_ms; - restart_metric_registry_timer(100); - } - - // static void TearDownTestSuite() - static void TearDownTestCase() - { - restart_metric_registry_timer(_reserved_metrics_retirement_delay_ms); - } - -private: - static void restart_metric_registry_timer(uint64_t interval_ms) - { - metric_registry::instance().stop_timer(); - FLAGS_metrics_retirement_delay_ms = interval_ms; - metric_registry::instance().start_timer(); - - std::cout << "restart the timer of metric registry at interval " << interval_ms << " ms." - << std::endl; - } - - static uint64_t _reserved_metrics_retirement_delay_ms; -}; - -uint64_t MetricsRetirementTest::_reserved_metrics_retirement_delay_ms; - TEST_P(MetricsRetirementTest, RetireOldMetrics) { std::string entity_id; From 99d5e68e29cbdfdcb54dacf94c7eada25709c446 Mon Sep 17 00:00:00 2001 From: Dan Wang Date: Fri, 6 Jan 2023 11:59:35 +0800 Subject: [PATCH 12/17] feat(new_metrics): retire the old metrics and entities that are not in use --- src/utils/metrics.cpp | 32 ++++++++------------------------ src/utils/test/metrics_test.cpp | 4 ---- 2 files changed, 8 insertions(+), 28 deletions(-) diff --git a/src/utils/metrics.cpp b/src/utils/metrics.cpp index a736e06246..43ce1ffe70 100644 --- a/src/utils/metrics.cpp +++ b/src/utils/metrics.cpp @@ -205,11 +205,12 @@ void metric_entity::take_snapshot(metric_json_writer &writer, const metric_filte bool metric_entity::is_stale() const { - // Since this entity itself is still being accessed, its reference count should be 1 at least. + // Since this entity itself is still being accessed, its reference count should be 1 + // at least. CHECK_GE(get_count(), 1); - // Once this entity did not have any metric, and had the only one reference that is - // kept in the registry, this entity would be considered useless. + // Once this entity did not have any metric, and had only one reference kept in the + // registry, this entity would be considered useless. return _metrics.empty() && get_count() == 1; } @@ -219,7 +220,6 @@ metric_entity::collected_old_metrics_info metric_entity::collect_old_metrics() c auto now = dsn_now_ms(); - LOG_INFO_F("in metric_entity::collect_old_metrics for entity {}", _id); utils::auto_read_lock l(_lock); for (const auto &m : _metrics) { @@ -229,7 +229,6 @@ metric_entity::collected_old_metrics_info metric_entity::collect_old_metrics() c // since this metric is reemployed now, its scheduled time for retirement // should be reset to 0. collected_info.old_metrics.insert(m.first); - LOG_INFO_F("change old metric {} to new", m.second->prototype()->name().data()); } continue; } @@ -242,7 +241,6 @@ metric_entity::collected_old_metrics_info metric_entity::collect_old_metrics() c } collected_info.old_metrics.insert(m.first); - LOG_INFO_F("add metric {} to retire", m.second->prototype()->name().data()); } collected_info.num_all_metrics = _metrics.size(); @@ -261,22 +259,16 @@ metric_entity::retire_old_metrics(const old_metric_list &old_metrics) auto now = dsn_now_ms(); - LOG_INFO_F("in metric_entity::retire_old_metrics for entity {}", _id); utils::auto_write_lock l(_lock); for (const auto &m : old_metrics) { auto iter = _metrics.find(m); if (dsn_unlikely(iter == _metrics.end())) { // This metric has been removed from the entity for some reason. - LOG_INFO_F("metric {} not found", m->name().data()); continue; } if (!iter->second->is_stale()) { - LOG_INFO_F("not stale for metric {} while count={}, retire_time_ms={}", - iter->second->prototype()->name().data(), - iter->second->get_count(), - iter->second->_retire_time_ms); if (iter->second->_retire_time_ms > 0) { // For those metrics which are still in use, their scheduled time for // retirement should be reset to 0 though previously they could have @@ -287,11 +279,6 @@ metric_entity::retire_old_metrics(const old_metric_list &old_metrics) continue; } - LOG_INFO_F("is stale for metric {} while count={}, retire_time_ms={}", - iter->second->prototype()->name().data(), - iter->second->get_count(), - iter->second->_retire_time_ms); - if (dsn_unlikely(iter->second->_retire_time_ms > now)) { // Since in collect_old_metrics() we've filtered the metrics which have been // outside the retention interval, this is unlikely to happen. However, we @@ -314,7 +301,6 @@ metric_entity::retire_old_metrics(const old_metric_list &old_metrics) // Then retire it from the entity. _metrics.erase(iter); ++retired_stat.num_retired_metrics; - LOG_INFO_F("retire metric {}", iter->second->prototype()->name().data()); } return retired_stat; @@ -575,7 +561,6 @@ metric_registry::collected_old_entities_info metric_registry::collect_old_metric if (!metrics_info.old_metrics.empty()) { // Those entities which have metrics that should be retired will be collected. entities_info.old_entities.emplace(entity.first, std::move(metrics_info.old_metrics)); - LOG_INFO_F("collect entity {} for metrics", entity.first); continue; } @@ -586,7 +571,6 @@ metric_registry::collected_old_entities_info metric_registry::collect_old_metric // Since this entity itself should be retired, it will be collected without any // metric. Actually it has already not had any metric itself. (void)entities_info.old_entities[entity.first]; - LOG_INFO_F("collect entity {} for stale", entity.first); } entities_info.num_all_entities = _entities.size(); @@ -630,7 +614,6 @@ metric_registry::retire_old_metrics(const old_entity_map &old_entities) // been delayed for a long enough time before retirement. _entities.erase(iter); ++entities_stat.num_retired_entities; - LOG_INFO_F("retire entity {}", iter->second->id()); } return entities_stat; @@ -672,7 +655,8 @@ metric::metric(const metric_prototype *prototype) : _prototype(prototype), _reti bool metric::is_stale() const { - // Since this metric itself is still being accessed, its reference count should be 1 at least. + // Since this metric itself is still being accessed, its reference count should be 1 + // at least. CHECK_GE(get_count(), 1); // Each metric (any type of metric) will be referenced by its entity. For a percentile, @@ -683,8 +667,8 @@ bool metric::is_stale() const return true; } - // There's only one reference for this metric that is kept in the entity. Thus - // this metric is considered useless. + // There's only one reference for this metric that is kept in the entity. Thus this + // metric is considered useless. return get_count() == 1; } diff --git a/src/utils/test/metrics_test.cpp b/src/utils/test/metrics_test.cpp index e4d10e8a47..963b115d18 100644 --- a/src/utils/test/metrics_test.cpp +++ b/src/utils/test/metrics_test.cpp @@ -2906,7 +2906,6 @@ class scoped_entity // Extend the lifetime of the metric since it's marked as "surviving". m = temp_m; _expected_surviving_metrics.emplace(&prototype, m.get()); - std::cout << "add expected surviving metrics: " << prototype.name() << std::endl; } surviving_metric_map get_actual_surviving_metrics(const metric_entity_ptr &my_entity) const; @@ -2946,7 +2945,6 @@ scoped_entity::scoped_entity(const std::string &entity_id, is_percentile_surviving, METRIC_test_server_percentile_int64, _my_percentile_int64); - std::cout << std::endl; if (is_entity_surviving) { // Extend the lifetime of the entity since it's marked as "surviving". @@ -2967,9 +2965,7 @@ scoped_entity::get_actual_surviving_metrics(const metric_entity_ptr &my_entity) // any reference which may affect the test results. for (const auto &m : my_entity->_metrics) { actual_surviving_metrics.emplace(m.first, m.second.get()); - std::cout << "add actual surviving metrics: " << m.first->name() << std::endl; } - std::cout << std::endl; return actual_surviving_metrics; } From 5da991bd1caadc7eacb9a5288c166d8c30c44927 Mon Sep 17 00:00:00 2001 From: Dan Wang Date: Thu, 12 Jan 2023 00:57:35 +0800 Subject: [PATCH 13/17] feat(new_metrics): retire the old metrics and entities that are not in use --- src/utils/metrics.cpp | 65 ++++++++++++++++++++++--------------------- src/utils/metrics.h | 63 +++++++++++++++++++++++++++++++---------- 2 files changed, 82 insertions(+), 46 deletions(-) diff --git a/src/utils/metrics.cpp b/src/utils/metrics.cpp index 43ce1ffe70..426ec6ac53 100644 --- a/src/utils/metrics.cpp +++ b/src/utils/metrics.cpp @@ -214,9 +214,9 @@ bool metric_entity::is_stale() const return _metrics.empty() && get_count() == 1; } -metric_entity::collected_old_metrics_info metric_entity::collect_old_metrics() const +metric_entity::collected_stale_metrics_info metric_entity::collect_stale_metrics() const { - collected_old_metrics_info collected_info; + collected_stale_metrics_info collected_info; auto now = dsn_now_ms(); @@ -228,7 +228,7 @@ metric_entity::collected_old_metrics_info metric_entity::collect_old_metrics() c // Previously this metric must have been scheduled to be retired. However, // since this metric is reemployed now, its scheduled time for retirement // should be reset to 0. - collected_info.old_metrics.insert(m.first); + collected_info.stale_metrics.insert(m.first); } continue; } @@ -240,7 +240,7 @@ metric_entity::collected_old_metrics_info metric_entity::collect_old_metrics() c continue; } - collected_info.old_metrics.insert(m.first); + collected_info.stale_metrics.insert(m.first); } collected_info.num_all_metrics = _metrics.size(); @@ -248,9 +248,9 @@ metric_entity::collected_old_metrics_info metric_entity::collect_old_metrics() c } metric_entity::retired_metrics_stat -metric_entity::retire_old_metrics(const old_metric_list &old_metrics) +metric_entity::retire_stale_metrics(const stale_metric_list &stale_metrics) { - if (old_metrics.empty()) { + if (stale_metrics.empty()) { // Do not lock for empty list. return retired_metrics_stat(); } @@ -261,7 +261,7 @@ metric_entity::retire_old_metrics(const old_metric_list &old_metrics) utils::auto_write_lock l(_lock); - for (const auto &m : old_metrics) { + for (const auto &m : stale_metrics) { auto iter = _metrics.find(m); if (dsn_unlikely(iter == _metrics.end())) { // This metric has been removed from the entity for some reason. @@ -280,7 +280,7 @@ metric_entity::retire_old_metrics(const old_metric_list &old_metrics) } if (dsn_unlikely(iter->second->_retire_time_ms > now)) { - // Since in collect_old_metrics() we've filtered the metrics which have been + // Since in collect_stale_metrics() we've filtered the metrics which have been // outside the retention interval, this is unlikely to happen. However, we // still check here for some special cases. continue; @@ -484,7 +484,7 @@ void metric_registry::start_timer() // the interval of the timer is also set to FLAGS_metrics_retirement_delay_ms, in // the next round, it's just about time to retire this metric. _timer.reset(new metric_timer(FLAGS_metrics_retirement_delay_ms, - std::bind(&metric_registry::process_old_metrics, this), + std::bind(&metric_registry::process_stale_metrics, this), std::bind(&metric_registry::on_close, this))); } @@ -547,20 +547,21 @@ metric_entity_ptr metric_registry::find_or_create_entity(const metric_entity_pro return entity; } -metric_registry::collected_old_entities_info metric_registry::collect_old_metrics() const +metric_registry::collected_stale_entities_info metric_registry::collect_stale_metrics() const { - collected_old_entities_info entities_info; + collected_stale_entities_info entities_info; utils::auto_read_lock l(_lock); for (const auto &entity : _entities) { - const auto &metrics_info = entity.second->collect_old_metrics(); + const auto &metrics_info = entity.second->collect_stale_metrics(); entities_info.num_all_metrics += metrics_info.num_all_metrics; entities_info.num_scheduled_metrics += metrics_info.num_scheduled_metrics; - if (!metrics_info.old_metrics.empty()) { + if (!metrics_info.stale_metrics.empty()) { // Those entities which have metrics that should be retired will be collected. - entities_info.old_entities.emplace(entity.first, std::move(metrics_info.old_metrics)); + entities_info.stale_entities.emplace(entity.first, + std::move(metrics_info.stale_metrics)); continue; } @@ -570,7 +571,7 @@ metric_registry::collected_old_entities_info metric_registry::collect_old_metric // Since this entity itself should be retired, it will be collected without any // metric. Actually it has already not had any metric itself. - (void)entities_info.old_entities[entity.first]; + (void)entities_info.stale_entities[entity.first]; } entities_info.num_all_entities = _entities.size(); @@ -578,9 +579,9 @@ metric_registry::collected_old_entities_info metric_registry::collect_old_metric } metric_registry::retired_entities_stat -metric_registry::retire_old_metrics(const old_entity_map &old_entities) +metric_registry::retire_stale_metrics(const stale_entity_map &stale_entities) { - if (old_entities.empty()) { + if (stale_entities.empty()) { // Do not lock for empty list. return retired_entities_stat(); } @@ -589,8 +590,8 @@ metric_registry::retire_old_metrics(const old_entity_map &old_entities) utils::auto_write_lock l(_lock); - for (const auto &old_entity : old_entities) { - auto iter = _entities.find(old_entity.first); + for (const auto &stale_entity : stale_entities) { + auto iter = _entities.find(stale_entity.first); if (dsn_unlikely(iter == _entities.end())) { // This entity has been removed from the registry for some reason. continue; @@ -598,8 +599,8 @@ metric_registry::retire_old_metrics(const old_entity_map &old_entities) // Try to retire metrics from this entity. Note that since the entities that should // be retired wouldn't have any metric, they will never get locked in - // metric_entity::retire_old_metrics(). - const auto &metrics_stat = iter->second->retire_old_metrics(old_entity.second); + // metric_entity::retire_stale_metrics(). + const auto &metrics_stat = iter->second->retire_stale_metrics(stale_entity.second); entities_stat.num_retired_metrics += metrics_stat.num_retired_metrics; entities_stat.num_recently_scheduled_metrics += metrics_stat.num_recently_scheduled_metrics; @@ -619,23 +620,18 @@ metric_registry::retire_old_metrics(const old_entity_map &old_entities) return entities_stat; } -void metric_registry::process_old_metrics() +void metric_registry::process_stale_metrics() { - LOG_INFO("begin to process old metrics"); + LOG_INFO_F("begin to process stale metrics"); size_t num_collected_metrics = 0; - const auto &collected_info = collect_old_metrics(); - for (const auto &old_entity : collected_info.old_entities) { - num_collected_metrics += old_entity.second.size(); + const auto &collected_info = collect_stale_metrics(); + for (const auto &stale_entity : collected_info.stale_entities) { + num_collected_metrics += stale_entity.second.size(); } // Try to process the collected metrics. - const auto &retired_stat = retire_old_metrics(collected_info.old_entities); - - LOG_INFO_F("stat for entities: total={}, collected={}, retired={}", - collected_info.num_all_entities, - collected_info.old_entities.size(), - retired_stat.num_retired_entities); + const auto &retired_stat = retire_stale_metrics(collected_info.stale_entities); LOG_INFO_F("stat for metrics: total={}, collected={}, retired={}, scheduled={}, " "recently_scheduled={}, reemployed={}", @@ -645,6 +641,11 @@ void metric_registry::process_old_metrics() collected_info.num_scheduled_metrics, retired_stat.num_recently_scheduled_metrics, retired_stat.num_reemployed_metrics); + + LOG_INFO_F("stat for entities: total={}, collected={}, retired={}", + collected_info.num_all_entities, + collected_info.stale_entities.size(), + retired_stat.num_retired_entities); } metric_prototype::metric_prototype(const ctor_args &args) : _args(args) {} diff --git a/src/utils/metrics.h b/src/utils/metrics.h index 1fc2e2ce5f..6d89760a7e 100644 --- a/src/utils/metrics.h +++ b/src/utils/metrics.h @@ -153,11 +153,16 @@ class metric_entity : public ref_counter public: using attr_map = std::unordered_map; using metric_map = std::unordered_map; - using old_metric_list = std::unordered_set; + using stale_metric_list = std::unordered_set; - struct collected_old_metrics_info + struct collected_stale_metrics_info { - old_metric_list old_metrics; + // The collected metrics that are considered stale now, or were previously considered + // stale. Both metrics will be processed by retire_stale_metrics(): if a metric is + // considered stale now, it will be either retired immediately or scheduled to be + // retired; otherwise, if it was previously considered stale which means it has become + // useful now, its retirement will be cancelled. + stale_metric_list stale_metrics; // The number of all metrics in this entity. size_t num_all_metrics = 0; @@ -165,7 +170,7 @@ class metric_entity : public ref_counter // The number of the metrics that have been scheduled to be retired for this entity. size_t num_scheduled_metrics = 0; - collected_old_metrics_info() = default; + collected_stale_metrics_info() = default; }; struct retired_metrics_stat @@ -228,12 +233,23 @@ class metric_entity : public ref_counter void encode_id(metric_json_writer &writer) const; + // Similar with what has been done by metric::is_stale(), this function is used to decide + // if an entity is stale. Like the definition for a stale metric, an entity becomes stale + // if it does not has any metric, and is no longer used by any other class. + // + // For example, once a replica is removed, its metrics will be retired after a configurable + // retention interval. After that, the entity for this replica will not has any metric itself, + // and it is no longer used by any other class. Then it will be retired. + // + // Unlike metric, once an entity is considered stale, it will be retired immediately for the + // reason that each metric in it has been delayed for a long enough time before retirement. bool is_stale() const; // The whole retirement process is divided two phases "collect" and "retire", please see - // `collect_old_metrics()` and `retire_old_metrics()` of registry for details. - collected_old_metrics_info collect_old_metrics() const; - retired_metrics_stat retire_old_metrics(const old_metric_list &old_metrics); + // metric_registry::collect_stale_metrics() and metric_registry::retire_stale_metrics() + // for details. + collected_stale_metrics_info collect_stale_metrics() const; + retired_metrics_stat retire_stale_metrics(const stale_metric_list &stale_metrics); const metric_entity_prototype *const _prototype; const std::string _id; @@ -446,11 +462,14 @@ class metric_registry : public utils::singleton { public: using entity_map = std::unordered_map; - using old_entity_map = std::unordered_map; + using stale_entity_map = std::unordered_map; - struct collected_old_entities_info + struct collected_stale_entities_info { - old_entity_map old_entities; + // Like collected_stale_metrics_info::stale_metrics, stale_entities collect the metrics + // that are considered stale now, or were previously considered stale. In addition to + // these, stale_entities also collect the entities that are ready to be retired. + stale_entity_map stale_entities; // The number of all entities in the registry. size_t num_all_entities = 0; @@ -461,7 +480,7 @@ class metric_registry : public utils::singleton // The number of the metrics that have been scheduled to be retired for the registry. size_t num_scheduled_metrics = 0; - collected_old_entities_info() = default; + collected_stale_entities_info() = default; }; struct retired_entities_stat @@ -507,6 +526,11 @@ class metric_registry : public utils::singleton const std::string &id, const metric_entity::attr_map &attrs); + // These functions are used to retire stale metrics and entities. + // + // A metric is retired if it is removed from its entity. An entity is retired if it is + // removed from the registry. + // // Since retirements do not happen frequently, while we traverse over the registry there // tend to be no metric and entity that should be retired. Therefore, we divide the whole // retirement process into two phases: "collect" and "retire". @@ -516,9 +540,9 @@ class metric_registry : public utils::singleton // // Once some metrics and entities are collected to be retired, in the second phase "retire", // they will be removed according to the specific rules. - collected_old_entities_info collect_old_metrics() const; - retired_entities_stat retire_old_metrics(const old_entity_map &old_entities); - void process_old_metrics(); + collected_stale_entities_info collect_stale_metrics() const; + retired_entities_stat retire_stale_metrics(const stale_entity_map &stale_entities); + void process_stale_metrics(); mutable utils::rw_lock_nr _lock; entity_map _entities; @@ -741,8 +765,19 @@ class metric : public ref_counter private: friend class metric_entity; + // This function is used to decide if a metric is stale. A metric becomes stale if it is no + // longer used by any other class. Typically a metric is referenced by a class. For example, + // a metric is created to measure latency for reading or writing a replica. + // + // However, once the replica is removed, this metric will no longer be used; then the framework + // of metrics will find this metric and mark it stale. After a configurable retention interval, + // this metric will finally be removed from the repository of metrics. bool is_stale() const; + // The timestamp when this metric should be retired. It is 0 if this metric has not been + // scheduled to be retired; otherwise, it is non-zero if this metric is scheduled to be + // retired. Once `_retire_time_ms` is non-zero, this metric will be retired at any time if + // current time has reached or exceeded `_retire_time_ms`. uint64_t _retire_time_ms; DISALLOW_COPY_AND_ASSIGN(metric); From 3b02d7895da0a5a3cf708be58c39fdc61a4ae35c Mon Sep 17 00:00:00 2001 From: Dan Wang Date: Thu, 2 Feb 2023 01:35:56 +0800 Subject: [PATCH 14/17] feat(new_metrics): retire stale metric entities that are not used by any other object --- src/utils/metrics.cpp | 247 ++++++++++++------------------------------ src/utils/metrics.h | 146 +++++++++---------------- 2 files changed, 119 insertions(+), 274 deletions(-) diff --git a/src/utils/metrics.cpp b/src/utils/metrics.cpp index 426ec6ac53..500e8ad0da 100644 --- a/src/utils/metrics.cpp +++ b/src/utils/metrics.cpp @@ -27,7 +27,7 @@ namespace dsn { -DSN_DEFINE_uint64("metrics", +DSN_DEFINE_uint64(metrics, metrics_retirement_delay_ms, 10 * 60 * 1000, "The minimum number of milliseconds a metric will be kept for after it is " @@ -36,7 +36,7 @@ DSN_DEFINE_uint64("metrics", metric_entity::metric_entity(const metric_entity_prototype *prototype, const std::string &id, const attr_map &attrs) - : _prototype(prototype), _id(id), _attrs(attrs) + : _prototype(prototype), _id(id), _attrs(attrs), _retire_time_ms(0) { } @@ -51,9 +51,9 @@ metric_entity::~metric_entity() namespace { -// Before destructing or retiring a metric, close it first. For example, stop the timer of -// percentile. Generally close operations in this function are asynchronous, wait_metric() -// can be used to wait until all close operations are finished. +// Before destructing a metric, close it first. For example, stop the timer of percentile. +// Generally close operations in this function are asynchronous, wait_metric() can be used +// to wait until all close operations are finished. void close_metric(const dsn::metric_ptr &m) { if (m->prototype()->type() == dsn::metric_type::kPercentile) { @@ -209,101 +209,9 @@ bool metric_entity::is_stale() const // at least. CHECK_GE(get_count(), 1); - // Once this entity did not have any metric, and had only one reference kept in the - // registry, this entity would be considered useless. - return _metrics.empty() && get_count() == 1; -} - -metric_entity::collected_stale_metrics_info metric_entity::collect_stale_metrics() const -{ - collected_stale_metrics_info collected_info; - - auto now = dsn_now_ms(); - - utils::auto_read_lock l(_lock); - - for (const auto &m : _metrics) { - if (!m.second->is_stale()) { - if (m.second->_retire_time_ms > 0) { - // Previously this metric must have been scheduled to be retired. However, - // since this metric is reemployed now, its scheduled time for retirement - // should be reset to 0. - collected_info.stale_metrics.insert(m.first); - } - continue; - } - - if (m.second->_retire_time_ms > now) { - // This metric has been scheduled to be retired, however it is still within - // the retention interval. Thus do not collect it. - ++collected_info.num_scheduled_metrics; - continue; - } - - collected_info.stale_metrics.insert(m.first); - } - - collected_info.num_all_metrics = _metrics.size(); - return collected_info; -} - -metric_entity::retired_metrics_stat -metric_entity::retire_stale_metrics(const stale_metric_list &stale_metrics) -{ - if (stale_metrics.empty()) { - // Do not lock for empty list. - return retired_metrics_stat(); - } - - retired_metrics_stat retired_stat; - - auto now = dsn_now_ms(); - - utils::auto_write_lock l(_lock); - - for (const auto &m : stale_metrics) { - auto iter = _metrics.find(m); - if (dsn_unlikely(iter == _metrics.end())) { - // This metric has been removed from the entity for some reason. - continue; - } - - if (!iter->second->is_stale()) { - if (iter->second->_retire_time_ms > 0) { - // For those metrics which are still in use, their scheduled time for - // retirement should be reset to 0 though previously they could have - // been scheduled to be retired. - iter->second->_retire_time_ms = 0; - ++retired_stat.num_reemployed_metrics; - } - continue; - } - - if (dsn_unlikely(iter->second->_retire_time_ms > now)) { - // Since in collect_stale_metrics() we've filtered the metrics which have been - // outside the retention interval, this is unlikely to happen. However, we - // still check here for some special cases. - continue; - } - - if (iter->second->_retire_time_ms == 0) { - // This metric has already not been in use, thus should be marked with a - // scheduled time for retirement. - iter->second->_retire_time_ms = now + FLAGS_metrics_retirement_delay_ms; - ++retired_stat.num_recently_scheduled_metrics; - continue; - } - - // Once this metric is outside the retention interval, close it synchronously first. - close_metric(iter->second); - wait_metric(iter->second); - - // Then retire it from the entity. - _metrics.erase(iter); - ++retired_stat.num_retired_metrics; - } - - return retired_stat; + // This entity is considered stale once there is only one reference for it kept in the + // registry. + return get_count() == 1; } void metric_filters::extract_entity_metrics(const metric_entity::metric_map &candidates, @@ -484,7 +392,7 @@ void metric_registry::start_timer() // the interval of the timer is also set to FLAGS_metrics_retirement_delay_ms, in // the next round, it's just about time to retire this metric. _timer.reset(new metric_timer(FLAGS_metrics_retirement_delay_ms, - std::bind(&metric_registry::process_stale_metrics, this), + std::bind(&metric_registry::process_stale_entities, this), std::bind(&metric_registry::on_close, this))); } @@ -547,131 +455,116 @@ metric_entity_ptr metric_registry::find_or_create_entity(const metric_entity_pro return entity; } -metric_registry::collected_stale_entities_info metric_registry::collect_stale_metrics() const +metric_registry::collected_stale_entities_info metric_registry::collect_stale_entities() const { - collected_stale_entities_info entities_info; + collected_stale_entities_info collected_info; + + auto now = dsn_now_ms(); utils::auto_read_lock l(_lock); for (const auto &entity : _entities) { - const auto &metrics_info = entity.second->collect_stale_metrics(); - - entities_info.num_all_metrics += metrics_info.num_all_metrics; - entities_info.num_scheduled_metrics += metrics_info.num_scheduled_metrics; - if (!metrics_info.stale_metrics.empty()) { - // Those entities which have metrics that should be retired will be collected. - entities_info.stale_entities.emplace(entity.first, - std::move(metrics_info.stale_metrics)); + if (!entity.second->is_stale()) { + if (entity.second->_retire_time_ms > 0) { + // This entity had been scheduled to be retired. However, it was reemployed + // after that. It has been in use since then, therefore its scheduled time + // for retirement should be reset to 0. + collected_info.stale_entities.insert(entity.first); + } continue; } - if (!entity.second->is_stale()) { + if (entity.second->_retire_time_ms > now) { + // This entity has been scheduled to be retired, however it is still within + // the retention interval. Thus do not collect it. + ++collected_info.num_scheduled_entities; continue; } - // Since this entity itself should be retired, it will be collected without any - // metric. Actually it has already not had any metric itself. - (void)entities_info.stale_entities[entity.first]; + collected_info.stale_entities.insert(entity.first); } - entities_info.num_all_entities = _entities.size(); - return entities_info; + collected_info.num_all_entities = _entities.size(); + return collected_info; } metric_registry::retired_entities_stat -metric_registry::retire_stale_metrics(const stale_entity_map &stale_entities) +metric_registry::retire_stale_entities(const stale_entity_list &stale_entities) { if (stale_entities.empty()) { // Do not lock for empty list. return retired_entities_stat(); } - retired_entities_stat entities_stat; + retired_entities_stat retired_stat; + + auto now = dsn_now_ms(); utils::auto_write_lock l(_lock); for (const auto &stale_entity : stale_entities) { - auto iter = _entities.find(stale_entity.first); + auto iter = _entities.find(stale_entity); if (dsn_unlikely(iter == _entities.end())) { - // This entity has been removed from the registry for some reason. + // The entity has been removed from the registry for some unusual reason. continue; } - // Try to retire metrics from this entity. Note that since the entities that should - // be retired wouldn't have any metric, they will never get locked in - // metric_entity::retire_stale_metrics(). - const auto &metrics_stat = iter->second->retire_stale_metrics(stale_entity.second); + if (!iter->second->is_stale()) { + if (iter->second->_retire_time_ms > 0) { + // For those entities which are reemployed, their scheduled time for retirement + // should be reset to 0 though previously they could have been scheduled to be + // retired. + iter->second->_retire_time_ms = 0; + ++retired_stat.num_reemployed_entities; + } + continue; + } - entities_stat.num_retired_metrics += metrics_stat.num_retired_metrics; - entities_stat.num_recently_scheduled_metrics += metrics_stat.num_recently_scheduled_metrics; - entities_stat.num_reemployed_metrics += metrics_stat.num_reemployed_metrics; + if (dsn_unlikely(iter->second->_retire_time_ms > now)) { + // Since in collect_stale_entities() we've filtered the metrics which have been + // outside the retention interval, this is unlikely to happen. However, we still + // check here. + continue; + } - if (!iter->second->is_stale()) { + if (iter->second->_retire_time_ms == 0) { + // The entity should be marked with a scheduled time for retirement, since it has + // already been considered stale. + iter->second->_retire_time_ms = now + FLAGS_metrics_retirement_delay_ms; + ++retired_stat.num_recently_scheduled_entities; continue; } - // Once the entity itself should be retired, it will be retired immediately without - // waiting for any retention interval, for the reason that each metric in it has - // been delayed for a long enough time before retirement. + // Once the entity is outside the retention interval, retire it from the registry. _entities.erase(iter); - ++entities_stat.num_retired_entities; + ++retired_stat.num_retired_entities; } - return entities_stat; + return retired_stat; } -void metric_registry::process_stale_metrics() +void metric_registry::process_stale_entities() { - LOG_INFO_F("begin to process stale metrics"); - - size_t num_collected_metrics = 0; - const auto &collected_info = collect_stale_metrics(); - for (const auto &stale_entity : collected_info.stale_entities) { - num_collected_metrics += stale_entity.second.size(); - } - - // Try to process the collected metrics. - const auto &retired_stat = retire_stale_metrics(collected_info.stale_entities); + LOG_INFO("begin to process stale metric entities"); - LOG_INFO_F("stat for metrics: total={}, collected={}, retired={}, scheduled={}, " - "recently_scheduled={}, reemployed={}", - collected_info.num_all_metrics, - num_collected_metrics, - retired_stat.num_retired_metrics, - collected_info.num_scheduled_metrics, - retired_stat.num_recently_scheduled_metrics, - retired_stat.num_reemployed_metrics); + const auto &collected_info = collect_stale_entities(); + const auto &retired_stat = retire_stale_entities(collected_info.stale_entities); - LOG_INFO_F("stat for entities: total={}, collected={}, retired={}", - collected_info.num_all_entities, - collected_info.stale_entities.size(), - retired_stat.num_retired_entities); + LOG_INFO("stat for metric entities: total={}, collected={}, retired={}, scheduled={}, " + "recently_scheduled={}, reemployed={}", + collected_info.num_all_entities, + collected_info.stale_entities.size(), + retired_stat.num_retired_entities, + collected_info.num_scheduled_entities, + retired_stat.num_recently_scheduled_entities, + retired_stat.num_reemployed_entities); } metric_prototype::metric_prototype(const ctor_args &args) : _args(args) {} metric_prototype::~metric_prototype() {} -metric::metric(const metric_prototype *prototype) : _prototype(prototype), _retire_time_ms(0) {} - -bool metric::is_stale() const -{ - // Since this metric itself is still being accessed, its reference count should be 1 - // at least. - CHECK_GE(get_count(), 1); - - // Each metric (any type of metric) will be referenced by its entity. For a percentile, - // it will also be referenced by its timer. Thus the reference count for a percentile - // will be at least 2. However, once a percentile has only 2 references, it will be - // considered useless since it is not used by any other class. - if (_prototype->type() == metric_type::kPercentile && get_count() == 2) { - return true; - } - - // There's only one reference for this metric that is kept in the entity. Thus this - // metric is considered useless. - return get_count() == 1; -} +metric::metric(const metric_prototype *prototype) : _prototype(prototype) {} closeable_metric::closeable_metric(const metric_prototype *prototype) : metric(prototype) {} diff --git a/src/utils/metrics.h b/src/utils/metrics.h index 6d89760a7e..db948f6a63 100644 --- a/src/utils/metrics.h +++ b/src/utils/metrics.h @@ -153,41 +153,6 @@ class metric_entity : public ref_counter public: using attr_map = std::unordered_map; using metric_map = std::unordered_map; - using stale_metric_list = std::unordered_set; - - struct collected_stale_metrics_info - { - // The collected metrics that are considered stale now, or were previously considered - // stale. Both metrics will be processed by retire_stale_metrics(): if a metric is - // considered stale now, it will be either retired immediately or scheduled to be - // retired; otherwise, if it was previously considered stale which means it has become - // useful now, its retirement will be cancelled. - stale_metric_list stale_metrics; - - // The number of all metrics in this entity. - size_t num_all_metrics = 0; - - // The number of the metrics that have been scheduled to be retired for this entity. - size_t num_scheduled_metrics = 0; - - collected_stale_metrics_info() = default; - }; - - struct retired_metrics_stat - { - // The number of retired metrics for this entity. - size_t num_retired_metrics = 0; - - // The number of the metrics that were just considered useless and scheduled to - // be retired in this round for this entity. - size_t num_recently_scheduled_metrics = 0; - - // The number of the metrics that had previously been scheduled to be retired and - // were recently reemployed for this entity. - size_t num_reemployed_metrics = 0; - - retired_metrics_stat() = default; - }; const metric_entity_prototype *prototype() const { return _prototype; } @@ -233,24 +198,18 @@ class metric_entity : public ref_counter void encode_id(metric_json_writer &writer) const; - // Similar with what has been done by metric::is_stale(), this function is used to decide - // if an entity is stale. Like the definition for a stale metric, an entity becomes stale - // if it does not has any metric, and is no longer used by any other class. + // Decide if an entity is stale. An entity becomes stale if it is no longer used by any other + // object. // - // For example, once a replica is removed, its metrics will be retired after a configurable - // retention interval. After that, the entity for this replica will not has any metric itself, - // and it is no longer used by any other class. Then it will be retired. + // An entity could be bound to one or multiple objects. Once all of these objects are + // destroyed, this entity will become stale, which means all of the metrics held by this + // entity are also stale. // - // Unlike metric, once an entity is considered stale, it will be retired immediately for the - // reason that each metric in it has been delayed for a long enough time before retirement. + // For example, once a replica is removed, the replica entity (and all metrics it holds) will + // become stale; then, this entity is scheduled to be retired after a configurable retention + // interval; finally, this entity will be removed from the registry with all metrics it holds. bool is_stale() const; - // The whole retirement process is divided two phases "collect" and "retire", please see - // metric_registry::collect_stale_metrics() and metric_registry::retire_stale_metrics() - // for details. - collected_stale_metrics_info collect_stale_metrics() const; - retired_metrics_stat retire_stale_metrics(const stale_metric_list &stale_metrics); - const metric_entity_prototype *const _prototype; const std::string _id; @@ -258,6 +217,12 @@ class metric_entity : public ref_counter attr_map _attrs; metric_map _metrics; + // The timestamp when this entity should be retired: + // * default value is 0, which means this entity has not been scheduled to be retired; + // * otherwise, non-zero value means this entity has been scheduled to be retired, and will + // be retired at any time once current time has reached or exceeded this timestamp. + uint64_t _retire_time_ms; + DISALLOW_COPY_AND_ASSIGN(metric_entity); }; @@ -462,42 +427,39 @@ class metric_registry : public utils::singleton { public: using entity_map = std::unordered_map; - using stale_entity_map = std::unordered_map; + using stale_entity_list = std::unordered_set; struct collected_stale_entities_info { - // Like collected_stale_metrics_info::stale_metrics, stale_entities collect the metrics - // that are considered stale now, or were previously considered stale. In addition to - // these, stale_entities also collect the entities that are ready to be retired. - stale_entity_map stale_entities; + // The stale entities are collected to be processed by retire_stale_entities(). Following + // kinds of stale entities will be collected: + // * entities that should be retired immediately. The entities that are still within + // the retention interval will not be collected. + // * entities that were previously considered stale however have already been reemployed, + // which means its retirement should be cancelled by retire_stale_entities(). + stale_entity_list stale_entities; // The number of all entities in the registry. size_t num_all_entities = 0; - // The number of all metrics in the registry. - size_t num_all_metrics = 0; - - // The number of the metrics that have been scheduled to be retired for the registry. - size_t num_scheduled_metrics = 0; + // The number of the entities that have been scheduled to be retired. + size_t num_scheduled_entities = 0; collected_stale_entities_info() = default; }; struct retired_entities_stat { - // The number of retired entities for the registry. + // The number of retired entities. size_t num_retired_entities = 0; - // The number of retired metrics for the registry. - size_t num_retired_metrics = 0; - - // The number of the metrics that were just considered useless and scheduled to - // be retired in this round for the registry. - size_t num_recently_scheduled_metrics = 0; + // The number of entities that were recently considered stale and scheduled to be + // retired. + size_t num_recently_scheduled_entities = 0; - // The number of the metrics that had previously been scheduled to be retired and - // were recently reemployed for the registry. - size_t num_reemployed_metrics = 0; + // The number of the entities that had previously been scheduled to be retired and + // were recently reemployed. + size_t num_reemployed_entities = 0; retired_entities_stat() = default; }; @@ -526,23 +488,28 @@ class metric_registry : public utils::singleton const std::string &id, const metric_entity::attr_map &attrs); - // These functions are used to retire stale metrics and entities. + // These functions are used to retire stale entities. // - // A metric is retired if it is removed from its entity. An entity is retired if it is - // removed from the registry. + // Since retirement is infrequent, there tend to be no entity that should be retired. + // Therefore, the whole retirement process is divided into two phases: "collect" and + // "retire". // - // Since retirements do not happen frequently, while we traverse over the registry there - // tend to be no metric and entity that should be retired. Therefore, we divide the whole - // retirement process into two phases: "collect" and "retire". + // At the first phase "collect", we just check if there are entities that: + // * has become stale, but has not been scheduled to be retired, or + // * should be retired immediately, or + // * previously were scheduled to be retired, now has been reemployed. // - // In the first phase "collect", we just read and check which metrics and entitie should - // be retired, thus it just needs read lock that is lightweight. + // The "check" operation just needs read lock which is lightweight. If some entities were + // found following above conditions, albeit infrequenly, they would be collected to be + // processed at the next phase. // - // Once some metrics and entities are collected to be retired, in the second phase "retire", - // they will be removed according to the specific rules. - collected_stale_entities_info collect_stale_metrics() const; - retired_entities_stat retire_stale_metrics(const stale_entity_map &stale_entities); - void process_stale_metrics(); + // Collected entities, if any, will be processed at the second phase "retire": + // * stale entities will be schedule to be retired; + // * the expired entities will be retired; + // * reset the retirement timestamp to 0 for reemployed entities. + collected_stale_entities_info collect_stale_entities() const; + retired_entities_stat retire_stale_entities(const stale_entity_list &stale_entities); + void process_stale_entities(); mutable utils::rw_lock_nr _lock; entity_map _entities; @@ -765,21 +732,6 @@ class metric : public ref_counter private: friend class metric_entity; - // This function is used to decide if a metric is stale. A metric becomes stale if it is no - // longer used by any other class. Typically a metric is referenced by a class. For example, - // a metric is created to measure latency for reading or writing a replica. - // - // However, once the replica is removed, this metric will no longer be used; then the framework - // of metrics will find this metric and mark it stale. After a configurable retention interval, - // this metric will finally be removed from the repository of metrics. - bool is_stale() const; - - // The timestamp when this metric should be retired. It is 0 if this metric has not been - // scheduled to be retired; otherwise, it is non-zero if this metric is scheduled to be - // retired. Once `_retire_time_ms` is non-zero, this metric will be retired at any time if - // current time has reached or exceeded `_retire_time_ms`. - uint64_t _retire_time_ms; - DISALLOW_COPY_AND_ASSIGN(metric); }; From 390ec9c678d2f7ab579f44cf4d920bf0fd0e704e Mon Sep 17 00:00:00 2001 From: Dan Wang Date: Thu, 2 Feb 2023 01:36:23 +0800 Subject: [PATCH 15/17] feat(new_metrics): retire stale metric entities that are not used by any other object --- src/utils/test/metrics_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/test/metrics_test.cpp b/src/utils/test/metrics_test.cpp index 963b115d18..f55a728ec1 100644 --- a/src/utils/test/metrics_test.cpp +++ b/src/utils/test/metrics_test.cpp @@ -2905,7 +2905,6 @@ class scoped_entity // Extend the lifetime of the metric since it's marked as "surviving". m = temp_m; - _expected_surviving_metrics.emplace(&prototype, m.get()); } surviving_metric_map get_actual_surviving_metrics(const metric_entity_ptr &my_entity) const; @@ -2949,6 +2948,7 @@ scoped_entity::scoped_entity(const std::string &entity_id, if (is_entity_surviving) { // Extend the lifetime of the entity since it's marked as "surviving". _my_entity = my_entity; + _expected_surviving_metrics = _expected_all_metrics; } test_survival_immediately_after_initialization(); From e042cb771f0aff687ce35f4c85eeec25fd2e6bd5 Mon Sep 17 00:00:00 2001 From: Dan Wang Date: Thu, 2 Feb 2023 14:12:13 +0800 Subject: [PATCH 16/17] feat(new_metrics): retire stale metric entities that are not used by any other object --- src/utils/metrics.cpp | 63 +++++++++++---------------------- src/utils/metrics.h | 12 +++---- src/utils/test/metrics_test.cpp | 29 +++++++-------- 3 files changed, 41 insertions(+), 63 deletions(-) diff --git a/src/utils/metrics.cpp b/src/utils/metrics.cpp index 500e8ad0da..6906c31c84 100644 --- a/src/utils/metrics.cpp +++ b/src/utils/metrics.cpp @@ -28,10 +28,9 @@ namespace dsn { DSN_DEFINE_uint64(metrics, - metrics_retirement_delay_ms, + entity_retirement_delay_ms, 10 * 60 * 1000, - "The minimum number of milliseconds a metric will be kept for after it is " - "no longer active."); + "The retention internal (milliseconds) for an entity after it becomes stale."); metric_entity::metric_entity(const metric_entity_prototype *prototype, const std::string &id, @@ -49,43 +48,20 @@ metric_entity::~metric_entity() close(close_option::kWait); } -namespace { - -// Before destructing a metric, close it first. For example, stop the timer of percentile. -// Generally close operations in this function are asynchronous, wait_metric() can be used -// to wait until all close operations are finished. -void close_metric(const dsn::metric_ptr &m) -{ - if (m->prototype()->type() == dsn::metric_type::kPercentile) { - auto p = dsn::down_cast(m.get()); - p->close(); - } -} - -// Wait until all operations in close_metric() are finished. -void wait_metric(const dsn::metric_ptr &m) -{ - if (m->prototype()->type() == dsn::metric_type::kPercentile) { - auto p = dsn::down_cast(m.get()); - p->wait(); - } -} - -} // anonymous namespace - void metric_entity::close(close_option option) const { utils::auto_write_lock l(_lock); - // The reason why each metric is closed in the entity rather than in the destructor of each - // metric is that close() for the metric will return immediately without waiting for any close - // operation to be finished. - // - // Thus, to close all metrics owned by an entity, it's more efficient to firstly issue a close - // request for all metrics; then, just wait for all of the close operations to be finished. - // It's inefficient to wait for each metric to be closed one by one. + // To close all metrics owned by an entity, it's more efficient to firstly issue an asynchronous + // close request to each metric; then, just wait for all of the close operations to be finished. + // It's inefficient to wait for each metric to be closed one by one. Therefore, the metric is + // not + // closed in its destructor. for (auto &m : _metrics) { - close_metric(m.second); + if (m.second->prototype()->type() == metric_type::kPercentile) { + auto p = down_cast(m.second.get()); + p->close(); + } } if (option == close_option::kNoWait) { @@ -94,7 +70,10 @@ void metric_entity::close(close_option option) const // Wait for all of the close operations to be finished. for (auto &m : _metrics) { - wait_metric(m.second); + if (m.second->prototype()->type() == metric_type::kPercentile) { + auto p = down_cast(m.second.get()); + p->wait(); + } } } @@ -387,11 +366,11 @@ void metric_registry::start_timer() return; } - // Once a metric is considered useless, it will retire formally after the retention - // interval, namely FLAGS_metrics_retirement_delay_ms milliseconds. Therefore, if - // the interval of the timer is also set to FLAGS_metrics_retirement_delay_ms, in - // the next round, it's just about time to retire this metric. - _timer.reset(new metric_timer(FLAGS_metrics_retirement_delay_ms, + // Once an entity is considered stale, it will be retired after the retention interval, + // namely FLAGS_entity_retirement_delay_ms milliseconds. Therefore, if the interval of + // the timer is also set to FLAGS_entity_retirement_delay_ms, in the next round, it's + // just about time to retire this entity. + _timer.reset(new metric_timer(FLAGS_entity_retirement_delay_ms, std::bind(&metric_registry::process_stale_entities, this), std::bind(&metric_registry::on_close, this))); } @@ -530,7 +509,7 @@ metric_registry::retire_stale_entities(const stale_entity_list &stale_entities) if (iter->second->_retire_time_ms == 0) { // The entity should be marked with a scheduled time for retirement, since it has // already been considered stale. - iter->second->_retire_time_ms = now + FLAGS_metrics_retirement_delay_ms; + iter->second->_retire_time_ms = now + FLAGS_entity_retirement_delay_ms; ++retired_stat.num_recently_scheduled_entities; continue; } diff --git a/src/utils/metrics.h b/src/utils/metrics.h index db948f6a63..910d14b8d0 100644 --- a/src/utils/metrics.h +++ b/src/utils/metrics.h @@ -373,14 +373,12 @@ class metrics_http_service : public http_server_base DISALLOW_COPY_AND_ASSIGN(metrics_http_service); }; -// `metric_timer` is a timer class that encapsulates the details how each percentile is -// computed periodically. +// `metric_timer` is a timer class that runs metric-related computations periodically, such as +// calculating percentile, checking if there are stale entities. It accepts `on_exec` and +// `on_close` as the callbacks for execution and close. // -// To be instantiated, it requires `interval_ms` at which a percentile is computed and `exec` -// which is used to compute percentile. -// -// In case that all percentiles are computed at the same time and lead to very high load, -// first computation for percentile will be delayed at a random interval. +// In case that all metrics (such as percentiles) are computed at the same time and lead to very +// high load, first calculation will be delayed at a random interval. class metric_timer { public: diff --git a/src/utils/test/metrics_test.cpp b/src/utils/test/metrics_test.cpp index f55a728ec1..20414a2743 100644 --- a/src/utils/test/metrics_test.cpp +++ b/src/utils/test/metrics_test.cpp @@ -32,7 +32,7 @@ namespace dsn { -DSN_DECLARE_uint64(metrics_retirement_delay_ms); +DSN_DECLARE_uint64(entity_retirement_delay_ms); class my_gauge : public metric { @@ -2840,35 +2840,35 @@ class MetricsRetirementTest : public testing::TestWithParam Date: Tue, 7 Feb 2023 11:58:38 +0800 Subject: [PATCH 17/17] feat(new_metrics): retire stale metric entities that are not used by any other object --- src/utils/metrics.cpp | 25 ++++++++++++------------- src/utils/metrics.h | 24 ++++++++++++------------ 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/src/utils/metrics.cpp b/src/utils/metrics.cpp index 6906c31c84..007fb2198b 100644 --- a/src/utils/metrics.cpp +++ b/src/utils/metrics.cpp @@ -48,15 +48,14 @@ metric_entity::~metric_entity() close(close_option::kWait); } -void metric_entity::close(close_option option) const +void metric_entity::close(close_option option) { utils::auto_write_lock l(_lock); // To close all metrics owned by an entity, it's more efficient to firstly issue an asynchronous // close request to each metric; then, just wait for all of the close operations to be finished. // It's inefficient to wait for each metric to be closed one by one. Therefore, the metric is - // not - // closed in its destructor. + // not closed in its destructor. for (auto &m : _metrics) { if (m.second->prototype()->type() == metric_type::kPercentile) { auto p = down_cast(m.second.get()); @@ -434,9 +433,9 @@ metric_entity_ptr metric_registry::find_or_create_entity(const metric_entity_pro return entity; } -metric_registry::collected_stale_entities_info metric_registry::collect_stale_entities() const +metric_registry::collected_entities_info metric_registry::collect_stale_entities() const { - collected_stale_entities_info collected_info; + collected_entities_info collected_info; auto now = dsn_now_ms(); @@ -448,7 +447,7 @@ metric_registry::collected_stale_entities_info metric_registry::collect_stale_en // This entity had been scheduled to be retired. However, it was reemployed // after that. It has been in use since then, therefore its scheduled time // for retirement should be reset to 0. - collected_info.stale_entities.insert(entity.first); + collected_info.collected_entities.insert(entity.first); } continue; } @@ -460,7 +459,7 @@ metric_registry::collected_stale_entities_info metric_registry::collect_stale_en continue; } - collected_info.stale_entities.insert(entity.first); + collected_info.collected_entities.insert(entity.first); } collected_info.num_all_entities = _entities.size(); @@ -468,9 +467,9 @@ metric_registry::collected_stale_entities_info metric_registry::collect_stale_en } metric_registry::retired_entities_stat -metric_registry::retire_stale_entities(const stale_entity_list &stale_entities) +metric_registry::retire_stale_entities(const collected_entity_list &collected_entities) { - if (stale_entities.empty()) { + if (collected_entities.empty()) { // Do not lock for empty list. return retired_entities_stat(); } @@ -481,8 +480,8 @@ metric_registry::retire_stale_entities(const stale_entity_list &stale_entities) utils::auto_write_lock l(_lock); - for (const auto &stale_entity : stale_entities) { - auto iter = _entities.find(stale_entity); + for (const auto &collected_entity : collected_entities) { + auto iter = _entities.find(collected_entity); if (dsn_unlikely(iter == _entities.end())) { // The entity has been removed from the registry for some unusual reason. continue; @@ -527,12 +526,12 @@ void metric_registry::process_stale_entities() LOG_INFO("begin to process stale metric entities"); const auto &collected_info = collect_stale_entities(); - const auto &retired_stat = retire_stale_entities(collected_info.stale_entities); + const auto &retired_stat = retire_stale_entities(collected_info.collected_entities); LOG_INFO("stat for metric entities: total={}, collected={}, retired={}, scheduled={}, " "recently_scheduled={}, reemployed={}", collected_info.num_all_entities, - collected_info.stale_entities.size(), + collected_info.collected_entities.size(), retired_stat.num_retired_entities, collected_info.num_scheduled_entities, retired_stat.num_recently_scheduled_entities, diff --git a/src/utils/metrics.h b/src/utils/metrics.h index 910d14b8d0..340d2fd9d5 100644 --- a/src/utils/metrics.h +++ b/src/utils/metrics.h @@ -190,7 +190,7 @@ class metric_entity : public ref_counter kWait, kNoWait, }; - void close(close_option option) const; + void close(close_option option); void set_attributes(const attr_map &attrs); @@ -425,17 +425,17 @@ class metric_registry : public utils::singleton { public: using entity_map = std::unordered_map; - using stale_entity_list = std::unordered_set; + using collected_entity_list = std::unordered_set; - struct collected_stale_entities_info + struct collected_entities_info { - // The stale entities are collected to be processed by retire_stale_entities(). Following - // kinds of stale entities will be collected: + // The collected entities that will be processed by retire_stale_entities(). Following + // kinds of entities will be collected: // * entities that should be retired immediately. The entities that are still within // the retention interval will not be collected. // * entities that were previously considered stale however have already been reemployed, // which means its retirement should be cancelled by retire_stale_entities(). - stale_entity_list stale_entities; + collected_entity_list collected_entities; // The number of all entities in the registry. size_t num_all_entities = 0; @@ -443,7 +443,7 @@ class metric_registry : public utils::singleton // The number of the entities that have been scheduled to be retired. size_t num_scheduled_entities = 0; - collected_stale_entities_info() = default; + collected_entities_info() = default; }; struct retired_entities_stat @@ -497,16 +497,16 @@ class metric_registry : public utils::singleton // * should be retired immediately, or // * previously were scheduled to be retired, now has been reemployed. // - // The "check" operation just needs read lock which is lightweight. If some entities were - // found following above conditions, albeit infrequenly, they would be collected to be - // processed at the next phase. + // All operations in the first phase are read-only, needing just read lock which is more + // lightweight. If some entities were found following above conditions, albeit infrequenly, + // they would be collected to be processed at the next phase. // // Collected entities, if any, will be processed at the second phase "retire": // * stale entities will be schedule to be retired; // * the expired entities will be retired; // * reset the retirement timestamp to 0 for reemployed entities. - collected_stale_entities_info collect_stale_entities() const; - retired_entities_stat retire_stale_entities(const stale_entity_list &stale_entities); + collected_entities_info collect_stale_entities() const; + retired_entities_stat retire_stale_entities(const collected_entity_list &collected_entities); void process_stale_entities(); mutable utils::rw_lock_nr _lock;